diff --git a/include/tvm/relay/attrs/annotation.h b/include/tvm/relay/attrs/annotation.h
index 4a2eb63c7e6af..1c8859e07cc13 100644
--- a/include/tvm/relay/attrs/annotation.h
+++ b/include/tvm/relay/attrs/annotation.h
@@ -67,6 +67,18 @@ struct CompilerAttrs : public tvm::AttrsNode<CompilerAttrs> {
   }
 };
 
+/*!
+ * \brief Options for the operators used to annotate a compiler.
+ */
+struct TIRCallAttrs : public tvm::AttrsNode<TIRCallAttrs> {
+  /*! \brief A 3rd party compiler for code generation. */
+  Map<String, ObjectRef> metadata;
+
+  TVM_DECLARE_ATTRS(TIRCallAttrs, "relay.attrs.TIRCallAttrs") {
+    TVM_ATTR_FIELD(metadata).describe("Metadata attached to the TIR function call.");
+  }
+};
+
 }  // namespace relay
 }  // namespace tvm
 #endif  // TVM_RELAY_ATTRS_ANNOTATION_H_
diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py
index 0d18bc08e5ed9..7d6d746fb16ca 100644
--- a/python/tvm/auto_scheduler/relay_integration.py
+++ b/python/tvm/auto_scheduler/relay_integration.py
@@ -232,6 +232,7 @@ def add_workload_input_names(self, workload_key, input_names):
 @tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite")
 def enter_layout_rewrite():
     """Enter layout rewrite tracing environment"""
+    # import pdb; pdb.set_trace()
     env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE)
     env.__enter__()
 
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index 7062b20e0d54b..c934440322b7a 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -150,6 +150,7 @@ def _build_function_memory_map(function_metadata):
         2.) A global memory requirement if all functions are executed sequentially
     """
     device_max_workspace = dict()
+    print("TOTAL FUNCTION METADATA: ", function_metadata)
     main_func_metadata = function_metadata[MAIN_FUNC_NAME_STR]
     num_targets = len(main_func_metadata.workspace_sizes.items())
     func_entries = []
diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
index 2db8c5a669f08..e9129db7b200d 100644
--- a/python/tvm/relay/backend/compile_engine.py
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -429,7 +429,7 @@ def dump(self):
             res += "------------------------------------\n"
             res += "target={}\n".format(k.target)
             res += "use_count={}\n".format(v.use_count)
-            res += "func_name={}\n".format(v.cached_func.func_name)
+            res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint)
             res += "----relay function----\n"
             res += k.source_func.astext() + "\n"
             res += "----tir function----- \n"
@@ -444,7 +444,7 @@ def dump(self):
             res += "------------------------------------\n"
             res += "target={}\n".format(k.target)
             res += "use_count={}\n".format(v.use_count)
-            res += "func_name={}\n".format(v.cached_func.func_name)
+            res += "func_name={}\n".format(v.cached_func.prim_fn_var.name_hint)
             res += "----relay function----\n"
             res += k.source_func.astext() + "\n"
             res += "----tir function----- \n"
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 8d73a090ed6f8..cdfac53430cf3 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -23,7 +23,7 @@
 import tvm._ffi
 from tvm._ffi import base as _base
 from tvm.runtime import NDArray, ndarray as _nd
-from tvm.ir import RelayExpr, GlobalVar
+from tvm.ir import RelayExpr, GlobalVar, Node
 
 from .base import RelayNode
 from . import _ffi_api
@@ -538,3 +538,18 @@ def bind(expr, binds):
         The expression or function after binding.
     """
     return _ffi_api.Bind(expr, binds)
+
+
+@tvm._ffi.register_object("relay.StorageInfo")
+class StorageInfo(Node):
+    @property
+    def storage_ids(self):
+        return _ffi_api.StorageInfoStorageIds(self)
+
+    @property
+    def device_types(self):
+        return _ffi_api.StorageInfoDeviceTypes(self)
+
+    @property
+    def storage_sizes(self):
+        return _ffi_api.StorageInfoStorageSizes(self)
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
index cd8173717d5f2..50f00140df9b5 100644
--- a/src/driver/driver_api.cc
+++ b/src/driver/driver_api.cc
@@ -437,14 +437,18 @@ std::pair<IRModule, IRModule> SplitDevHostFuncs(IRModule mod_mixed, const Target
   }
 
   if (target->kind->device_type == kDLCPU && target_host == target) {
-    ICHECK(mdevice->functions.empty()) << "No device code should be generated when target "
-                                       << "and host_target are both llvm target."
-                                       << "\n";
+    // TODO(@jroesch): This check is no longer true we need to figure out if we care about this.
+    // We need to relax this check for just TIR functions.
+    // ICHECK(mdevice->functions.empty()) << "No device code should be generated when target "
+    //                                   << "and host_target are both llvm target."
+    //                                   << "\n";
   }
 
   return {mhost, mdevice};
 }
 
+// Can we make this take one annotated IRModule?
+//
 // Build for heterogeneous execution.
 runtime::Module build(const Map<Target, IRModule>& inputs_arg, const Target& target_host_arg) {
   auto pass_ctx = transform::PassContext::Current();
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index 93935af70fca9..9ec8baa1add71 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -42,6 +42,8 @@
 
 namespace tvm {
 namespace relay {
+// TODO(@jroesch, @csullivan): declare directly elsewhere
+backend::StaticMemoryPlan GraphPlanMemory(const Function& func);
 namespace backend {
 
 using IntegerArray = Array<Integer>;
@@ -49,19 +51,19 @@ using TargetsMap = std::unordered_map<int, Target>;
 
 class AotReturnSidVisitor : public ExprVisitor {
  public:
-  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> storage_device_map)
-      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+  explicit AotReturnSidVisitor(Map<Expr, StorageInfo> storage_info_map)
+      : storage_info_map_{storage_info_map}, return_sid_{-1} {}
 
-  IntegerArray FindReturnSid(Function func) {
+  std::vector<int64_t> FindReturnSid(Function func) {
     VisitExpr(func->body);
     return return_sid_;
   }
 
  protected:
   void AssignReturnSid(Expr e) {
-    auto iter = storage_device_map_.find(e);
-    if (iter != storage_device_map_.end()) {
-      return_sid_ = (*iter).second[0];
+    auto iter = storage_info_map_.find(e);
+    if (iter != storage_info_map_.end()) {
+      return_sid_ = (*iter).second->storage_ids;
     }
   }
 
@@ -88,8 +90,8 @@ class AotReturnSidVisitor : public ExprVisitor {
   }
 
  private:
-  Map<Expr, Array<IntegerArray>> storage_device_map_;
-  IntegerArray return_sid_;
+  Map<Expr, StorageInfo> storage_info_map_;
+  std::vector<int64_t> return_sid_;
 };
 
 /*! \brief Code generator for AOT executor */
@@ -120,15 +122,16 @@ class AOTExecutorCodegen : public ExprVisitor {
    * \brief Return a vector of variables that represents the sids for the given Relay Expr
    */
   std::vector<tir::Var> PackSid(Expr expr) {
-    Array<IntegerArray> sids = storage_device_map_[expr];
+    Map<Expr, StorageInfo> storage_info_map = memory_plan_->expr_to_storage_info;
+    StorageInfo storage_info = storage_info_map[expr];
+    // std::vector<int64_t> sids = storage_device_map_[expr];
     std::vector<tir::Var> sid_vars;
 
     // Note that an expression can have multiple sids associated with it
     // e.g., returning multiple values from a function
-    for (const auto& sid : sids[0]) {
+    for (int64_t sid : storage_info->storage_ids) {
       // Determine if an sid is an output buffer
-      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
-      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int);
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid);
       if (output_iter != return_sid_.end()) {
         int output_index = std::distance(return_sid_.begin(), output_iter);
         sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
@@ -346,6 +349,7 @@ class AOTExecutorCodegen : public ExprVisitor {
       fi_node->tir_primfuncs.Set(primfunc_target, primfunc);
       fi_node->relay_primfuncs.Set(primfunc_target, relay_func);
     }
+    std::cout << "Update function metadata called" << std::endl;
     function_metadata_.Set(cfunc->func_name, FunctionInfo(fi_node));
   }
 
@@ -385,13 +389,14 @@ class AOTExecutorCodegen : public ExprVisitor {
       UpdateConstants(func, &params_);
 
       // Generate the TIR function call
-      CreateFuncCall(GetRef<Call>(op), ext_func->func_name);
+      CreateFuncCall(GetRef<Call>(op), ext_func->prim_fn_var->name_hint);
       return;
     }
 
-    ICHECK_GE(storage_device_map_.count(expr), 0);
-    auto& device_type = storage_device_map_[expr][1];
-    auto call_dev_type = device_type[0]->value;
+    Map<Expr, StorageInfo> storage_info_map = memory_plan_->expr_to_storage_info;
+    ICHECK_GE(storage_info_map.count(expr), 0);
+    auto& device_type = storage_info_map[expr]->device_types;
+    auto call_dev_type = device_type[0];  // TODO(@electriclilies): what is happening here
     // Normal Relay Function
     if (targets_.size() == 1) {
       // homogeneous execution.
@@ -420,7 +425,7 @@ class AOTExecutorCodegen : public ExprVisitor {
     UpdateFunctionMetadata(lowered_func, func, target);
 
     // Generate the TIR function call
-    CreateFuncCall(GetRef<Call>(op), lowered_func->func_name);
+    CreateFuncCall(GetRef<Call>(op), lowered_func->prim_fn_var->name_hint);
   }
 
   void VisitExpr_(const VarNode* op) override {
@@ -428,14 +433,17 @@ class AOTExecutorCodegen : public ExprVisitor {
 
     // If the Var node is an output node we need to copy the content of the variable to the output
     // It's safe to check the SID here because Var StorageToken are never reallocated
-    Array<IntegerArray> sids = storage_device_map_[expr];
+    Map<Expr, StorageInfo> storage_info_map = memory_plan_->expr_to_storage_info;
+    std::vector<int64_t> sids = storage_info_map[expr]->storage_ids;
+    std::vector<DLDeviceType> device_types = storage_info_map[expr]->device_types;
 
-    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
-                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    // TODO(@electriclilies): Not sure if this is right
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0]);
     if (output_iter != return_sid_.end()) {
       int output_index = std::distance(return_sid_.begin(), output_iter);
       auto var_expr = FindExpr(expr);
-      CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]);
+      CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0],
+                   device_types[0]);
     }
   }
 
@@ -443,19 +451,23 @@ class AOTExecutorCodegen : public ExprVisitor {
     Expr expr = GetRef<Expr>(op);
     size_t index = params_.size();
     std::string name = "p" + std::to_string(index);
+    Map<Expr, StorageInfo> storage_info_map = memory_plan_->expr_to_storage_info;
 
-    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    param_storage_ids_[name] = storage_info_map[expr]->storage_ids[0];
     params_[name] = op->data;
     params_by_expr_.Set(expr, name);
 
     // If the Constant node is an output node we need to copy the content of the parameter to the
     // output A Var node can only produce a single output
-    Array<IntegerArray> sids = storage_device_map_[expr];
-    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
-                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    std::vector<int64_t> sids = storage_info_map[expr]->storage_ids;
+    // Array<IntegerArray> sids = storage_device_map_[expr];
+    // TODO(@electriclilies): this might be wrong, hacked in change for now
+    std::vector<int64_t> storage_sizes = storage_info_map[expr]->storage_sizes_in_bytes;
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0]);
     if (output_iter != return_sid_.end()) {
       int output_index = std::distance(return_sid_.begin(), output_iter);
-      CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]);
+      CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr),
+                   storage_sizes[0]);
     }
   }
 
@@ -502,8 +514,17 @@ class AOTExecutorCodegen : public ExprVisitor {
     // Allocate the sids
     std::unordered_map<int, bool> allocated;
 
-    for (auto kv : storage_device_map_) {
+    for (auto kv : memory_plan_->expr_to_storage_info) {
       // Only allocate sids that are needed
+      auto expr = kv.first;
+      auto storage_info = kv.second;
+      auto sids = storage_info->storage_ids;
+      auto device_types = storage_info->device_types;
+      auto storage_sizes_in_bytes = storage_info->storage_sizes_in_bytes;
+      // sids = kv.second[0]
+      // devices = kv.second[1]
+      //
+
       const bool is_input =
           (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end());
       const bool is_param = (params_by_expr_.find(kv.first) != params_by_expr_.end());
@@ -511,10 +532,13 @@ class AOTExecutorCodegen : public ExprVisitor {
         continue;
       }
 
-      for (unsigned int i = 0; i < kv.second[0].size(); i++) {
-        int size = kv.second[2][i];
-        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
+      CHECK_EQ(sids.size(), storage_sizes_in_bytes.size())
+          << "the mapping between storage ids and sizes is incorrect"
+          << "found " << sids.size() << " ids and " << storage_sizes_in_bytes.size() << "sizes";
 
+      for (unsigned int i = 0; i < sids.size(); i++) {
+        int size = storage_sizes_in_bytes.at(i);
+        int sid = sids.at(i);
         if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) {
           continue;
         }
@@ -578,7 +602,8 @@ class AOTExecutorCodegen : public ExprVisitor {
   std::unordered_map<std::string, int64_t> param_storage_ids_;
 
   /*! \brief plan memory of device result */
-  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  StaticMemoryPlan memory_plan_;
+  // Map<Expr, Array<IntegerArray>> storage_device_map_;
   std::unordered_map<int, te::Var> sids_table_;
   /*! \brief lowered funcs */
   std::unordered_map<std::string, IRModule> lowered_funcs_;
@@ -589,7 +614,7 @@ class AOTExecutorCodegen : public ExprVisitor {
   /*! \brief the set of statements that make the program */
   std::vector<tir::Stmt> stmts_;
   /*! \brief the list of return sids (note that the function might return more then one output */
-  IntegerArray return_sid_;
+  std::vector<int64_t> return_sid_;
   /*! \brief the module name we use to mangle the function names */
   String mod_name_;
 
@@ -603,8 +628,7 @@ class AOTExecutorCodegen : public ExprVisitor {
 
   LoweredOutput Codegen(relay::Function func, String mod_name) {
     // Get the module, storage map and token sizes
-    auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
-    storage_device_map_ = (*pf)(func);
+    memory_plan_ = GraphPlanMemory(func);
     mod_name_ = mod_name;
 
     for (auto input : func->params) {
@@ -613,15 +637,17 @@ class AOTExecutorCodegen : public ExprVisitor {
     }
 
     // Define the storage allocator ids
-    for (auto kv : storage_device_map_) {
-      for (const auto& sid : kv.second[0]) {
+    for (auto kv : memory_plan_->expr_to_storage_info) {
+      auto storage_info = kv.second;
+      auto sids = storage_info->storage_ids;
+      for (const auto& sid : sids) {
         te::Var sid_var(MakeString("sid_", sid), PointerType(PrimType(DataType::Int(8))));
         sids_table_[sid] = sid_var;
       }
     }
 
     // Find the return sid
-    return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func);
+    return_sid_ = AotReturnSidVisitor(memory_plan_->expr_to_storage_info).FindReturnSid(func);
     for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) {
       main_signature_.push_back(tir::Var("output", DataType::Handle()));
     }
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index f0b43b14c6500..3ac2c42f81942 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -46,569 +46,14 @@
 
 #include "../../runtime/meta_data.h"
 #include "../transforms/pass_utils.h"
+#include "te_compiler_cache.h"
 #include "utils.h"
 
 namespace tvm {
 namespace relay {
 
-TVM_REGISTER_NODE_TYPE(LoweredOutputNode);
-TVM_REGISTER_NODE_TYPE(CachedFuncNode);
-TVM_REGISTER_NODE_TYPE(CCacheKeyNode);
-TVM_REGISTER_NODE_TYPE(CCacheValueNode);
 TVM_REGISTER_OBJECT_TYPE(CompileEngineNode);
 
-LoweredOutput::LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl) {
-  auto n = make_object<LoweredOutputNode>();
-  n->outputs = std::move(outputs);
-  n->implementation = std::move(impl);
-  data_ = std::move(n);
-}
-
-CCacheKey::CCacheKey(Function source_func, Target target) {
-  auto n = make_object<CCacheKeyNode>();
-  n->source_func = std::move(source_func);
-  n->target = std::move(target);
-  data_ = std::move(n);
-}
-
-Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
-  // for now, we always use int32 shape when possible
-  // even if the result of shape inference becomes int64.
-  Array<IndexExpr> res;
-  for (IndexExpr val : shape) {
-    const int64_t* pval = tir::as_const_int(val);
-    if (pval != nullptr) {
-#ifndef TVM_INDEX_DEFAULT_I64
-      ICHECK_LE(pval[0], std::numeric_limits<int32_t>::max());
-      ICHECK_GE(pval[0], std::numeric_limits<int32_t>::min());
-      res.push_back(IntImm(DataType::Int(32), *pval));
-#else
-      res.push_back(val);
-#endif  // TVM_INDEX_DEFAULT_I64
-    } else if (val->IsInstance<tir::AnyNode>()) {
-      res.push_back(val.as<tir::AnyNode>()->ToVar());
-    } else {
-      res.push_back(val);
-    }
-  }
-  return res;
-}
-
-// The getter to get schedule from compile engine.
-// Get schedule from functor.
-class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
- public:
-  explicit ScheduleGetter(Target target)
-      : target_(target), device_copy_op_(Op::Get("device_copy")) {
-    // Whether to use auto_scheduler schedule.
-    use_auto_scheduler_ = backend::IsAutoSchedulerEnabled();
-  }
-
-  CachedFunc Create(const Function& prim_func) {
-    auto cache_node = make_object<CachedFuncNode>();
-    cache_node->target = target_;
-    for (Var param : prim_func->params) {
-      Array<tvm::te::Tensor> inputs;
-      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
-        tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype);
-        cache_node->inputs.push_back(tensor);
-        inputs.push_back(tensor);
-      } else {
-        // flatten tuple of tensor type.
-        const auto* tuple_type = param->type_as<TupleTypeNode>();
-        for (Type field : tuple_type->fields) {
-          const auto* ttype = field.as<TensorTypeNode>();
-          // TODO(@icemelon): Allow recursive tuple
-          ICHECK(ttype != nullptr);
-          tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype);
-          cache_node->inputs.push_back(tensor);
-          inputs.push_back(tensor);
-        }
-      }
-      memo_[param] = inputs;
-    }
-    readable_name_stream_ << "fused";
-    cache_node->outputs = this->VisitExpr(prim_func->body);
-    auto candidate_name = readable_name_stream_.str();
-    constexpr static size_t kMaxFuncNameLength = 80;
-    if (candidate_name.size() > kMaxFuncNameLength) {
-      std::stringstream truncated_name;
-      truncated_name << candidate_name.substr(0, kMaxFuncNameLength);
-      truncated_name << "_" << std::hash<std::string>{}(candidate_name) << "_";
-      candidate_name = truncated_name.str();
-    }
-    cache_node->func_name = candidate_name;
-    ICHECK(anchor_op_.defined());
-    // Fusion over tupled results may leave identity relationships
-    // between inputs and outputs, and those should not be scheduled.
-    // Hence schedule only non PlaceholderOp outputs.
-    tvm::Array<te::Tensor> tensor_outs;
-    for (const auto& tensor : cache_node->outputs) {
-      if (!tensor->op.as<te::PlaceholderOpNode>()) {
-        tensor_outs.push_back(tensor);
-      }
-    }
-
-    te::Schedule schedule;
-    // No need to register schedule for device copy op.
-    if (anchor_attrs_.as<DeviceCopyAttrs>() == nullptr) {
-      if (use_auto_scheduler_) {
-        const auto* fauto_schedule =
-            runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute");
-        ICHECK(fauto_schedule != nullptr)
-            << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered";
-        ObjectRef obj = (*fauto_schedule)(String(cache_node->func_name), tensor_outs);
-        if (obj.defined()) {
-          schedule = Downcast<te::Schedule>(obj);
-        }
-      }
-
-      // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule.
-      if (!schedule.defined()) {
-        ICHECK(anchor_implementation_.defined());
-        schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_);
-      }
-      for (const auto& scalar : scalars_) {
-        if (schedule->Contain(scalar)) {
-          schedule[scalar].compute_inline();
-        }
-      }
-    }
-    cache_node->schedule = std::move(schedule);
-    return CachedFunc(cache_node);
-  }
-
-  Array<te::Tensor> VisitExpr_(const VarNode* op) final {
-    LOG(FATAL) << "Free variable " << op->name_hint();
-    return {};
-  }
-
-  Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
-    using tir::make_const;
-    ICHECK(op->is_scalar());
-    void* data = op->data->data;
-    DataType dtype = DataType(op->data->dtype);
-    auto value = te::compute(
-        {},
-        [&](const Array<tvm::tir::Var>&) {
-          if (dtype == DataType::Int(32)) {
-            return make_const(dtype, static_cast<const int32_t*>(data)[0]);
-          } else if (dtype == DataType::Int(64)) {
-            return make_const(dtype, static_cast<const int64_t*>(data)[0]);
-          } else if (dtype == DataType::Float(32)) {
-            return make_const(dtype, static_cast<const float*>(data)[0]);
-          } else if (dtype == DataType::Float(64)) {
-            return make_const(dtype, static_cast<const double*>(data)[0]);
-          } else if (dtype == DataType::Bool()) {
-            return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
-          } else {
-            LOG(FATAL) << "not handled";
-            return tvm::PrimExpr();
-          }
-        },
-        "compile_engine_const", topi::kBroadcast);
-    scalars_.push_back(value->op);
-    return {value};
-  }
-
-  Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
-    static auto fpattern = Op::GetAttrMap<TOpPattern>("TOpPattern");
-    static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call");
-    ICHECK(flower_call) << "relay.backend.lower_call is not registered.";
-
-    Array<te::Tensor> inputs;
-    int count_tuple = 0;
-    for (Expr arg : call_node->args) {
-      if (arg->checked_type().as<TupleTypeNode>()) {
-        ++count_tuple;
-      }
-      for (te::Tensor tensor : VisitExpr(arg)) {
-        inputs.push_back(tensor);
-      }
-    }
-    if (count_tuple) {
-      ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input";
-    }
-
-    ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
-    Op op = Downcast<Op>(call_node->op);
-
-    Array<te::Tensor> outputs;
-    OpImplementation impl;
-    // Skip fcompute for device copy operators as it is not registered.
-    if (op == device_copy_op_) {
-      const auto* copy_input = inputs[0].operator->();
-      outputs.push_back(te::Tensor(copy_input->shape, copy_input->dtype, te::Operation(), 0));
-    } else {
-      LoweredOutput lowered_out = (*flower_call)(GetRef<Call>(call_node), inputs, target_);
-      outputs = lowered_out->outputs;
-      impl = lowered_out->implementation;
-    }
-
-    int op_pattern = fpattern[op];
-    if (!use_auto_scheduler_ && op_pattern >= kCommReduce) {
-      ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce)
-          << "Cannot apply TOPI schedule to a primitive function with two complicated ops"
-          << " anchor=" << anchor_op_ << " current=" << op;
-    }
-    if (op_pattern > anchor_op_pattern_) {
-      anchor_op_ = op;
-      anchor_attrs_ = call_node->attrs;
-      anchor_op_pattern_ = op_pattern;
-      anchor_implementation_ = impl;
-    }
-    if (outputs.size() != 1) {
-      const auto* tuple_type = call_node->checked_type().as<TupleTypeNode>();
-      ICHECK(tuple_type) << "Expect output to be a tuple type";
-      ICHECK_EQ(tuple_type->fields.size(), outputs.size());
-    }
-    // Set the name to `__copy`. It will be detected in graph executor to perform
-    // data copy across devices.
-    if (op == device_copy_op_) {
-      readable_name_stream_.str(std::string());
-      readable_name_stream_ << "__copy";
-    } else {
-      readable_name_stream_ << '_' << op->name;
-    }
-    return outputs;
-  }
-
-  Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
-    LOG(FATAL) << "Do not support sub function";
-    return Array<te::Tensor>();
-  }
-
-  Array<te::Tensor> VisitExpr_(const LetNode* op) final {
-    Array<te::Tensor> val = VisitExpr(op->value);
-    ICHECK(!memo_.count(op->var));
-    memo_[op->var] = val;
-    return VisitExpr(op->body);
-  }
-
-  Array<te::Tensor> VisitExpr_(const TupleNode* op) final {
-    Array<te::Tensor> fields;
-    for (Expr field : op->fields) {
-      ICHECK(field->checked_type().as<TensorTypeNode>()) << "Only allow Tuple of Tensor";
-      Array<te::Tensor> res = VisitExpr(field);
-      ICHECK_EQ(res.size(), 1);
-      fields.push_back(res[0]);
-    }
-    return fields;
-  }
-
-  Array<te::Tensor> VisitExpr_(const TupleGetItemNode* op) final {
-    const auto* tuple_type = op->tuple->type_as<TupleTypeNode>();
-    Array<te::Tensor> tuple = VisitExpr(op->tuple);
-    ICHECK_EQ(tuple_type->fields.size(), tuple.size());
-    ICHECK_GE(op->index, 0);
-    ICHECK_LT(static_cast<size_t>(op->index), tuple.size());
-    return {tuple[op->index]};
-  }
-
- private:
-  tvm::Target target_;
-  Op anchor_op_;
-  Attrs anchor_attrs_;
-  int anchor_op_pattern_{-1};
-  OpImplementation anchor_implementation_;
-  std::ostringstream readable_name_stream_;
-  Array<te::Operation> scalars_;
-  bool use_auto_scheduler_;
-  // Cache device copy op for equivalence checking to reduce registry lookup
-  // overhead for each invocation of call node when retrieving schedules.
-  const Op& device_copy_op_;
-};
-
-/*!
- * \brief Create schedule for target.
- * \param source_func The primitive function to be lowered.
- * \param target The target we want to create schedule for.
- * \return Pair of schedule and cache.
- *  The funcs field in cache is not yet populated.
- */
-CachedFunc CreateSchedule(const Function& source_func, const Target& target) {
-  return ScheduleGetter(target).Create(source_func);
-}
-
-// Creates shape function from functor.
-class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
- public:
-  MakeShapeFunc() {}
-
-  std::pair<te::Schedule, CachedFunc> Create(const Function& prim_func) {
-    for (auto param : prim_func->params) {
-      param_states_[param] = kNoNeed;
-      Array<tvm::te::Tensor> data_inputs;
-      Array<tvm::te::Tensor> shape_inputs;
-
-      auto add_placeholder = [&data_inputs, &shape_inputs](const TensorTypeNode* ttype) {
-        // Add data placeholder
-        Shape shape = GetShape(ttype->shape);
-        tvm::te::Tensor data_tensor = tvm::te::placeholder(shape, ttype->dtype);
-        data_inputs.push_back(data_tensor);
-        // Add shape placeholder
-        int64_t ndim = shape.size();
-        Shape sshape;
-        if (ndim > 0) {
-          sshape.push_back(tvm::Integer(ndim));
-        }
-        tvm::te::Tensor shape_tensor = tvm::te::placeholder(sshape, DataType::Int(64));
-        shape_inputs.push_back(shape_tensor);
-      };
-
-      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
-        add_placeholder(ttype);
-      } else {
-        // flatten tuple of tensor type.
-        const auto* tuple_type = param->type_as<TupleTypeNode>();
-        // TODO(@icemelon): Support recursive tuple
-        ICHECK(tuple_type);
-        for (Type field : tuple_type->fields) {
-          const auto* ttype = field.as<TensorTypeNode>();
-          ICHECK(ttype);
-          add_placeholder(ttype);
-        }
-      }
-      param_data_[param] = data_inputs;
-      param_shapes_[param] = shape_inputs;
-    }
-    readable_name_stream_ << "shape_func";
-    auto cache_node = make_object<CachedFuncNode>();
-    cache_node->outputs = VisitExpr(prim_func->body);
-    auto candidate_name = readable_name_stream_.str();
-    constexpr static size_t kMaxFuncNameLength = 80;
-    if (candidate_name.size() > kMaxFuncNameLength) {
-      std::stringstream truncated_name;
-      truncated_name << candidate_name.substr(0, kMaxFuncNameLength);
-      truncated_name << "_" << std::hash<std::string>{}(candidate_name) << "_";
-      candidate_name = truncated_name.str();
-    }
-    cache_node->func_name = candidate_name;
-
-    // set inputs
-    for (auto param : prim_func->params) {
-      int state = param_states_[param];
-      cache_node->shape_func_param_states.push_back(IntImm(DataType::Int(32), state));
-      if (state & kNeedInputData) {
-        for (auto t : param_data_[param]) {
-          cache_node->inputs.push_back(t);
-        }
-      }
-      if (state & kNeedInputShape) {
-        for (auto t : param_shapes_[param]) {
-          cache_node->inputs.push_back(t);
-        }
-      }
-    }
-
-    CachedFunc cfunc(cache_node);
-    // generate schedule for shape func
-    Array<te::Operation> out_ops;
-    for (auto t : cache_node->outputs) {
-      out_ops.push_back(t->op);
-    }
-    auto schedule = te::create_schedule(out_ops);
-    tvm::te::AutoInlineInjective(schedule);
-    for (const auto& scalar : scalars_) {
-      auto scalar_op = scalar->op;
-      if (schedule->Contain(scalar_op)) {
-        schedule[scalar_op].compute_inline();
-      }
-    }
-    return std::make_pair(schedule, cfunc);
-  }
-
-  Array<te::Tensor> VisitExpr(const Expr& expr) final {
-    if (expr.as<VarNode>()) {
-      // Do not memoize vars because shape functions could use either the data
-      // or the shape of a var each time.
-      return ExprFunctor::VisitExpr(expr);
-    }
-    // For other case, do memoized visit
-    return backend::MemoizedExprTranslator<Array<te::Tensor>>::VisitExpr(expr);
-  }
-
-  Array<te::Tensor> VisitExpr_(const VarNode* var_node) final {
-    auto var = GetRef<Var>(var_node);
-    auto it = param_states_.find(var);
-    if (it == param_states_.end()) {
-      LOG(FATAL) << "Free variable " << var->name_hint();
-      return {};
-    } else {
-      ICHECK(data_dependents_per_input_.size());
-      auto data_dependent = data_dependents_per_input_.back();
-      if (data_dependent) {
-        param_states_[var] |= kNeedInputData;
-        return param_data_[var];
-      } else {
-        param_states_[var] |= kNeedInputShape;
-        return param_shapes_[var];
-      }
-    }
-  }
-
-  Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
-    using tir::make_const;
-    ICHECK(data_dependents_per_input_.size());
-    bool data_dependent = data_dependents_per_input_.back();
-    if (!op->is_scalar()) {
-      // This is a constant weight, extract the shape of the weight tensor.
-      // This can not be data dependent.
-      CHECK(!data_dependent);
-      auto ttype = op->checked_type().as<TensorTypeNode>();
-      int ndim = static_cast<int>(ttype->shape.size());
-      Array<PrimExpr> out_shape{ndim};
-      te::Tensor value = tvm::te::compute(
-          out_shape,
-          [&](const Array<tvm::tir::Var>& indices) {
-            auto idx = indices[0];
-            PrimExpr ret = make_const(DataType::Int(64), 0);
-            for (int i = 0; i < ndim; i++) {
-              ret = tvm::if_then_else(idx == i, ttype->shape[i], ret);
-            }
-            return ret;
-          },
-          "shape_const", topi::kBroadcast);
-      scalars_.push_back(value);
-      return {value};
-    }
-    if (data_dependent) {
-      void* data = op->data->data;
-      DataType dtype = DataType(op->data->dtype);
-      auto value = tvm::te::compute(
-          {},
-          [&](const Array<tvm::tir::Var>&) {
-            if (dtype == DataType::Int(32)) {
-              return make_const(dtype, static_cast<const int32_t*>(data)[0]);
-            } else if (dtype == DataType::Int(64)) {
-              return make_const(dtype, static_cast<const int64_t*>(data)[0]);
-            } else if (dtype == DataType::Float(32)) {
-              return make_const(dtype, static_cast<const float*>(data)[0]);
-            } else if (dtype == DataType::Float(64)) {
-              return make_const(dtype, static_cast<const double*>(data)[0]);
-            } else if (dtype == DataType::Bool()) {
-              return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
-            } else {
-              LOG(FATAL) << "not handled";
-              return tvm::PrimExpr();
-            }
-          },
-          "data_const", topi::kBroadcast);
-      scalars_.push_back(value);
-      return {value};
-    } else {
-      auto value = tvm::te::compute(
-          {}, [&](const Array<tvm::tir::Var>&) { return tir::make_const(DataType::Int(64), 0); },
-          "shape_const", topi::kBroadcast);
-      scalars_.push_back(value);
-      return {value};
-    }
-  }
-
-  Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
-    static auto fshape_func = Op::GetAttrMap<FShapeFunc>("FShapeFunc");
-    static auto tshape_data_dependent = Op::GetAttrMap<TShapeDataDependent>("TShapeDataDependent");
-    ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
-    Op op = Downcast<Op>(call_node->op);
-    ICHECK(data_dependents_per_input_.empty() || !data_dependents_per_input_.back())
-        << "Error in op fusion: output of the shape func is fed to a "
-        << "data-dependent shape func";
-    ICHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name;
-    ICHECK_GT(tshape_data_dependent.count(op), 0)
-        << "Internal error, cannot find TShapeDataDependent for " << op->name;
-
-    Array<Integer> dep_spec = tshape_data_dependent[op];
-    if (dep_spec.size() == 1) {
-      // This is for cases when data dependence is specified per op
-      // Replicate 0 or 1 flag to all arguments
-      for (size_t i = 1; i < call_node->args.size(); ++i) {
-        dep_spec.push_back(dep_spec[0]);
-      }
-    }
-
-    // Visit all inputs
-    Array<te::Tensor> inputs;
-    int count_tuple = 0;
-    for (size_t i = 0; i < call_node->args.size(); ++i) {
-      Expr arg = call_node->args[i];
-      if (arg->checked_type().as<TupleTypeNode>()) {
-        ++count_tuple;
-      }
-      data_dependents_per_input_.push_back(dep_spec[i]->value != 0);
-      for (te::Tensor tensor : VisitExpr(arg)) {
-        inputs.push_back(tensor);
-      }
-      data_dependents_per_input_.pop_back();
-    }
-    if (count_tuple) {
-      ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input";
-    }
-    // Get output ndims
-    auto ret_type = call_node->checked_type();
-    Array<IndexExpr> out_ndims;
-    if (const auto* ttype = ret_type.as<TensorTypeNode>()) {
-      out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
-    } else {
-      auto rtype = ret_type.as<TupleTypeNode>();
-      // TODO(@icemelon): Allow recursive tuple
-      ICHECK(rtype);
-      for (size_t i = 0; i < rtype->fields.size(); ++i) {
-        auto ttype = rtype->fields[i].as<TensorTypeNode>();
-        ICHECK(ttype);
-        out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
-      }
-    }
-    // Call shape function
-    auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims);
-    readable_name_stream_ << "_" << op->name;
-    return outputs;
-  }
-
-  Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
-    LOG(FATAL) << "Do not support sub function";
-    return Array<te::Tensor>();
-  }
-
-  Array<te::Tensor> VisitExpr_(const LetNode* op) final {
-    Array<te::Tensor> val = VisitExpr(op->value);
-    ICHECK(!memo_.count(op->var));
-    memo_[op->var] = val;
-    return VisitExpr(op->body);
-  }
-
-  Array<te::Tensor> VisitExpr_(const TupleNode* op) final {
-    Array<te::Tensor> fields;
-    for (Expr field : op->fields) {
-      ICHECK(field->checked_type().as<TensorTypeNode>()) << "Only allow Tuple of Tensor";
-      Array<te::Tensor> res = VisitExpr(field);
-      ICHECK_EQ(res.size(), 1);
-      fields.push_back(res[0]);
-    }
-    return fields;
-  }
-
-  Array<te::Tensor> VisitExpr_(const TupleGetItemNode* op) final {
-    Array<te::Tensor> input_shapes = VisitExpr(op->tuple);
-    Array<te::Tensor> out;
-    out.push_back(input_shapes[op->index]);
-    return out;
-  }
-
- private:
-  /*! \brief String stream for function name */
-  std::ostringstream readable_name_stream_;
-  /*! \brief Map from parameter to its shape function usage state */
-  std::unordered_map<Expr, int, ObjectPtrHash, ObjectPtrEqual> param_states_;
-  /*! \brief Map from parameter to list of data placeholder */
-  std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_data_;
-  /*! \brief Map from parameter to list of shape placeholder */
-  std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_shapes_;
-  /*! \brief Stack of data dependencies for shape function, specified per each op input */
-  std::vector<bool> data_dependents_per_input_;
-  /*! \brief Scalars used in the shape function */
-  Array<te::Tensor> scalars_;
-};
-
 class CompileEngineImpl : public CompileEngineNode {
  public:
   // Lower the function.
@@ -621,14 +66,8 @@ class CompileEngineImpl : public CompileEngineNode {
     auto mangle_fn = [](String name) { return name; };
     CCacheValue value = LowerInternal(key, mangle_fn);
     if (value->packed_func != nullptr) return value->packed_func;
-    // build the function.
-    tvm::runtime::Module m;
-    if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
-      m = (*f)(value->cached_func->funcs, key->target);
-    } else {
-      m = build(value->cached_func->funcs, key->target, Target(nullptr));
-    }
-    value->packed_func = m.GetFunction(value->cached_func->func_name);
+    auto m = build(value->cached_func->funcs, key->target, Target(nullptr));
+    value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint);
     return value->packed_func;
   }
 
@@ -643,6 +82,7 @@ class CompileEngineImpl : public CompileEngineNode {
     for (const auto& it : cache_) {
       auto src_func = it.first->source_func;
       ICHECK(src_func.defined());
+
       if (src_func->GetAttr<String>(attr::kCompiler).defined()) {
         auto code_gen = src_func->GetAttr<String>(attr::kCompiler);
         ICHECK(code_gen.defined()) << "No external codegen is set";
@@ -651,7 +91,9 @@ class CompileEngineImpl : public CompileEngineNode {
 
         auto symbol_name = src_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
         ICHECK(symbol_name.defined()) << "No external symbol is set for:\n"
-                                      << AsText(src_func, false);
+                                      << AsText(src_func, false) << "\n"
+                                      << "Functions with external codegen must have the "
+                                      << tvm::attr::kGlobalSymbol << " attr set.";
 
         std::string sn = symbol_name.value();
         if (!cached_symbol.count(sn)) {
@@ -669,7 +111,12 @@ class CompileEngineImpl : public CompileEngineNode {
         src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue<ObjectRef>());
         runtime::Module ext_mod = (*pf)(src_func);
 
-        ICHECK(ext_mod.defined()) << "No external runtime is generated.";
+        // todo(@zhiics, @jroesch): Should this be a user visible error?
+        ICHECK(ext_mod.defined()) << "No external library was generated for " << ext_name
+                                  << "even though it was requested"
+                                     "by the annotated function "
+                                  << PrettyPrint(src_func);
+
         ret.push_back(ext_mod);
       }
     }
@@ -734,44 +181,49 @@ class CompileEngineImpl : public CompileEngineNode {
     // No need to lower external functions for now. We will invoke the external
     // codegen tool once and lower all functions together.
     if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
-      auto cache_node = make_object<CachedFuncNode>();
+      auto ir_module = IRModule();
       const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
       ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
-      cache_node->func_name = std::string(name_node.value());
-      cache_node->target = Target("ext_dev");
-      cache_node->funcs->Add(GlobalVar(cache_node->func_name), key->source_func);
-      value->cached_func = CachedFunc(cache_node);
+      auto func_name = std::string(name_node.value());
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      ir_module->Add(global_var, key->source_func);
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
       return value;
     }
+
     // Enforce use the target.
     With<Target> target_scope(key->target);
 
     ICHECK(!value->cached_func.defined());
-    auto cfunc = CreateSchedule(key->source_func, key->target);
-    auto cache_node = make_object<CachedFuncNode>(*(cfunc.operator->()));
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, &name_map_); });
 
     // Skip lowering for device copy node.
     const Expr body = (key->source_func)->body;
     if (const CallNode* call_node = body.as<CallNode>()) {
       if (call_node->attrs.as<DeviceCopyAttrs>()) {
-        value->cached_func = CachedFunc(cache_node);
+        value->cached_func = cfunc;
         return value;
       }
     }
     cache_node->func_name = GetUniqueName(mangle_fn(cache_node->func_name));
 
     // NOTE: array will copy on write.
-    Array<te::Tensor> all_args = cache_node->inputs;
-    for (te::Tensor arg : cache_node->outputs) {
+    Array<te::Tensor> all_args = Array<te::Tensor>(cfunc->inputs);
+    for (te::Tensor arg : cfunc->outputs) {
       all_args.push_back(arg);
     }
     // lower the function
     std::unordered_map<te::Tensor, tir::Buffer> binds;
-    cache_node->funcs = tvm::LowerSchedule(cfunc->schedule, all_args, cache_node->func_name, binds);
+    auto func_name = cfunc->prim_fn_var->name_hint;
+    cfunc->funcs->Update(tvm::LowerSchedule(cfunc->schedule, all_args, func_name, binds));
+    value->cached_func = cfunc;
 
-    value->cached_func = CachedFunc(cache_node);
     return value;
   }
+
   // implement lowered shape func
   CCacheValue LowerShapeFuncInternal(const CCacheKey& key) {
     std::lock_guard<std::mutex> lock(mutex_);
@@ -790,47 +242,17 @@ class CompileEngineImpl : public CompileEngineNode {
     With<Target> target_scope(key->target);
 
     ICHECK(!value->cached_func.defined());
-    auto spair = MakeShapeFunc().Create(key->source_func);
-    auto cache_node = make_object<CachedFuncNode>(*(spair.second.operator->()));
-    cache_node->func_name = GetUniqueName(cache_node->func_name);
-    cache_node->target = key->target;
-
-    Array<te::Tensor> all_args = cache_node->inputs;
-    for (te::Tensor arg : cache_node->outputs) {
-      all_args.push_back(arg);
-    }
-
     using tvm::transform::PassContext;
     With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
 
-    std::unordered_map<te::Tensor, tir::Buffer> binds;
-    cache_node->funcs = tvm::LowerSchedule(spair.first, all_args, cache_node->func_name, binds);
-    value->cached_func = CachedFunc(cache_node);
+    auto cached_func = ShapeFuncFor(key->source_func, key->target, [&](std::string name) {
+      return GetUniqueName(name, &name_map_);
+    });
+
+    value->cached_func = cached_func;
     return value;
   }
-  /*!
-   * \brief Get unique name from name.
-   * \param name The orginal name.
-   * \return Updated name which is unique.
-   */
-  std::string GetUniqueName(std::string name) {
-    for (size_t i = 0; i < name.length(); ++i) {
-      if (name[i] == '.') name[i] = '_';
-    }
-    while (true) {
-      auto it = name_map_.find(name);
-      if (it == name_map_.end()) {
-        name_map_[name] = 1;
-        return name;
-      } else {
-        std::ostringstream os;
-        os << name << "_" << it->second;
-        ++(it->second);
-        name = os.str();
-      }
-    }
-    return name;
-  }
+
   /*! \brief compiler cache lock*/
   std::mutex mutex_;
   /*! \brief internal name map to get an unique name */
diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h
index f766fcf97ea71..94f2db0659376 100644
--- a/src/relay/backend/compile_engine.h
+++ b/src/relay/backend/compile_engine.h
@@ -19,8 +19,12 @@
 
 /*!
  * \file relay/backend/compile_engine.h
- * \brief Internal compialtion engine handle function cache.
- *  and interface to low level code generation.
+ * \brief Internal compilation layer which lowers Relay "primitive functions" to TIR PrimFns.
+ *
+ * This layer represents the older design of the Relay compilation flow and is being deprecated
+ * in favor of te_compiler.h which is a migration step towards a standard pass based lowering of
+ * Relay functions.
+ *
  */
 #ifndef TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
 #define TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
@@ -36,157 +40,12 @@
 #include <functional>
 #include <string>
 
+#include "te_compiler_cache.h"
+
 namespace tvm {
 namespace relay {
 
-/*! \brief Indicate whether the data or shape or both of a parameter is used in the shape func. */
-enum ShapeFuncParamState {
-  kNoNeed = 0,
-  kNeedInputData = 1,
-  kNeedInputShape = 2,
-  kNeedBoth = 3,
-};
-
-struct LoweredOutputNode : public Object {
-  /*! \brief The outputs to the function */
-  tvm::Array<te::Tensor> outputs;
-  /*! \brief The implementation used to compute the output */
-  OpImplementation implementation;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("outputs", &outputs);
-    v->Visit("implementation", &implementation);
-  }
-
-  static constexpr const char* _type_key = "relay.LoweredOutput";
-  TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object);
-};
-
-class LoweredOutput : public ObjectRef {
- public:
-  TVM_DLL LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(LoweredOutput, ObjectRef, LoweredOutputNode);
-};
-
-/*! \brief Node container to represent a cached function. */
-struct CachedFuncNode : public Object {
-  /* \brief compiled target */
-  tvm::Target target;
-  /*! \brief Function name */
-  std::string func_name;
-  /* \brief The inputs to the function */
-  tvm::Array<te::Tensor> inputs;
-  /* \brief The outputs to the function */
-  tvm::Array<te::Tensor> outputs;
-  /*! \brief The schedule to the function */
-  te::Schedule schedule;
-  /*! \brief The lowered functions to support the function. */
-  IRModule funcs = IRModule(Map<GlobalVar, BaseFunc>({}));
-
-  /*! \brief Parameter usage states in the shape function. */
-  tvm::Array<Integer> shape_func_param_states;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("target", &target);
-    v->Visit("func_name", &func_name);
-    v->Visit("inputs", &inputs);
-    v->Visit("outputs", &outputs);
-    v->Visit("schedule", &schedule);
-    v->Visit("funcs", &funcs);
-    v->Visit("shape_func_param_states", &shape_func_param_states);
-  }
-
-  static constexpr const char* _type_key = "relay.CachedFunc";
-  TVM_DECLARE_FINAL_OBJECT_INFO(CachedFuncNode, Object);
-};
-
-class CachedFunc : public ObjectRef {
- public:
-  TVM_DEFINE_OBJECT_REF_METHODS(CachedFunc, ObjectRef, CachedFuncNode);
-};
-
-class CCacheKey;
-/*! \brief Compile cache key */
-class CCacheKeyNode : public Object {
- public:
-  /*! \brief The source function to be lowered. */
-  Function source_func;
-  /*! \brief The hardware target.*/
-  Target target;
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("source_func", &source_func);
-    v->Visit("target", &target);
-  }
-  /*! \return The hash value of CCacheKey. */
-  inline size_t Hash() const;
-  /*!
-   * \brief check content equality
-   * \param other The other value.
-   * \return The result of equality check.
-   */
-  inline bool Equal(const CCacheKeyNode* other) const;
-
-  static constexpr const char* _type_key = "relay.CCacheKey";
-  TVM_DECLARE_FINAL_OBJECT_INFO(CCacheKeyNode, tvm::Object);
-
- private:
-  /*!
-   * \brief internal cached hash value.
-   */
-  mutable size_t hash_{0};
-};
-
-/*! \brief cache entry used in compile engine */
-class CCacheKey : public ObjectRef {
- public:
-  CCacheKey() {}
-  explicit CCacheKey(ObjectPtr<Object> n) : ObjectRef(n) {}
-
-  /*!
-   * \brief The constructor
-   * \param source_func The source function.
-   * \param target The target device.
-   */
-  TVM_DLL CCacheKey(Function source_func, Target target);
-
-  const CCacheKeyNode* operator->() const { return static_cast<const CCacheKeyNode*>(get()); }
-  // comparator
-  inline bool operator==(const CCacheKey& other) const {
-    ICHECK(defined() && other.defined());
-    return (*this)->Equal(other.operator->());
-  }
-  using ContainerType = CCacheKeyNode;
-};
-
-/*! \brief Node container for compile cache. */
-class CCacheValueNode : public Object {
- public:
-  /*! \brief The corresponding function */
-  CachedFunc cached_func;
-  /*! \brief Result of Packed function generated by JIT */
-  PackedFunc packed_func;
-  /*! \brief usage statistics */
-  int use_count{0};
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("cached_func", &cached_func);
-    v->Visit("use_count", &use_count);
-  }
-  static constexpr const char* _type_key = "relay.CCacheValue";
-  TVM_DECLARE_FINAL_OBJECT_INFO(CCacheValueNode, tvm::Object);
-};
-
-/*! \brief cache entry used in compile engine */
-class CCacheValue : public ObjectRef {
- public:
-  CCacheValue() {}
-  explicit CCacheValue(ObjectPtr<Object> n) : ObjectRef(n) {}
-  CCacheValueNode* operator->() { return static_cast<CCacheValueNode*>(get_mutable()); }
-  const CCacheValueNode* operator->() const { return static_cast<const CCacheValueNode*>(get()); }
-  using ContainerType = CCacheValueNode;
-};
+using namespace tvm::relay::tec;
 
 /*!
  * \brief Backend compilation engine for
@@ -242,49 +101,7 @@ class CompileEngine : public ObjectRef {
   TVM_DLL static CompileEngine& Global();
 };
 
-/*!
- * \brief Create schedule for target.
- * \param source_func The primitive function to be lowered.
- * \param target The target we want to create schedule for.
- * \return Pair of schedule and cache.
- *  The funcs field in cache is not yet populated.
- */
-CachedFunc CreateSchedule(const Function& source_func, const Target& target);
-
-/*!
- * \brief Check if the type is dynamic.
- * \param ty The type to be checked.
- * \return The result.
- */
-bool IsDynamic(const Type& ty);
-
-// implementations
-inline size_t CCacheKeyNode::Hash() const {
-  if (hash_ != 0) return hash_;
-  // do structral hash, avoid 0.
-  hash_ = tvm::StructuralHash()(this->source_func);
-  hash_ = dmlc::HashCombine(hash_, std::hash<std::string>()(target->str()));
-  if (hash_ == 0) hash_ = 1;
-  return hash_;
-}
-
-inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const {
-  if (Hash() != other->Hash()) return false;
-  return this->target->str() == other->target->str() &&
-         tvm::StructuralEqual()(this->source_func, other->source_func);
-}
-
 }  // namespace relay
 }  // namespace tvm
 
-namespace std {
-// overload hash
-template <>
-struct hash<::tvm::relay::CCacheKey> {
-  size_t operator()(const ::tvm::relay::CCacheKey& key) const {
-    ICHECK(key.defined());
-    return key->Hash();
-  }
-};
-}  // namespace std
 #endif  // TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc
index bca8e82440937..1e5c74ef4b1c0 100644
--- a/src/relay/backend/graph_executor_codegen.cc
+++ b/src/relay/backend/graph_executor_codegen.cc
@@ -25,6 +25,7 @@
 #include <dmlc/any.h>
 #include <dmlc/json.h>
 #include <tvm/ir/module.h>
+#include <tvm/relay/attrs/annotation.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/object.h>
@@ -36,10 +37,13 @@
 #include <vector>
 
 #include "compile_engine.h"
+#include "te_compiler.h"
 #include "utils.h"
 
 namespace tvm {
 namespace relay {
+// TODO(@jroesch, @csullivan): declare directly elsewhere
+backend::StaticMemoryPlan GraphPlanMemory(const Function& func);
 namespace backend {
 
 class GraphNode;
@@ -52,7 +56,6 @@ using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
 using GraphObjectPtr = std::shared_ptr<GraphNode>;
 using GraphInputObjectPtr = std::shared_ptr<GraphInputNode>;
 using GraphOpObjectPtr = std::shared_ptr<GraphOpNode>;
-using TargetsMap = std::unordered_map<int, Target>;
 
 /*! \brief Node types */
 enum GraphNodeType {
@@ -176,112 +179,89 @@ class GraphOpNode : public GraphNode {
   const std::string op_type_name_{"tvm_op"};
 };
 
-/*! \brief Code generator for graph executor */
+/*! \brief Code generator for the graph executor, produces a module containing the graph JSON,
+ * module, and parameters.
+ */
 class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
  public:
-  GraphExecutorCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) {
-    compile_engine_ = CompileEngine::Global();
+  GraphExecutorCodegen(runtime::Module* mod, const TargetMap& targets) : mod_(mod) {
     targets_ = targets;
   }
 
-  /*!
-   * \brief Update the "main" control function's metadata
-   *
-   * \param func The main function that contains calls to relay primitive functions
-   */
-  void UpdateMainWorkspaceSize(const Function& func) {
-    // This is a Map<device,Map<storage_id, size>>
-    std::unordered_map<int, std::unordered_map<int, int>> sid_workspace;
-    // This is a Map<device, size_of_inputs_and_outputs>
-    std::unordered_map<int, int> device_io;
-    // This is a Map<device, size_of_constants>
-    std::unordered_map<int, int> device_consts;
-
-    // Initialize the maps to zero
-    for (const auto& kv : storage_device_map_) {
-      auto sids = kv.second[0];
-      auto devices = kv.second[1];
-      CHECK_EQ(sids.size(), devices.size());
-      for (uint32_t i = 0; i < sids.size(); i++) {
-        sid_workspace[devices[i]][sids[i]] = 0;
-        device_io[devices[i]] = 0;
-        device_consts[devices[i]] = 0;
-      }
-    }
+  StorageInfo GetStorageInfo(const Expr& e) {
+    size_t count = memory_plan_->expr_to_storage_info.count(e);
+    ICHECK_GT(count, 0) << "Expr is not existing in storage plan";
+    auto storage_info = memory_plan_->expr_to_storage_info[e];
+    return storage_info;
+  }
 
-    // Collect sizes of tensors
-    for (const auto& kv : storage_device_map_) {
-      auto size_bytes = CalculateRelayExprSizeBytes(kv.first->checked_type());
-      auto sids = kv.second[0];
-      auto devices = kv.second[1];
-      if (kv.first->IsInstance<ConstantNode>()) {
-        for (const auto& dev : devices) {
-          device_consts[dev] += size_bytes;
-        }
-        continue;
-      } else if (kv.first->IsInstance<VarNode>() || kv.first == func->body) {
-        for (const auto& dev : devices) {
-          device_io[dev] += size_bytes;
-        }
-        continue;
-      }
-      for (uint32_t i = 0; i < sids.size(); i++) {
-        // Here we record the largest size of the tensor
-        // that share the same storage id, because storage_id will
-        // be shared between multiple tensors that are not live simultaneously.
-        if (size_bytes > sid_workspace[devices[i]][sids[i]]) {
-          sid_workspace[devices[i]][sids[i]] = size_bytes;
-        }
-      }
-    }
+  LoweredOutput Codegen(relay::Function func) {
+    // TODO(@jroesch): we need to split device planning and memory planning
+    // first we run device assignment, then we perform lowering, and then
+    // storage planning in ideal world.
 
-    // This is a Map<device, workspace_size>
-    std::unordered_map<int, int> device_workspace;
-    // Once we know the sizes of sids, we need to accumulate per device
-    for (const auto& dev_sid_size : sid_workspace) {
-      auto dev = dev_sid_size.first;
-      device_workspace[dev] = 0;
-      for (const auto& sid_size : dev_sid_size.second) {
-        device_workspace[dev] += sid_size.second;
-      }
-    }
+    memory_plan_ = GraphPlanMemory(func);
 
-    // Populate FunctionInfo
-    auto fi_node = make_object<FunctionInfoNode>();
-    // Initialize all target workspaces to zero
-    for (const auto& kv : targets_) {
-      auto tgt = kv.second;
-      fi_node->workspace_sizes.Set(tgt, 0);
-    }
-    for (const auto& dev_and_size : device_workspace) {
-      auto tgt = GetTargetFromInteger(dev_and_size.first);
-      fi_node->workspace_sizes.Set(tgt, dev_and_size.second);
-      fi_node->relay_primfuncs.Set(tgt, func);
-    }
-    for (const auto& dev_and_size : device_io) {
-      auto tgt = GetTargetFromInteger(dev_and_size.first);
-      fi_node->io_sizes.Set(tgt, dev_and_size.second);
-    }
-    for (const auto& dev_and_size : device_consts) {
-      auto tgt = GetTargetFromInteger(dev_and_size.first);
-      fi_node->constant_sizes.Set(tgt, dev_and_size.second);
+    // This first phase moves from implicit use of compile engine,
+    // to instead explicitly lowering the incoming IRModule, and then
+    // performing the preexisting graph executor code generation phase.
+    IRModule mod = IRModule::FromExpr(func);
+
+    // Build a map from each operation to device.
+    tec::DeviceMap device_context_map;
+    for (const auto& it : memory_plan_->expr_to_storage_info) {
+      auto expr = it.first;
+      auto storage_info = it.second;
+      auto device_types = storage_info->device_types;
+      // CHECK_EQ(device_types.size(), 1);
+      tvm::Device dev;
+      dev.device_id = 0;
+      dev.device_type = device_types[0];
+      device_context_map.insert({expr, dev});
     }
 
-    function_metadata_.Set(String(runtime::symbol::tvm_module_main), FunctionInfo(fi_node));
-  }
 
-  LoweredOutput Codegen(relay::Function func, String mod_name) {
-    auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
-    storage_device_map_ = (*pf)(func);
-    mod_name_ = mod_name;
-    UpdateMainWorkspaceSize(func);
+    auto lowered_module = tec::LowerTE(
+        mod, targets_, device_context_map,
+        [this](Function func) {
+          std::cout << "\n\n\n\n\n\nThe lambda is called\n\n\n\n\n\n" << std::endl;
+
+          // We need to maintain the constant map for external functions so we pass this
+          // processing function which allows us to process each function as we lower it.
+          if (func->GetAttr<String>(attr::kCompiler).defined()) {
+            UpdateConstants(func, &params_);
+          }
+
+          // TODO(@areusch, @jroesch): We should refactor this to execute as a further pass,
+          // instead writing data to the lowering process directly.
+          UpdateFunctionMetadata(func, this->function_metadata_);
+        },
+        memory_plan_);
+
+    std::cout << "RIGHT" << this->function_metadata_ << std::endl;
+    function_metadata_.Set(runtime::symbol::tvm_module_main, lowered_module.main_func_info);
+    auto main_module = lowered_module.main_module;
+    std::cout << "MainModule: " << main_module << std::endl;
+    main_module = relay::transform::InferType()(main_module);
+    relay::Function main_func = Downcast<relay::Function>(main_module->Lookup("main"));
+
+    // Now that we have lowered all operators to TIR code, we can proceed with compilation.
+    //
+    // We need to unfortunately re-plan as the previous results have been invalidated by lowering
+    // we will fix this in future refactors.
+    memory_plan_ = GraphPlanMemory(main_func);
+
+    // The graph planner also can not handle planning calls to global variables to we must remap
+
     // First we convert all the parameters into input nodes.
-    for (auto param : func->params) {
+    for (auto param : main_func->params) {
       auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs());
       var_map_[param.get()] = AddNode(node_ptr, param);
     }
-    heads_ = VisitExpr(func->body);
+
+    heads_ = VisitExpr(main_func->body);
     std::ostringstream os;
+
     dmlc::JSONWriter writer(&os);
     GetJSON(&writer);
     LoweredOutput ret;
@@ -292,17 +272,10 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
           param.first,
           std::make_pair(static_cast<int>(param_storage_ids_[param.first]), param.second)));
     }
-
-    for (auto& kv : lowered_funcs_) {
-      if (ret.lowered_funcs.count(kv.first) == 0) {
-        ret.lowered_funcs.Set(kv.first, IRModule(Map<GlobalVar, BaseFunc>({})));
-      }
-      auto& mod = ret.lowered_funcs[kv.first];
-      mod->Update(kv.second);
-      ret.lowered_funcs.Set(kv.first, mod);
-    }
-    ret.external_mods = compile_engine_->LowerExternalFunctions();
+    std::cout << function_metadata_ << std::endl;
     ret.function_metadata = std::move(function_metadata_);
+    ret.lowered_funcs = lowered_module.per_target_module;
+    ret.external_mods = lowered_module.external_mods;
     return ret;
   }
 
@@ -331,20 +304,18 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
    */
   std::vector<GraphNodeRef> AddNode(GraphObjectPtr node, Expr expr) {
     auto checked_type = expr->checked_type();
-    size_t count = storage_device_map_.count(expr);
-    ICHECK_GT(count, 0) << "Expr is not existing in storage plan";
-    auto storage_device_info = storage_device_map_[expr];
-    ICHECK_EQ(storage_device_info.size(), 3);
+
+    auto storage_info = GetStorageInfo(expr);
     // storage
-    std::vector<int64_t> storage_info;
-    for (auto& v : storage_device_info[0]) {
-      storage_info.push_back(v->value);
+    std::vector<int64_t> storage_ids;
+    for (auto v : storage_info->storage_ids) {
+      storage_ids.push_back(v);
     }
-    node->attrs_["storage_id"] = std::move(storage_info);
+    node->attrs_["storage_id"] = std::move(storage_ids);
     // type
     std::vector<int64_t> device_types;
-    for (auto& v : storage_device_info[1]) {
-      device_types.push_back(v->value);
+    for (auto v : storage_info->device_types) {
+      device_types.push_back(static_cast<int64_t>(v));
     }
     size_t num_unknown_devices = std::count(device_types.begin(), device_types.end(), 0);
     if (num_unknown_devices != 0 && num_unknown_devices != device_types.size()) {
@@ -404,7 +375,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     auto node = GraphInputNode::make_node_ptr(name, GraphAttrs());
     auto to_return = AddNode(node, expr);
     CHECK_EQ(to_return.size(), 1) << "Expected exactly 1 parameter node created";
-    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    param_storage_ids_[name] = GetStorageInfo(expr)->storage_ids[0];
     params_[name] = op->data;
     return to_return;
   }
@@ -420,8 +391,18 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     return fields;
   }
 
-  std::vector<GraphNodeRef> GraphAddCallNode(const CallNode* op, const std::string& op_name,
-                                             const std::string& func_name, GraphAttrs attrs) {
+  bool ShareSameStorage(const Expr& lhs, const Expr& rhs) {
+    StorageInfo lit = GetStorageInfo(lhs);
+    StorageInfo rit = GetStorageInfo(rhs);
+    int64_t lhs_storage_id = lit->storage_ids[0];
+    int64_t rhs_storage_id = rit->storage_ids[0];
+    std::cout << "lhs_storage_id " << lhs_storage_id << std::endl;
+    std::cout << "rhs_storage_id " << rhs_storage_id << std::endl;
+    return lhs_storage_id == rhs_storage_id;
+  }
+
+  std::vector<GraphNodeRef> GraphAddCallNode(const CallNode* op, const std::string& func_name,
+                                             GraphAttrs op_attrs) {
     std::vector<GraphNodeRef> inputs;
     for (auto arg : op->args) {
       auto res = VisitExpr(arg);
@@ -429,161 +410,44 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
         inputs.push_back(nr);
       }
     }
-    auto node = GraphOpNode::make_node_ptr(op_name, GraphAttrs(), func_name, inputs, attrs);
-    return AddNode(node, GetRef<Expr>(op));
-  }
-
-  bool ShareSameStorage(const Expr& lhs, const Expr& rhs) {
-    auto lit = storage_device_map_.find(lhs);
-    auto rit = storage_device_map_.find(rhs);
-    ICHECK(lit != storage_device_map_.end());
-    ICHECK(rit != storage_device_map_.end());
-    int64_t lhs_storage_id = ((*lit).second)[0][0]->value;
-    int64_t rhs_storage_id = ((*rit).second)[0][0]->value;
-    return lhs_storage_id == rhs_storage_id;
-  }
 
-  /*!
-   * \brief Obtain the Target from the device type.
-   * If homogenous compilation, this will return the only target.
-   * If heteregenous compilation, this will select associated using the targets_ Map.
-   *
-   * \param dev_type
-   * \return Target
-   */
-  Target GetTargetFromInteger(int64_t dev_type) {
-    if (targets_.size() == 1) {
-      // homogeneous execution.
-      const auto& it = targets_.begin();
-      return (*it).second;
-    } else {
-      // heterogeneous execution.
-      std::string call_dev_name;
-      if (dev_type == 0) {
-        call_dev_name = "llvm";
-      } else {
-        call_dev_name = runtime::DeviceName(dev_type);
-      }
-      if (targets_.count(dev_type) == 0) {
-        LOG(FATAL) << "No target is provided for device " << call_dev_name;
-      }
-      return targets_[dev_type];
+    /// An adapted version of the storage optimization for the time being.
+    bool reshape_only = false;
+    if (op->attrs.defined() && op->attrs.as<TIRCallAttrs>()) {
+      reshape_only = true;
+      std::cout << "should reshape" << std::endl;
     }
-  }
 
-  /*!
-   * \brief Update the function metadata for a given cached function and its relay
-   * primitive function.
-   *
-   * \param cfunc The cached function as provided the by the compile engine
-   * \param relay_func The source relay primitive function
-   * \param relay_target The target associated with relay primitive function
-   */
-  void UpdateFunctionMetadata(const CachedFunc& cfunc, const Function& relay_func,
-                              const Target& relay_target) {
-    auto fi_node = make_object<FunctionInfoNode>();
-    for (const auto& kv : cfunc->funcs->functions) {
-      auto primfunc = Downcast<tir::PrimFunc>(kv.second);
-      auto workspace_byte_alignment = relay_target->GetAttr<Integer>("workspace-byte-alignment")
-                                          .value_or(tvm::runtime::kDefaultWorkspaceAlignment);
-      Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
-      Target primfunc_target = relay_target;
-      if (primfunc->attrs->dict.count("target")) {
-        primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
-      }
-      fi_node->workspace_sizes.Set(primfunc_target, workspace_size);
-      // Calculating size for I/O
-      for (auto const& param : primfunc->params) {
-        auto p_shape = primfunc->buffer_map[param]->shape;
-        int num_of_elements = 1;
-        for (const auto& dim_index_expr : p_shape) {
-          if (dim_index_expr->IsInstance<IntImmNode>()) {
-            num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
-          } else {
-            // If shape is dynamic, we cannot calculate workspace in compile time.
-            num_of_elements = 0;
-          }
-        }
-        int element_size = primfunc->buffer_map[param]->dtype.bytes();
-        fi_node->io_sizes.Set(primfunc_target, element_size * num_of_elements);
-      }
-      fi_node->constant_sizes.Set(primfunc_target, 0);
-      fi_node->tir_primfuncs.Set(primfunc_target, primfunc);
-      fi_node->relay_primfuncs.Set(primfunc_target, relay_func);
-    }
-    function_metadata_.Set(cfunc->func_name, FunctionInfo(fi_node));
-  }
-
-  std::vector<GraphNodeRef> VisitExpr_(const CallNode* op) override {
-    Expr expr = GetRef<Expr>(op);
-    Function func;
-    if (op->op.as<OpNode>()) {
-      LOG(FATAL) << "Operators should be transformed away; try applying"
-                 << "the fuse_ops transformation to the expression.";
-    } else if (op->op.as<GlobalVarNode>()) {
-      LOG(FATAL) << "Not implemented";
-    } else if (op->op.as<FunctionNode>()) {
-      func = GetRef<Function>(op->op.as<FunctionNode>());
-    } else {
-      LOG(FATAL) << "TVM runtime does not support calls to " << op->op->GetTypeKey();
-    }
-    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
-      LOG(FATAL) << "TVM only support calls to primitive functions "
-                 << "(i.e functions composed of fusable operator invocations)";
-    }
-
-    // Copy attrs from function into the graph node
-    // For now we only handle strings
-    GraphAttrs attrs;
-    for (auto p : func->attrs->dict) {
-      if (p.second.as<StringObj>()) {
-        attrs[p.first] = std::string(Downcast<String>(p.second));
-      }
+    std::cout << "Op: " << GetRef<Expr>(op) << std::endl;
+    std::cout << "First Arg: " << op->args[0] << std::endl;
+    if (reshape_only && ShareSameStorage(GetRef<Expr>(op), op->args[0])) {
+      auto node =
+          GraphOpNode::make_node_ptr("reshape_nop", GraphAttrs(), "__nop", inputs, op_attrs);
+      std::cout << "Firing storage optimization" << std::endl;
+      return AddNode(node, GetRef<Expr>(op));
     }
 
-    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
-    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
-    Target target;
-    // Handle external function
-    if (func->GetAttr<String>(attr::kCompiler).defined()) {
-      target = Target("ext_dev");
-      CCacheKey key = (*pf0)(func, target);
-      CachedFunc ext_func = (*pf1)(compile_engine_, key, mod_name_);
-      ICHECK(ext_func.defined()) << "External function is not defined.";
-      UpdateConstants(func, &params_);
-      return GraphAddCallNode(op, ext_func->func_name, ext_func->func_name, attrs);
-    }
-
-    // In the current flat memory allocation scenario
-    // the flat memory allocator can always allocate input
-    // and output of the reshape to the same memory, we can turn reshape only
-    // function to a nop.
-    //
-    // NOTE that for non-flat memory this is not necessarily true.
-    //
-    // TODO(tvm-team) Update checks of flat memory enablement when we support
-    // opaque-nd memory planning to skip this path.
-    if (func->HasNonzeroAttr(attr::kReshapeOnly) && ShareSameStorage(expr, op->args[0])) {
-      return GraphAddCallNode(op, "reshape_nop", "__nop", attrs);
-    }
+    // Compute the operator name, because we used the get unique name when generating the kernel.
+    auto op_name = _GetUniqueName(func_name);
+    auto node = GraphOpNode::make_node_ptr(op_name, GraphAttrs(), func_name, inputs, op_attrs);
+    return AddNode(node, GetRef<Expr>(op));
+  }
 
-    ICHECK_GE(storage_device_map_.count(expr), 0);
-    auto& device_type = storage_device_map_[expr][1];
-    auto call_dev_type = device_type[0]->value;
-    target = GetTargetFromInteger(call_dev_type);
-    // Normal Relay Function
+  std::vector<GraphNodeRef> VisitExpr_(const CallNode* call_node) override {
+    relay::Call call = GetRef<Call>(call_node);
+    if (auto global_node = call->op.as<GlobalVarNode>()) {
+      auto prim_fn_name = global_node->name_hint;
 
-    CCacheKey key = (*pf0)(func, target);
-    CachedFunc lowered_func = (*pf1)(compile_engine_, key, mod_name_);
-    if (!lowered_funcs_.count(target->str())) {
-      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+      // TODO(@jroesch): attach attributes somehow
+      return GraphAddCallNode(call_node, prim_fn_name, GraphAttrs());
+    } else {
+      ICHECK(false) << "Non-primitive-call nodes should have been transformed away.\n"
+                    << "The graph executor code generator expects all calls to have their callee "
+                       "normalized to a GlobalVar but found a "
+                    << call->GetTypeKey() << "."
+                    << "AST: " << PrettyPrint(call) << PrettyPrint(call) << std::endl;
+      return {};
     }
-    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
-
-    // Update function metadata via looking at all primfuncs
-    UpdateFunctionMetadata(lowered_func, func, target);
-    return GraphAddCallNode(op, _GetUniqueName(lowered_func->func_name), lowered_func->func_name,
-                            attrs);
   }
 
   std::vector<GraphNodeRef> VisitExpr_(const LetNode* op) override {
@@ -714,7 +578,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   /*! \brief variable map */
   std::unordered_map<const Object*, std::vector<GraphNodeRef>> var_map_;
   /*! \brief target device */
-  TargetsMap targets_;
+  TargetMap targets_;
   /*!
    * \brief parameters (i.e. ConstantNodes found in the graph).
    * These are take as inputs to the GraphExecutor.
@@ -724,7 +588,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   std::unordered_map<std::string, runtime::NDArray> params_;
   std::unordered_map<std::string, int64_t> param_storage_ids_;
   /*! \brief plan memory of device result */
-  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  StaticMemoryPlan memory_plan_;
   /*! \brief the module name we use to mangle the function names */
   String mod_name_;
   /*! \brief lowered funcs */
@@ -733,8 +597,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
   Map<String, FunctionInfo> function_metadata_;
   /*! \brief name map */
   std::unordered_map<std::string, size_t> name_map_;
-  /*! \brief compile engine */
-  CompileEngine compile_engine_;
 };
 
 class GraphExecutorCodegenModule : public runtime::ModuleNode {
@@ -747,11 +609,11 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode {
                                     << "runtime::Module mod and Map<int, Target> targets";
         void* mod = args[0];
         Map<Integer, tvm::Target> tmp = args[1];
-        TargetsMap targets;
+        TargetMap targets;
         for (const auto& it : tmp) {
           auto dev_type = it.first.as<tir::IntImmNode>();
           ICHECK(dev_type);
-          targets[dev_type->value] = it.second;
+          targets[static_cast<DLDeviceType>(dev_type->value)] = it.second;
         }
         codegen_ = std::make_shared<GraphExecutorCodegen>(reinterpret_cast<runtime::Module*>(mod),
                                                           targets);
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 351469d6e1caf..979f87e9233a6 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -23,15 +23,19 @@
  *   the program in the graph executor.
  */
 #include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/annotation.h>
 #include <tvm/relay/expr.h>
 #include <tvm/relay/expr_functor.h>
 #include <tvm/tir/op.h>
 
 #include "../../support/arena.h"
+#include "./utils.h"
 
 namespace tvm {
 namespace relay {
 
+using backend::StaticMemoryPlan;
+using backend::StorageInfo;
 using IntegerArray = Array<Integer>;
 
 struct StorageToken {
@@ -114,7 +118,8 @@ class StorageAllocaBaseVisitor : public ExprVisitor {
   const std::vector<StorageToken*>& GetToken(const Expr& expr) {
     this->VisitExpr(expr);
     auto it = token_map_.find(expr.operator->());
-    ICHECK(it != token_map_.end());
+    ICHECK(it != token_map_.end())
+        << "Expression: `" << PrettyPrint(expr) << "` not found in storage map.";
     return it->second;
   }
   /*!
@@ -168,6 +173,7 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor {
   void VisitExpr_(const CallNode* op) final {
     // create token for the call node.
     CreateToken(op, true);
+
     // for each input, visit argument token.
     for (Expr arg : op->args) {
       for (StorageToken* tok : GetToken(arg)) {
@@ -196,31 +202,32 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
   }
 
   // Run storage allocation for a function.
-  Map<Expr, Array<IntegerArray> > Plan(const Function& func) {
+  StaticMemoryPlan Plan(const Function& func) {
     prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
     this->Run(func);
 
     // The value of smap contains two integer arrays where the first array
     // contains the planned storage ids and the second holds the device types.
-    Map<Expr, Array<IntegerArray> > smap;
+    Map<Expr, backend::StorageInfo> smap;
     int num_annotated_nodes = 0;
     int num_nodes = 0;
 
     for (const auto& kv : token_map_) {
-      std::vector<Integer> storage_ids;
-      std::vector<Integer> device_types;
-      std::vector<Integer> sid_sizes_byte;
+      std::vector<int64_t> storage_ids;
+      std::vector<DLDeviceType> device_types;
+      std::vector<int64_t> sid_sizes_byte;
+
       for (StorageToken* tok : kv.second) {
         if (tok->device_type) {
           num_annotated_nodes++;
         }
         num_nodes++;
         storage_ids.push_back(tok->storage_id);
-        device_types.push_back(tok->device_type);
+        device_types.push_back(static_cast<DLDeviceType>(tok->device_type));
         sid_sizes_byte.push_back(GetMemorySize(tok));
       }
-      smap.Set(GetRef<Expr>(kv.first),
-               Array<IntegerArray>({storage_ids, device_types, sid_sizes_byte}));
+      auto storage_info = backend::StorageInfo(storage_ids, device_types, sid_sizes_byte);
+      smap.Set(GetRef<Expr>(kv.first), storage_info);
     }
     // Either all or none of the nodes should be annotated.
     if (num_annotated_nodes != 0 && num_annotated_nodes != num_nodes) {
@@ -228,7 +235,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
                  << "expressions are assigned with virtual device types. Either all "
                     "or none of the expressions are expected to be annotated.";
     }
-    return smap;
+
+    return backend::StaticMemoryPlan(smap);
   }
 
  protected:
@@ -279,6 +287,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
         args.push_back(tok);
       }
     }
+
     // Under the flat-memory setting.
     // we can force aliasing the input and output of reshape
     // to make it an nop. Note that this is not true
@@ -294,6 +303,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
       // create token for the call node.
       CreateToken(op, true);
     }
+
     // check if there is orphaned output that can be released immediately.
     for (StorageToken* tok : token_map_.at(op)) {
       CheckForRelease(tok);
@@ -320,6 +330,9 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     if (const auto* fn = call->op.as<FunctionNode>()) {
       return fn->HasNonzeroAttr(attr::kReshapeOnly);
     }
+    if (call->attrs.defined() && call->attrs.as<TIRCallAttrs>()) {
+      return true;
+    }
     return false;
   }
   /*!
@@ -419,9 +432,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
   std::unordered_map<const ExprNode*, std::vector<StorageToken*> > prototype_;
 };
 
-Map<Expr, Array<IntegerArray> > GraphPlanMemory(const Function& func) {
-  return StorageAllocator().Plan(func);
-}
+StaticMemoryPlan GraphPlanMemory(const Function& func) { return StorageAllocator().Plan(func); }
 
 TVM_REGISTER_GLOBAL("relay.backend.GraphPlanMemory").set_body_typed(GraphPlanMemory);
 
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
index eeba010dc1644..53985c78a33c8 100644
--- a/src/relay/backend/interpreter.cc
+++ b/src/relay/backend/interpreter.cc
@@ -32,6 +32,7 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/object.h>
 
+#include "../transforms/pass_utils.h"
 #include "compile_engine.h"
 
 namespace tvm {
@@ -381,7 +382,7 @@ class Interpreter : public ExprFunctor<ObjectRef(const Expr& n)>,
     } else {
       m = build(cfunc->funcs, cfunc->target, Target(nullptr));
     }
-    shape_func = m.GetFunction(cfunc->func_name);
+    shape_func = m.GetFunction(cfunc->prim_fn_var->name_hint);
     shape_func.CallPacked(TVMArgs(values.data(), codes.data(), arity), &rv);
 
     // Get output shapes
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc
new file mode 100644
index 0000000000000..ab453757e66fc
--- /dev/null
+++ b/src/relay/backend/te_compiler.cc
@@ -0,0 +1,676 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "te_compiler.h"
+
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/annotation.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/te/schedule.h>
+#include <tvm/te/schedule_pass.h>
+#include <tvm/topi/tags.h>
+
+#include <functional>
+#include <limits>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../transforms/pass_utils.h"
+#include "te_compiler.h"
+#include "te_compiler_cache.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+// TODO(@jroesch, @csullivan): declare directly elsewhere
+backend::StaticMemoryPlan GraphPlanMemory(const Function& func);
+
+namespace tec {
+
+using namespace tvm::relay::transform;
+
+TVM_REGISTER_OBJECT_TYPE(TECompilerNode);
+
+class TECompilerImpl : public TECompilerNode {
+ public:
+  // Lower the function.
+  CachedFunc Lower(const CCacheKey& key) { return LowerInternal(key)->cached_func; }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) {
+      return value->packed_func;
+    }
+    auto m = build(value->cached_func->funcs, key->target, Target(nullptr));
+    value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint);
+    return value->packed_func;
+  }
+
+  CachedFunc LowerShapeFunc(const CCacheKey& key) final {
+    return LowerShapeFuncInternal(key)->cached_func;
+  }
+
+  Map<String, IRModule> GetLoweredFunctions() {
+    Map<String, IRModule> lowered_functions;
+    for (const auto& it : cache_) {
+      auto source_func = it.first;
+      auto lowered_func = it.second;
+      auto target = source_func->target;
+
+      if (!lowered_functions.count(target->str())) {
+        lowered_functions.Set(target->str(), IRModule(Map<GlobalVar, BaseFunc>({})));
+      }
+
+      lowered_functions[target->str()]->Update(lowered_func->cached_func->funcs);
+    }
+    return lowered_functions;
+  }
+
+  Array<tvm::runtime::Module> LowerExternalFunctions() {
+    Array<tvm::runtime::Module> ret;
+    std::unordered_map<std::string, std::string> cached_symbol;
+    std::vector<CCacheKey> cached_ext_funcs;
+    for (const auto& it : cache_) {
+      auto src_func = it.first->source_func;
+      ICHECK(src_func.defined());
+      if (src_func->GetAttr<String>(attr::kCompiler).defined()) {
+        auto code_gen = src_func->GetAttr<String>(attr::kCompiler);
+        std::string code_gen_name = code_gen.value();
+        cached_ext_funcs.push_back(it.first);
+
+        auto symbol_name = src_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+        ICHECK(symbol_name.defined()) << "No external symbol is set for:\n"
+                                      << AsText(src_func, false);
+
+        std::string sn = symbol_name.value();
+        if (cached_symbol.count(sn)) {
+          cached_symbol[sn] = code_gen_name;
+        } else {
+          ICHECK_NE(sn, code_gen_name)
+              << "Found duplicated symbol: " << sn << " for: " << code_gen_name;
+        }
+
+        std::string ext_name = "relay.ext." + code_gen_name;
+        auto pf = tvm::runtime::Registry::Get(ext_name);
+        ICHECK(pf) << "Failed to find the codegen tool for " << ext_name;
+        // No need to keep compiler attribute at this point, functions have been
+        // extracted for specific codegen.
+        src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue<ObjectRef>());
+        runtime::Module ext_mod = (*pf)(src_func);
+
+        ICHECK(ext_mod.defined()) << "No external runtime is generated.";
+        ret.push_back(ext_mod);
+      }
+    }
+
+    // No need to cache external functions as we collected them all to create
+    // external runtime modules.
+    for (const auto& it : cached_ext_funcs) {
+      cache_.erase(it);
+    }
+    return ret;
+  }
+
+  void Clear() final { cache_.clear(); }
+
+  // List all items in the cache.
+  Array<ObjectRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<ObjectRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+
+  /*!
+   * \brief Get the cache key of the function that is being lowered currently
+   * \return the cache key
+   */
+  CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_object<CCacheValueNode>());
+      value->use_count = 0;
+      if (!backend::IsCompileEngineCacheDisabled()) {
+        cache_[key] = value;
+      }
+    }
+    cur_ccache_key_ = key;
+
+    // No need to lower external functions for now. We will invoke the external
+    // codegen tool once and lower all functions together.
+    if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
+      auto ir_module = IRModule();
+      const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+      ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
+      auto func_name = GetUniqueName(name_node.value(), &name_map_);
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
+      return value;
+    }
+    // Enforce use the target.
+    With<Target> target_scope(key->target);
+
+    ICHECK(!value->cached_func.defined());
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, &name_map_); });
+
+    // Skip lowering for device copy node.
+    const Expr body = (key->source_func)->body;
+    if (const CallNode* call_node = body.as<CallNode>()) {
+      if (call_node->attrs.as<DeviceCopyAttrs>()) {
+        value->cached_func = cfunc;
+        return value;
+      }
+    }
+
+    // NOTE: array will copy on write.
+    Array<te::Tensor> all_args = Array<te::Tensor>(cfunc->inputs);
+    for (te::Tensor arg : cfunc->outputs) {
+      all_args.push_back(arg);
+    }
+
+    std::unordered_map<te::Tensor, tir::Buffer> binds;
+    auto func_name = cfunc->prim_fn_var->name_hint;
+    cfunc->funcs->Update(tvm::LowerSchedule(cfunc->schedule, all_args, func_name, binds));
+    value->cached_func = cfunc;
+    return value;
+  }
+
+  // implement lowered shape func
+  CCacheValue LowerShapeFuncInternal(const CCacheKey& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = shape_func_cache_.find(key);
+    if (it != shape_func_cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_object<CCacheValueNode>());
+      value->use_count = 0;
+      shape_func_cache_[key] = value;
+    }
+    // Enforce use the target.
+    With<Target> target_scope(key->target);
+
+    ICHECK(!value->cached_func.defined());
+
+    using tvm::transform::PassContext;
+    With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
+    auto cached_func = ShapeFuncFor(key->source_func, key->target, [&](std::string name) {
+      return GetUniqueName(name, &name_map_);
+    });
+
+    value->cached_func = cached_func;
+    return value;
+  }
+
+  /*! \brief compiler cache lock*/
+  std::mutex mutex_;
+  /*! \brief internal name map to get an unique name */
+  std::unordered_map<std::string, int> name_map_;
+  /*! \brief internal compiler cache */
+  std::unordered_map<CCacheKey, CCacheValue> cache_;
+  /*! \brief internal compiler cache for shape funcs */
+  std::unordered_map<CCacheKey, CCacheValue> shape_func_cache_;
+  /*! \brief the cache key of the function that is being lowered currently*/
+  CCacheKey cur_ccache_key_;
+};
+
+TECompiler::TECompiler() {
+  auto object = make_object<TECompilerImpl>();
+  data_ = object;
+}
+
+using AnalysisRemapping = std::unordered_map<Expr, Expr, ObjectHash, ObjectEqual>;
+
+class LowerTensorExpr : public ExprMutator {
+ public:
+  LowerTensorExpr(const IRModule& module, const TargetMap& targets, const DeviceMap& device_ctx_map,
+                  ProcessFn process_fn, AnalysisRemapping* prim_fn_to_call, TECompiler compiler)
+      : module_(module),
+        targets_(targets),
+        device_context_map_(device_ctx_map),
+        process_fn(process_fn),
+        prim_fn_to_call(prim_fn_to_call),
+        compiler_(compiler) {}
+
+  Expr VisitExpr_(const CallNode* call) override {
+    Call expr = GetRef<Call>(call);
+    Function func;
+
+    if (call->op.as<FunctionNode>()) {
+      func = GetRef<Function>(call->op.as<FunctionNode>());
+    } else {
+      return ExprMutator::VisitExpr_(call);
+    }
+
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      // Provide a callback hook which allows one-level up code generators to
+      // act when we process a function.
+      this->process_fn(func);
+      return ExprMutator::VisitExpr_(call);
+    }
+
+    // Process inputs.
+    Array<Expr> args;
+    for (size_t i = 0; i < expr->args.size(); i++) {
+      args.push_back(VisitExpr(expr->args[i]));
+    }
+
+    Target target;
+
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = CCacheKey(func, target);
+      CachedFunc ext_func = compiler_->Lower(key);
+      ICHECK(ext_func.defined()) << "Lowering returned undefined function for "
+                                 << ext_func->prim_fn_var->name_hint;
+      // Provide a callback hook which allows one-level up code generators to
+      // act when we process a function.
+      this->process_fn(func);
+
+      auto ret_call = Call(ext_func->prim_fn_var, args, {});
+      (*prim_fn_to_call)[func] = ret_call;
+      return std::move(ret_call);
+    }
+
+    ICHECK_GE(device_context_map_.count(expr), 0)
+        << "Could not find an entry in the device context map for " << PrettyPrint(expr)
+        << "The memory planning was either not performed for this precise node, or there is bug "
+           "in the memory planner.";
+
+    auto& device_context = this->device_context_map_[expr];
+    auto call_dev_type = device_context.device_type;
+
+    // Non-External Relay Function
+    if (targets_.size() == 1) {
+      // The homogeneous execution case, we should only have one target
+      // so we just grab it.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      std::cout << "DeviceType: " << call_dev_type << std::endl;
+      // The heterogeneous execution case we have multiple targets
+      // in this case.
+      //
+      // We need to identify the target and translate.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+        call_dev_type = kDLCPU;
+      } else {
+        call_dev_name = ::tvm::runtime::DeviceName(call_dev_type);
+      }
+
+      if (targets_.count(call_dev_type) == 0) {
+        std::stringstream msg;
+        msg << "No target is specified for provided device name: `" << call_dev_name << "`\n\n";
+        msg << call_dev_name << " mapped to device type (" << call_dev_type
+            << ") which was not found in the target map.\n";
+        msg << "Availible targets: \n";
+        for (auto target : targets_) {
+          msg << "  " << target.first << "-> " << target.second << "\n";
+        }
+        LOG(FATAL) << msg.str();
+      }
+
+      std::cout << "DeviceName: " << call_dev_name << std::endl;
+      target = targets_[call_dev_type];
+      std::cout << "Target: " << target << std::endl;
+    }
+
+    CCacheKey key = CCacheKey(func, target);
+    CachedFunc lowered_func = compiler_->Lower(key);
+
+    Map<GlobalVar, tir::PrimFunc> prim_fns;
+
+    for (auto prim_fn : lowered_func->funcs->functions) {
+      CHECK(prim_fn.second.as<tir::PrimFuncNode>()) << "must be a prim fn";
+      prim_fns.Set(prim_fn.first, Downcast<tir::PrimFunc>(prim_fn.second));
+    }
+
+    // TODO(@areusch, @jroesch): this metadata is for AOT, this should be our interface for AOT
+    relay::Function func_with_metadata = func;
+    func_with_metadata = WithAttr(func_with_metadata, "prim_fn_var", lowered_func->prim_fn_var);
+    func_with_metadata = WithAttr(func_with_metadata, "prim_funcs", prim_fns);
+    func_with_metadata = WithAttr(func_with_metadata, "target", lowered_func->target);
+
+    // Provide a callback hook which allows one-level up code generators to
+    // act when we process a function.
+    this->process_fn(func_with_metadata);
+
+    Attrs attrs;
+    if (func->HasNonzeroAttr(attr::kReshapeOnly)) {
+      std::cout << "marking as reshape only" << std::endl;
+      auto tir_call_attrs = make_object<TIRCallAttrs>();
+      attrs = Attrs(tir_call_attrs);
+    }
+
+    Expr ret_call = Call(lowered_func->prim_fn_var, args, attrs);
+    (*prim_fn_to_call)[func] = ret_call;
+    return ret_call;
+  }
+
+  IRModule module_;
+  TargetMap targets_;
+  DeviceMap device_context_map_;
+  ProcessFn process_fn;
+  AnalysisRemapping* prim_fn_to_call;
+  TECompiler compiler_;
+};
+
+/*!
+ * \brief Obtain the Target from the device type.
+ * If homogenous compilation, this will return the only target.
+ * If heteregenous compilation, this will select associated using the targets_ Map.
+ *
+ * \param dev_type
+ * \return Target
+ */
+Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets) {
+  if (targets.size() == 1) {
+    // homogeneous execution.
+    const auto& it = targets.begin();
+    return (*it).second;
+  } else {
+    // heterogeneous execution.
+    std::string call_dev_name;
+    if (dev_type == 0) {
+      call_dev_name = "llvm";
+    } else {
+      call_dev_name = runtime::DeviceName(dev_type);
+    }
+    if (targets.count(dev_type) == 0) {
+      LOG(FATAL) << "No target is provided for device " << call_dev_name;
+    }
+    return targets[dev_type];
+  }
+}
+
+/*!
+ * \brief Update the "main" control function's metadata
+ *
+ * \param mod The module
+ * \param targets Map of targets
+ * \return function_infos Function info for each function in the module
+ */
+
+backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap targets,
+                                              Map<Expr, backend::StorageInfo> storage_info_map) {
+  CHECK_EQ(mod->functions.size(), 1)
+      << "There should only be one function in the module passed to UpdateMainWorkspaceSize";
+  Function func = Downcast<Function>(mod->Lookup("main"));
+
+  // This is a Map<device,Map<storage_id, size>>
+  std::unordered_map<DLDeviceType, std::unordered_map<int, int>, EnumClassHash> sid_workspace;
+  // This is a Map<device, size_of_inputs_and_outputs>
+  std::unordered_map<DLDeviceType, int, EnumClassHash> device_io;
+  // This is a Map<device, size_of_constants>
+  std::unordered_map<DLDeviceType, int, EnumClassHash> device_consts;
+
+  // Initialize the maps to zero
+  for (const auto& kv : storage_info_map) {
+    backend::StorageInfo storage_info = kv.second;
+    std::vector<int64_t> storage_ids = storage_info->storage_ids;
+    std::vector<DLDeviceType> devices = storage_info->device_types;
+
+    CHECK_EQ(storage_ids.size(), devices.size());
+    for (uint32_t i = 0; i < devices.size(); i++) {
+      sid_workspace[devices[i]][storage_ids[i]] = 0;
+      device_io[devices[i]] = 0;
+      device_consts[devices[i]] = 0;
+    }
+  }
+
+  // Collect sizes of tensors
+  std::cout << "Trying to loop through storage info map " << std::endl;
+  for (const auto& kv : storage_info_map) {
+    Expr expr = kv.first;
+    int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type());
+    std::cout << "Expression size bytes is: " << size_bytes << std::endl;
+    std::cout << "Expression: " << PrettyPrint(expr) << std::endl;
+    backend::StorageInfo storage_info = kv.second;
+    std::vector<int64_t> storage_ids = storage_info->storage_ids;
+    std::vector<DLDeviceType> devices = storage_info->device_types;
+
+    if (expr->IsInstance<ConstantNode>()) {
+      std::cout << "Expr is const" << std::endl;
+      for (const auto& dev : devices) {
+        device_consts[dev] += size_bytes;
+      }
+      continue;
+    } else if (expr->IsInstance<VarNode>() || expr.same_as(func->body)) {
+      std::cout << "Expr is var or func body" << std::endl;
+      CHECK_GE(devices.size(), 1) << "must be at least one device";
+      for (const auto& dev : devices) {
+        device_io[dev] += size_bytes;
+      }
+      continue;
+    }
+
+    // TODO(@electriclilies): This code is never being called which means sid_workspace is not
+    // updated.. This means that storage info is probably not being created correctly. Or is not
+    // equivalent to what was here previously
+    std::cout << "Looping through storage ids, compare sid to sid workspace thingy" << std::endl;
+    for (uint32_t i = 0; i < storage_ids.size(); i++) {
+      // Here we record the largest size of the tensor
+      // that share the same storage id, because storage_id will
+      // be shared between multiple tensors that are not live simultaneously.
+      std::cout << "size_bytes is: " << size_bytes;
+      std::cout << "sid workspace thing is: " << sid_workspace[devices[i]][storage_ids[i]];
+      if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) {
+        std::cout << "UPdated sid workspace to " << size_bytes;
+        sid_workspace[devices[i]][storage_ids[i]] = size_bytes;
+      }
+    }
+  }
+
+  // This is a Map<device, workspace_size>
+  std::unordered_map<DLDeviceType, int, EnumClassHash> device_workspace;
+  // Once we know the sizes of sids, we need to accumulate per device
+  for (const auto& dev_sid_size : sid_workspace) {
+    auto dev = dev_sid_size.first;
+    device_workspace[dev] = 0;
+    for (const auto& sid_size : dev_sid_size.second) {
+      std::cout << "the sid_size is: " << sid_size.second << std::endl;
+      device_workspace[dev] += sid_size.second;
+    }
+  }
+
+  Map<Target, Integer> workspace_sizes;
+  Map<Target, Integer> io_sizes;
+  Map<Target, Integer> constant_sizes;
+  Map<Target, tir::PrimFunc> tir_primfuncs;
+  Map<Target, Function> relay_primfuncs;
+
+  // Initialize all target workspaces to zero
+  for (const auto& kv : targets) {
+    auto tgt = kv.second;
+    workspace_sizes.Set(tgt, 0);
+  }
+
+  for (const auto& dev_and_size : device_workspace) {
+    auto tgt = GetTargetFromInteger(dev_and_size.first, targets);
+    workspace_sizes.Set(tgt, dev_and_size.second);
+    relay_primfuncs.Set(tgt, func);
+  }
+  for (const auto& dev_and_size : device_io) {
+    auto tgt = GetTargetFromInteger(dev_and_size.first, targets);
+    io_sizes.Set(tgt, dev_and_size.second);
+  }
+
+  for (const auto& dev_and_size : device_consts) {
+    auto tgt = GetTargetFromInteger(dev_and_size.first, targets);
+    constant_sizes.Set(tgt, dev_and_size.second);
+  }
+
+  return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs,
+                               relay_primfuncs);
+}
+
+// TODO(@electriclilies): Is the function passed in here relay_func??
+// Also should this be inlined?
+/*!
+ * \brief A function to create the function metadata for an input function (ie calculate buffer
+ * input/output sizes)
+ * \param relay_func The function to calculate function metadata for
+ * \param function_metadata The map that stores all the function metadatas
+ */
+void UpdateFunctionMetadata(Function relay_func,
+                            Map<String, backend::FunctionInfo>& function_metadata) { // NOLINT(*)
+  // Originally UpdateFunctionMetadata took in CCachedFunc and looped through all the funcs stored
+  // there Now the goal is to take only one func because process_fn should be controlling the
+  // iteration However, to do the workspace calculations we need the primfuncs. So process_fn needs
+  // to either access the cached funcs or be directly passed primfuncs This is bad and ideally we
+  // don't want process_fn to look at primfuncs There's also the question now of what the function
+  // metadatas are and how they are used if we can do something else to replicate the behavior of
+  // the function metadatas that might be good (ie annotating functions or something).
+  Map<Target, Integer> workspace_sizes;
+  Map<Target, Integer> io_sizes;
+  Map<Target, Integer> constant_sizes;
+  Map<Target, tir::PrimFunc> tir_primfuncs;
+  Map<Target, Function> relay_primfuncs;
+
+  Optional<Map<GlobalVar, tir::PrimFunc>> prim_fns =
+      relay_func->GetAttr<Map<GlobalVar, tir::PrimFunc>>("prim_funcs");
+  CHECK(prim_fns) << "primitive functions not set on Relay function by TECompiler";
+
+  Optional<GlobalVar> prim_fn_var = relay_func->GetAttr<GlobalVar>("prim_fn_var");
+  CHECK(prim_fn_var) << "prim_fn_var must be set on Relay functions by TECompiler";
+
+  Optional<Target> relay_target = relay_func->GetAttr<Target>("target");
+  CHECK(relay_target) << "target must be set on Relay functions by the TECompiler";
+
+  for (const auto& kv : prim_fns.value()) {
+    auto prim_fn = Downcast<tir::PrimFunc>(kv.second);
+    auto workspace_byte_alignment =
+        relay_target.value()->GetAttr<Integer>("workspace_byte_alignment").value_or(16);
+
+    Integer workspace_size = CalculateWorkspaceBytes(prim_fn, workspace_byte_alignment);
+
+    // Workspace sizes
+    Target prim_fn_target;
+    if (prim_fn->attrs->dict.count("target")) {
+      prim_fn_target = Downcast<Target>(prim_fn->attrs->dict["target"]);
+    } else {
+      prim_fn_target = relay_target.value();
+    }
+
+    CHECK(prim_fn.defined()) << "must be set";
+
+    workspace_sizes.Set(prim_fn_target, workspace_size);
+
+    // Calculating size for I/O
+    for (auto const& param : prim_fn->params) {
+      auto p_shape = prim_fn->buffer_map[param]->shape;
+      int num_of_elements = 1;
+      for (const auto& dim_index_expr : p_shape) {
+        if (dim_index_expr->IsInstance<IntImmNode>()) {
+          num_of_elements *= dim_index_expr.as<IntImmNode>()->value;
+        } else {
+          // If shape is dynamic, we cannot calculate workspace in compile time.
+          num_of_elements = 0;
+        }
+      }
+      int element_size = prim_fn->buffer_map[param]->dtype.bytes();
+      io_sizes.Set(prim_fn_target, element_size * num_of_elements);
+    }
+
+    constant_sizes.Set(prim_fn_target, 0);
+    tir_primfuncs.Set(prim_fn_target, prim_fn);
+    relay_primfuncs.Set(prim_fn_target, relay_func);
+  }
+
+  backend::FunctionInfo fi = backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes,
+                                                   tir_primfuncs, relay_primfuncs);
+
+  // The primitive function name here corresponds to the string we will use to generate
+  // this Relay function at the low level.
+  std::cout << "THING: " << function_metadata << std::endl;
+  function_metadata.Set(prim_fn_var.value()->name_hint, fi);
+  std::cout << "THING AFTER: " << function_metadata << std::endl;
+}
+
+LoweredModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_context_map,
+                      std::function<void(Function)> process_fn,
+                      backend::StaticMemoryPlan memory_plan) {
+  TECompiler compiler;
+  std::cout << "LowerTE called" << std::endl;
+  CHECK_EQ(module->functions.size(), 1)
+      << "There should only be one function in the module passed to LowerTE";
+
+  AnalysisRemapping* prim_fn_to_call_map = new AnalysisRemapping;
+
+  auto pass = CreateFunctionPass(
+      [=](Function func, IRModule module, PassContext ctx) {
+        LowerTensorExpr lower_te(module, targets, device_context_map, process_fn,
+                                 prim_fn_to_call_map, compiler);
+        return Downcast<Function>(lower_te.VisitExpr(func));
+      },
+      0, "LowerTensorExpr", {});
+
+  // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize
+  backend::FunctionInfo func_info =
+      UpdateMainWorkspaceSize(module, targets, memory_plan->expr_to_storage_info);
+
+  auto updated_module = pass(module);
+  std::cout << "UPdated module" << std::endl;
+
+  LoweredModule lowered_module;
+  lowered_module.main_module = updated_module;
+  lowered_module.per_target_module = compiler->GetLoweredFunctions();
+  lowered_module.external_mods = compiler->LowerExternalFunctions();
+  lowered_module.prim_fn_to_call_map =
+      Map<Expr, Expr>(prim_fn_to_call_map->begin(), prim_fn_to_call_map->end());
+  delete prim_fn_to_call_map;
+  lowered_module.main_func_info = func_info;
+  return lowered_module;
+}
+
+}  // namespace tec
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h
new file mode 100644
index 0000000000000..4ba471f191a8d
--- /dev/null
+++ b/src/relay/backend/te_compiler.h
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/tir_compiler.h
+ *  * \brief Internal compilation layer which lowers Relay "primitive functions" to TIR PrimFns.
+ *
+ *
+ * This represents the new design of the Relay compilation flow and will replace the interface
+ * contained in compile_engine.h as we migrate towards a standard pass based lowering of
+ * Relay functions.
+ *
+ * This files provides an internal API which lowers Relay programs to components which
+ * can be combined with TVM produced kernels to compile an entire program.
+ *
+ * The result of lowering contains a combination of `runtime::Module`s produced by external
+ * compilers and a set of lowered PrimFns which can be code generated for targets.
+ */
+#ifndef TVM_RELAY_BACKEND_TE_COMPILER_H_
+#define TVM_RELAY_BACKEND_TE_COMPILER_H_
+
+#include <tvm/node/structural_equal.h>
+#include <tvm/node/structural_hash.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/memory.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op_strategy.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/module.h>
+#include <tvm/topi/elemwise.h>
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "../transforms/infer_layout_utils.h"
+#include "../transforms/pass_utils.h"
+#include "./te_compiler_cache.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+// This class is needed to avoid a GCC 5 bug that prevents maps containing enums
+// from being compiled. If i386 GCC version is increased, we can remove it.
+struct EnumClassHash {
+  template <typename T>
+  std::size_t operator()(T t) const {
+    return static_cast<std::size_t>(t);
+  }
+};
+
+// TODO(@jroesch, @chrisS) these should be a tvm::Map for uniformity sake
+// we should a version of context which works in Map
+using TargetMap = std::unordered_map<DLDeviceType, Target, EnumClassHash>;
+using DeviceMap =
+    std::unordered_map<Expr, tvm::Device, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>;
+using ProcessFn = std::function<void(Function)>;
+
+/*!
+ * \brief A compiler which lowers primitive Relay functions to tensor expressions
+ * and schdules them into TIR functions.
+ */
+class TECompilerNode : public Object {
+ public:
+  /*! \brief destructor */
+  virtual ~TECompilerNode() {}
+  /*!
+   * \brief Get lowered result.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual CachedFunc Lower(const CCacheKey& key) = 0;
+
+  /* Return all functions which have been lowered by the compiler, keyed by target. */
+  virtual Map<String, IRModule> GetLoweredFunctions() = 0;
+
+  /*!
+   * \brief Just in time compile to get a PackedFunc.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual PackedFunc JIT(const CCacheKey& key) = 0;
+  /*!
+   * \brief Lower the shape function.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual CachedFunc LowerShapeFunc(const CCacheKey& key) = 0;
+  /*!
+   * \brief Lower the external function using external codegen tools.
+   * \return The runtime moduels for each needed external codegen tool.
+   */
+  virtual tvm::Array<tvm::runtime::Module> LowerExternalFunctions() = 0;
+
+  /*! \brief clear the cache. */
+  virtual void Clear() = 0;
+
+  void VisitAttrs(AttrVisitor*) {}
+
+  static constexpr const char* _type_key = "relay.TECompiler";
+  TVM_DECLARE_FINAL_OBJECT_INFO(TECompilerNode, Object);
+};
+
+/*! \brief cache entry used in compile engine */
+class TECompiler : public ObjectRef {
+ public:
+  TECompiler();
+  explicit TECompiler(ObjectPtr<Object> n) : ObjectRef(n) {}
+  TECompilerNode* operator->() { return static_cast<TECompilerNode*>(get_mutable()); }
+  using ContainerType = TECompilerNode;
+  /*! \brief The global compile engine. */
+  TVM_DLL static TECompiler& Global();
+};
+
+/*! \brief The result of lowering a module, for now we need to pass an aggregate data structure
+ * which contains more then a single module in order to interact with the today API.
+ */
+struct LoweredModule {
+  /*! \brief The module which contains the Relay code. */
+  IRModule main_module;
+  /*! \brief The module which contains per target code. */
+  Map<String, IRModule> per_target_module;
+  /*! \brief The external runtime modules which must be combined with the lowered code. */
+  Array<tvm::runtime::Module> external_mods;
+  /*! \brief Primtive function to call node map.
+   * NB: this is a temporary workaround for storage information until we unify the hetergenous
+   * support, memory planning, and lowering.
+   */
+  Map<Expr, Expr> prim_fn_to_call_map;
+  // TOOD(@electrililies, @jroesch): Remove this fields
+  // TODO(@electriclilies): THis might need to become a map
+  /*! \brief The info for this function (not sure what a better description is??)
+   *
+   */
+  backend::FunctionInfo main_func_info;
+};
+
+/*!
+ * \brief A function to create the function metadata for an input function (ie calculate buffer
+ * input/output sizes)
+ * \param relay_func The function to calculate function metadata for
+ * \param function_metadata The map that stores all the function metadatas
+ */
+void UpdateFunctionMetadata(Function relay_func,
+                            Map<String, backend::FunctionInfo>& function_metadata); // NOLINT(*)
+
+/*!
+ * \brief Obtain the Target from the device type.
+ * If homogenous compilation, this will return the only target.
+ * If heteregenous compilation, this will select associated using the targets_ Map.
+ *
+ * \param dev_type
+ * \return Target
+ */
+Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets);
+
+/*! \brief Lower an IRModule's primitive functions to TIR.
+ *
+ * This is the "back half" of the Relay compiler which lowers "primitive functions"
+ * to TE expressions, schedules them, and then to TIR.
+ *
+ * /param module The IRModule.
+ * /param targets The mapping for devices to targets.
+ * /param device_map An analysis result mapping each sub-expression to a device.
+ * /return The lowered module, see above.
+ */
+// TODO(@electriclilies): Not sure if this default initialization is correct...
+LoweredModule LowerTE(
+    const IRModule& module, TargetMap targets, DeviceMap device_map,
+    ProcessFn process_fn = [](Function f) {}, backend::StaticMemoryPlan memory_plan = {});
+
+}  // namespace tec
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_TE_COMPILER_H_
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
new file mode 100644
index 0000000000000..bbe38f0426b4d
--- /dev/null
+++ b/src/relay/backend/te_compiler_cache.cc
@@ -0,0 +1,694 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "./te_compiler_cache.h"
+
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/te/schedule.h>
+#include <tvm/te/schedule_pass.h>
+#include <tvm/topi/tags.h>
+
+#include <functional>
+#include <limits>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../transforms/pass_utils.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+TVM_REGISTER_NODE_TYPE(LoweredOutputNode);
+TVM_REGISTER_NODE_TYPE(CachedFuncNode);
+TVM_REGISTER_NODE_TYPE(CCacheKeyNode);
+TVM_REGISTER_NODE_TYPE(CCacheValueNode);
+
+LoweredOutput::LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl) {
+  auto n = make_object<LoweredOutputNode>();
+  n->outputs = std::move(outputs);
+  n->implementation = std::move(impl);
+  data_ = std::move(n);
+}
+
+CCacheKey::CCacheKey(Function source_func, Target target) {
+  auto n = make_object<CCacheKeyNode>();
+  n->source_func = std::move(source_func);
+  n->target = std::move(target);
+  data_ = std::move(n);
+}
+
+CachedFunc::CachedFunc(tvm::Target target, GlobalVar prim_fn_var, tvm::Array<te::Tensor> inputs,
+                       tvm::Array<te::Tensor> outputs, te::Schedule schedule,
+                       tvm::Array<Integer> shape_func_param_states, IRModule funcs) {
+  auto n = make_object<CachedFuncNode>();
+  n->target = target;
+  n->prim_fn_var = prim_fn_var;
+  n->inputs = inputs;
+  n->outputs = outputs;
+  n->schedule = schedule;
+  n->shape_func_param_states = shape_func_param_states;
+  n->funcs = funcs;
+  data_ = std::move(n);
+}
+
+Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
+  // for now, we always use int32 shape when possible
+  // even if the result of shape inference becomes int64.
+  Array<IndexExpr> res;
+  for (IndexExpr val : shape) {
+    const int64_t* pval = tir::as_const_int(val);
+    if (pval != nullptr) {
+#ifndef TVM_INDEX_DEFAULT_I64
+      ICHECK_LE(pval[0], std::numeric_limits<int32_t>::max())
+          << "dimension must be less then int32_t's max value";
+      ICHECK_GE(pval[0], std::numeric_limits<int32_t>::min())
+          << "dimension must be less then int32_t's max value";
+      res.push_back(IntImm(DataType::Int(32), *pval));
+#else
+      res.push_back(val);
+#endif  // TVM_INDEX_DEFAULT_I64
+    } else if (val->IsInstance<tir::AnyNode>()) {
+      res.push_back(val.as<tir::AnyNode>()->ToVar());
+    } else {
+      res.push_back(val);
+    }
+  }
+  return res;
+}
+
+// Construct a schedule for a given Relay primitive function and target.
+class ScheduleBuilder : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
+ public:
+  explicit ScheduleBuilder(Target target)
+      : target_(target), device_copy_op_(Op::Get("device_copy")) {
+    // Whether to use auto_scheduler schedule.
+    use_auto_scheduler_ = backend::IsAutoSchedulerEnabled();
+  }
+
+  CachedFunc Create(const Function& prim_func, std::function<std::string(std::string)> renamer) {
+    Array<tvm::te::Tensor> fn_inputs;
+    for (Var param : prim_func->params) {
+      Array<tvm::te::Tensor> inputs;
+      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
+        tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype);
+        fn_inputs.push_back(tensor);
+        inputs.push_back(tensor);
+      } else {
+        // flatten tuple of tensor type.
+        const auto* tuple_type = param->type_as<TupleTypeNode>();
+        for (Type field : tuple_type->fields) {
+          const auto* ttype = field.as<TensorTypeNode>();
+          // TODO(@icemelon): Allow recursive tuple
+          ICHECK(ttype != nullptr);
+          tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype);
+          fn_inputs.push_back(tensor);
+          inputs.push_back(tensor);
+        }
+      }
+      memo_[param] = inputs;
+    }
+    readable_name_stream_ << "fused";
+    auto outputs = this->VisitExpr(prim_func->body);
+    auto candidate_name = readable_name_stream_.str();
+    constexpr static size_t kMaxFuncNameLength = 80;
+    if (candidate_name.size() > kMaxFuncNameLength) {
+      std::stringstream truncated_name;
+      truncated_name << candidate_name.substr(0, kMaxFuncNameLength);
+      truncated_name << "_" << std::hash<std::string>{}(candidate_name) << "_";
+      candidate_name = truncated_name.str();
+    }
+
+    // NB(@jroesch): unfortunately the graph runtime deals with copy in
+    // a totally hacky way, we really need to rectify this but this will
+    // have to work for now.
+    std::string prim_fn_name = candidate_name;
+    if (prim_fn_name != "__copy") {
+      prim_fn_name = renamer(prim_fn_name);
+    }
+    auto prim_fn_var = GlobalVar(prim_fn_name);
+    prim_fn_var->checked_type_ = prim_func->checked_type();
+
+    ICHECK(anchor_op_.defined());
+    // Fusion over tupled results may leave identity relationships
+    // between inputs and outputs, and those should not be scheduled.
+    // Hence schedule only non PlaceholderOp outputs.
+    tvm::Array<te::Tensor> tensor_outs;
+    for (const auto& tensor : outputs) {
+      if (!tensor->op.as<te::PlaceholderOpNode>()) {
+        tensor_outs.push_back(tensor);
+      }
+    }
+
+    te::Schedule schedule;
+    // No need to register schedule for device copy op.
+    if (anchor_attrs_.as<DeviceCopyAttrs>() == nullptr) {
+      if (use_auto_scheduler_) {
+        const auto* fauto_schedule =
+            runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute");
+        ICHECK(fauto_schedule != nullptr)
+            << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered";
+        ObjectRef obj = (*fauto_schedule)(prim_fn_name, tensor_outs);
+        if (obj.defined()) {
+          schedule = Downcast<te::Schedule>(obj);
+        }
+      }
+
+      // Use TOPI schdule if user specificed, or the function has no auto_scheduler schedule.
+      if (!schedule.defined()) {
+        ICHECK(anchor_implementation_.defined());
+        schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_);
+      }
+      for (const auto& scalar : scalars_) {
+        if (schedule->Contain(scalar)) {
+          schedule[scalar].compute_inline();
+        }
+      }
+    }
+
+    return CachedFunc(target_, prim_fn_var, fn_inputs, outputs, schedule, {});
+  }
+
+  Array<te::Tensor> VisitExpr_(const VarNode* op) final {
+    LOG(FATAL) << "Unexpected free variable " << op->name_hint();
+    return {};
+  }
+
+  Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
+    using tir::make_const;
+    ICHECK(op->is_scalar());
+    void* data = op->data->data;
+    DataType dtype = DataType(op->data->dtype);
+    auto value = te::compute(
+        {},
+        [&](const Array<tvm::tir::Var>&) {
+          if (dtype == DataType::Int(32)) {
+            return make_const(dtype, static_cast<const int32_t*>(data)[0]);
+          } else if (dtype == DataType::Int(64)) {
+            return make_const(dtype, static_cast<const int64_t*>(data)[0]);
+          } else if (dtype == DataType::Float(32)) {
+            return make_const(dtype, static_cast<const float*>(data)[0]);
+          } else if (dtype == DataType::Float(64)) {
+            return make_const(dtype, static_cast<const double*>(data)[0]);
+          } else if (dtype == DataType::Bool()) {
+            return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
+          } else {
+            LOG(FATAL) << "not handled";
+            return tvm::PrimExpr();
+          }
+        },
+        "compile_engine_const", topi::kBroadcast);
+    scalars_.push_back(value->op);
+    return {value};
+  }
+
+  Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
+    static auto fpattern = Op::GetAttrMap<TOpPattern>("TOpPattern");
+    static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call");
+    ICHECK(flower_call) << "relay.backend.lower_call is not registered.";
+
+    Array<te::Tensor> inputs;
+    int count_tuple = 0;
+    for (Expr arg : call_node->args) {
+      if (arg->checked_type().as<TupleTypeNode>()) {
+        ++count_tuple;
+      }
+      for (te::Tensor tensor : VisitExpr(arg)) {
+        inputs.push_back(tensor);
+      }
+    }
+
+    if (count_tuple) {
+      ICHECK_EQ(call_node->args.size(), 1U)
+          << "Only functions with a single tuple input are allowed, but " << count_tuple
+          << " were provided.";
+    }
+
+    ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
+    Op op = Downcast<Op>(call_node->op);
+
+    Array<te::Tensor> outputs;
+    OpImplementation impl;
+    // Skip fcompute for device copy operators as it is not registered.
+    if (op == device_copy_op_) {
+      const auto* copy_input = inputs[0].operator->();
+      outputs.push_back(te::Tensor(copy_input->shape, copy_input->dtype, te::Operation(), 0));
+    } else {
+      LoweredOutput lowered_out = (*flower_call)(GetRef<Call>(call_node), inputs, target_);
+      outputs = lowered_out->outputs;
+      impl = lowered_out->implementation;
+    }
+
+    int op_pattern = fpattern[op];
+    if (!use_auto_scheduler_ && op_pattern >= kCommReduce) {
+      ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce)
+          << "Cannot apply TOPI schedule to a primitive function with two complicated ops"
+          << " anchor=" << anchor_op_ << " current=" << op;
+    }
+    if (op_pattern >= anchor_op_pattern_) {
+      anchor_op_ = op;
+      anchor_attrs_ = call_node->attrs;
+      anchor_op_pattern_ = op_pattern;
+      anchor_implementation_ = impl;
+    }
+    if (outputs.size() != 1) {
+      const auto* tuple_type = call_node->checked_type().as<TupleTypeNode>();
+      ICHECK(tuple_type) << "Expected output to be a tuple type "
+                         << PrettyPrint(call_node->checked_type());
+
+      ICHECK_EQ(tuple_type->fields.size(), outputs.size());
+    }
+    // Set the name to `__copy`. It will be detected in graph runtime to perform
+    // data copy across devices.
+    if (op == device_copy_op_) {
+      readable_name_stream_.str(std::string());
+      readable_name_stream_ << "__copy";
+    } else {
+      readable_name_stream_ << '_' << op->name;
+    }
+    return outputs;
+  }
+
+  Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
+    LOG(FATAL) << "Primitive Functions can not contain nested functions.";
+    return Array<te::Tensor>();
+  }
+
+  Array<te::Tensor> VisitExpr_(const LetNode* op) final {
+    Array<te::Tensor> val = VisitExpr(op->value);
+    ICHECK(!memo_.count(op->var));
+    memo_[op->var] = val;
+    return VisitExpr(op->body);
+  }
+
+  Array<te::Tensor> VisitExpr_(const TupleNode* op) final {
+    Array<te::Tensor> fields;
+    for (Expr field : op->fields) {
+      ICHECK(field->checked_type().as<TensorTypeNode>()) << "Only allow Tuple of Tensor";
+      Array<te::Tensor> res = VisitExpr(field);
+      ICHECK_EQ(res.size(), 1);
+      fields.push_back(res[0]);
+    }
+    return fields;
+  }
+
+  Array<te::Tensor> VisitExpr_(const TupleGetItemNode* op) final {
+    const auto* tuple_type = op->tuple->type_as<TupleTypeNode>();
+    Array<te::Tensor> tuple = VisitExpr(op->tuple);
+    ICHECK_EQ(tuple_type->fields.size(), tuple.size());
+    ICHECK_GE(op->index, 0);
+    ICHECK_LT(static_cast<size_t>(op->index), tuple.size());
+    return {tuple[op->index]};
+  }
+
+ private:
+  tvm::Target target_;
+  Op anchor_op_;
+  Attrs anchor_attrs_;
+  int anchor_op_pattern_{0};
+  OpImplementation anchor_implementation_;
+  std::ostringstream readable_name_stream_;
+  Array<te::Operation> scalars_;
+  bool use_auto_scheduler_;
+  // Cache device copy op for equivalence checking to reduce registry lookup
+  // overhead for each invocation of call node when retrieving schedules.
+  const Op& device_copy_op_;
+};
+
+/*!
+ * \brief Create schedule for target.
+ * \param source_func The primitive function to be lowered.
+ * \param target The target we want to create schedule for.
+ * \return Pair of schedule and cache.
+ *  The funcs field in cache is not yet populated.
+ */
+CachedFunc PrimFuncFor(const Function& source_func, const Target& target,
+                       std::function<std::string(std::string)> renamer) {
+  return ScheduleBuilder(target).Create(source_func, renamer);
+}
+
+// Creates shape function from functor.
+class MakeShapeFunc : public backend::MemoizedExprTranslator<Array<te::Tensor>> {
+ public:
+  MakeShapeFunc() {}
+
+  CachedFunc Create(const Function& prim_func, const Target& target,
+                    std::function<std::string(std::string)> renamer) {
+    Array<te::Tensor> inputs;
+    TShapeDataDependent shape_func_param_states;
+
+    for (auto param : prim_func->params) {
+      param_states_[param] = kNoNeed;
+      Array<tvm::te::Tensor> data_inputs;
+      Array<tvm::te::Tensor> shape_inputs;
+
+      auto add_placeholder = [&data_inputs, &shape_inputs](const TensorTypeNode* ttype) {
+        // Add data placeholder
+        Shape shape = GetShape(ttype->shape);
+        tvm::te::Tensor data_tensor = tvm::te::placeholder(shape, ttype->dtype);
+        data_inputs.push_back(data_tensor);
+        // Add shape placeholder
+        int64_t ndim = shape.size();
+        Shape sshape;
+        if (ndim > 0) {
+          sshape.push_back(tvm::Integer(ndim));
+        }
+        tvm::te::Tensor shape_tensor = tvm::te::placeholder(sshape, DataType::Int(64));
+        shape_inputs.push_back(shape_tensor);
+      };
+
+      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
+        add_placeholder(ttype);
+      } else {
+        // flatten tuple of tensor type.
+        const auto* tuple_type = param->type_as<TupleTypeNode>();
+        // TODO(@icemelon): Support recursive tuple
+        ICHECK(tuple_type);
+        for (Type field : tuple_type->fields) {
+          const auto* ttype = field.as<TensorTypeNode>();
+          ICHECK(ttype);
+          add_placeholder(ttype);
+        }
+      }
+      param_data_[param] = data_inputs;
+      param_shapes_[param] = shape_inputs;
+    }
+
+    // Setup the name;
+    readable_name_stream_ << "shape_func";
+
+    // Create the `te::Tensor`s which represent the output.
+    auto outputs = VisitExpr(prim_func->body);
+
+    // Generate a name.
+    auto candidate_name = readable_name_stream_.str();
+    constexpr static size_t kMaxFuncNameLength = 80;
+    if (candidate_name.size() > kMaxFuncNameLength) {
+      std::stringstream truncated_name;
+      truncated_name << candidate_name.substr(0, kMaxFuncNameLength);
+      truncated_name << "_" << std::hash<std::string>{}(candidate_name) << "_";
+      candidate_name = truncated_name.str();
+    }
+
+    // Set all the inputs correctly.
+    for (auto param : prim_func->params) {
+      int state = param_states_[param];
+      shape_func_param_states.push_back(IntImm(DataType::Int(32), state));
+      if (state & kNeedInputData) {
+        for (auto t : param_data_[param]) {
+          inputs.push_back(t);
+        }
+      }
+      if (state & kNeedInputShape) {
+        for (auto t : param_shapes_[param]) {
+          inputs.push_back(t);
+        }
+      }
+    }
+
+    auto func_name = renamer(candidate_name);
+    auto prim_fn_gvar = GlobalVar(func_name);
+    prim_fn_gvar->checked_type_ = prim_func->checked_type();
+
+    // generate schedule for shape func
+    Array<te::Operation> out_ops;
+    for (auto t : outputs) {
+      out_ops.push_back(t->op);
+    }
+    auto schedule = te::create_schedule(out_ops);
+    tvm::te::AutoInlineInjective(schedule);
+    for (const auto& scalar : scalars_) {
+      auto scalar_op = scalar->op;
+      if (schedule->Contain(scalar_op)) {
+        schedule[scalar_op].compute_inline();
+      }
+    }
+
+    Array<te::Tensor> all_args = Array<te::Tensor>(inputs);
+    for (te::Tensor arg : outputs) {
+      all_args.push_back(arg);
+    }
+
+    using tvm::transform::PassContext;
+    With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
+
+    std::unordered_map<te::Tensor, tir::Buffer> binds;
+    IRModule ir_module = tvm::LowerSchedule(schedule, all_args, func_name, binds);
+
+    return CachedFunc(target, prim_fn_gvar, inputs, outputs, schedule, shape_func_param_states,
+                      ir_module);
+  }
+
+  Array<te::Tensor> VisitExpr(const Expr& expr) final {
+    if (expr.as<VarNode>()) {
+      // Do not memoize vars because shape functions could use either the data
+      // or the shape of a var each time.
+      return ExprFunctor::VisitExpr(expr);
+    }
+    // For other case, do memoized visit
+    return backend::MemoizedExprTranslator<Array<te::Tensor>>::VisitExpr(expr);
+  }
+
+  Array<te::Tensor> VisitExpr_(const VarNode* var_node) final {
+    auto var = GetRef<Var>(var_node);
+    auto it = param_states_.find(var);
+    if (it == param_states_.end()) {
+      LOG(FATAL) << "Unexpected free variable " << var->name_hint();
+      return {};
+    } else {
+      ICHECK(data_dependents_per_input_.size());
+      auto data_dependent = data_dependents_per_input_.back();
+      if (data_dependent) {
+        param_states_[var] |= kNeedInputData;
+        return param_data_[var];
+      } else {
+        param_states_[var] |= kNeedInputShape;
+        return param_shapes_[var];
+      }
+    }
+  }
+
+  Array<te::Tensor> VisitExpr_(const ConstantNode* op) final {
+    using tir::make_const;
+    ICHECK(data_dependents_per_input_.size());
+    bool data_dependent = data_dependents_per_input_.back();
+    if (!op->is_scalar()) {
+      // This is a constant weight, extract the shape of the weight tensor.
+      // This can not be data dependent.
+      CHECK(!data_dependent);
+      auto ttype = op->checked_type().as<TensorTypeNode>();
+      int ndim = static_cast<int>(ttype->shape.size());
+      Array<PrimExpr> out_shape{ndim};
+      te::Tensor value = tvm::te::compute(
+          out_shape,
+          [&](const Array<tvm::tir::Var>& indices) {
+            auto idx = indices[0];
+            PrimExpr ret = make_const(DataType::Int(64), 0);
+            for (int i = 0; i < ndim; i++) {
+              ret = tvm::if_then_else(idx == i, ttype->shape[i], ret);
+            }
+            return ret;
+          },
+          "shape_const", topi::kBroadcast);
+      scalars_.push_back(value);
+      return {value};
+    }
+    if (data_dependent) {
+      void* data = op->data->data;
+      DataType dtype = DataType(op->data->dtype);
+      auto value = tvm::te::compute(
+          {},
+          [&](const Array<tvm::tir::Var>&) {
+            if (dtype == DataType::Int(32)) {
+              return make_const(dtype, static_cast<const int32_t*>(data)[0]);
+            } else if (dtype == DataType::Int(64)) {
+              return make_const(dtype, static_cast<const int64_t*>(data)[0]);
+            } else if (dtype == DataType::Float(32)) {
+              return make_const(dtype, static_cast<const float*>(data)[0]);
+            } else if (dtype == DataType::Float(64)) {
+              return make_const(dtype, static_cast<const double*>(data)[0]);
+            } else if (dtype == DataType::Bool()) {
+              return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
+            } else {
+              LOG(FATAL) << "not handled";
+              return tvm::PrimExpr();
+            }
+          },
+          "data_const", topi::kBroadcast);
+      scalars_.push_back(value);
+      return {value};
+    } else {
+      auto value = tvm::te::compute(
+          {}, [&](const Array<tvm::tir::Var>&) { return tir::make_const(DataType::Int(64), 0); },
+          "shape_const", topi::kBroadcast);
+      scalars_.push_back(value);
+      return {value};
+    }
+  }
+
+  Array<te::Tensor> VisitExpr_(const CallNode* call_node) final {
+    static auto fshape_func = Op::GetAttrMap<FShapeFunc>("FShapeFunc");
+    static auto tshape_data_dependent = Op::GetAttrMap<TShapeDataDependent>("TShapeDataDependent");
+    ICHECK(call_node->op.as<OpNode>()) << "Primitive function only allows call into primitive ops";
+    Op op = Downcast<Op>(call_node->op);
+    ICHECK(data_dependents_per_input_.empty() || !data_dependents_per_input_.back())
+        << "Error in op fusion: output of the shape func is fed to a "
+        << "data-dependent shape func";
+    ICHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name;
+    ICHECK_GT(tshape_data_dependent.count(op), 0)
+        << "Internal error, cannot find TShapeDataDependent for " << op->name;
+
+    Array<Integer> dep_spec = tshape_data_dependent[op];
+    if (dep_spec.size() == 1) {
+      // This is for cases when data dependence is specified per op
+      // Replicate 0 or 1 flag to all arguments
+      for (size_t i = 1; i < call_node->args.size(); ++i) {
+        dep_spec.push_back(dep_spec[0]);
+      }
+    }
+
+    // Visit all inputs
+    Array<te::Tensor> inputs;
+    int count_tuple = 0;
+    for (size_t i = 0; i < call_node->args.size(); ++i) {
+      Expr arg = call_node->args[i];
+      if (arg->checked_type().as<TupleTypeNode>()) {
+        ++count_tuple;
+      }
+      data_dependents_per_input_.push_back(dep_spec[i]->value != 0);
+      for (te::Tensor tensor : VisitExpr(arg)) {
+        inputs.push_back(tensor);
+      }
+      data_dependents_per_input_.pop_back();
+    }
+    if (count_tuple) {
+      ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input";
+    }
+    // Get output ndims
+    auto ret_type = call_node->checked_type();
+    Array<IndexExpr> out_ndims;
+    if (const auto* ttype = ret_type.as<TensorTypeNode>()) {
+      out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
+    } else {
+      auto rtype = ret_type.as<TupleTypeNode>();
+      // TODO(@icemelon): Allow recursive tuple
+      ICHECK(rtype);
+      for (size_t i = 0; i < rtype->fields.size(); ++i) {
+        auto ttype = rtype->fields[i].as<TensorTypeNode>();
+        ICHECK(ttype);
+        out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size()));
+      }
+    }
+    // Call shape function
+    auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims);
+    readable_name_stream_ << "_" << op->name;
+    return outputs;
+  }
+
+  Array<te::Tensor> VisitExpr_(const FunctionNode* op) final {
+    LOG(FATAL) << "Do not support sub function";
+    return Array<te::Tensor>();
+  }
+
+  Array<te::Tensor> VisitExpr_(const LetNode* op) final {
+    Array<te::Tensor> val = VisitExpr(op->value);
+    ICHECK(!memo_.count(op->var));
+    memo_[op->var] = val;
+    return VisitExpr(op->body);
+  }
+
+  Array<te::Tensor> VisitExpr_(const TupleNode* op) final {
+    Array<te::Tensor> fields;
+    for (Expr field : op->fields) {
+      ICHECK(field->checked_type().as<TensorTypeNode>())
+          << "Expected a Tuple of Tensor, but got " << PrettyPrint(field->checked_type());
+      Array<te::Tensor> res = VisitExpr(field);
+      ICHECK_EQ(res.size(), 1);
+      fields.push_back(res[0]);
+    }
+    return fields;
+  }
+
+  Array<te::Tensor> VisitExpr_(const TupleGetItemNode* op) final {
+    Array<te::Tensor> input_shapes = VisitExpr(op->tuple);
+    Array<te::Tensor> out;
+    out.push_back(input_shapes[op->index]);
+    return out;
+  }
+
+ private:
+  /*! \brief String stream for function name */
+  std::ostringstream readable_name_stream_;
+  /*! \brief Map from parameter to its shape function usage state */
+  std::unordered_map<Expr, int, ObjectPtrHash, ObjectPtrEqual> param_states_;
+  /*! \brief Map from parameter to list of data placeholder */
+  std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_data_;
+  /*! \brief Map from parameter to list of shape placeholder */
+  std::unordered_map<Expr, Array<te::Tensor>, ObjectPtrHash, ObjectPtrEqual> param_shapes_;
+  /*! \brief Stack of data dependencies for shape function, specified per each op input */
+  std::vector<bool> data_dependents_per_input_;
+  /*! \brief Scalars used in the shape function */
+  Array<te::Tensor> scalars_;
+};
+
+CachedFunc ShapeFuncFor(const Function& prim_func, const Target& target,
+                        std::function<std::string(std::string)> renamer) {
+  return MakeShapeFunc().Create(prim_func, target, renamer);
+}
+
+/*!
+ * \brief Get unique name from name.
+ * \param name The orginal name.
+ * \return Updated name which is unique.
+ */
+std::string GetUniqueName(std::string name, std::unordered_map<std::string, int>* name_map_) {
+  for (size_t i = 0; i < name.length(); ++i) {
+    if (name[i] == '.') name[i] = '_';
+  }
+  while (true) {
+    auto it = name_map_->find(name);
+    if (it == name_map_->end()) {
+      (*name_map_)[name] = 1;
+      return name;
+    } else {
+      std::ostringstream os;
+      os << name << "_" << it->second;
+      ++(it->second);
+      name = os.str();
+    }
+  }
+  return name;
+}
+
+}  // namespace tec
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/te_compiler_cache.h b/src/relay/backend/te_compiler_cache.h
new file mode 100644
index 0000000000000..1c7511ffd7d26
--- /dev/null
+++ b/src/relay/backend/te_compiler_cache.h
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/tec_compiler_cache.h
+ * \brief Utilities for compiling tensor expressions inside of the Relay compiler.
+ */
+#ifndef TVM_RELAY_BACKEND_TE_COMPILER_CACHE_H_
+#define TVM_RELAY_BACKEND_TE_COMPILER_CACHE_H_
+
+#include <tvm/node/structural_equal.h>
+#include <tvm/node/structural_hash.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/memory.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op_strategy.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/module.h>
+#include <tvm/topi/elemwise.h>
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+
+#include "../transforms/infer_layout_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+/*! \brief Indicate whether the data or shape or both of a parameter is used in the shape func. */
+enum ShapeFuncParamState {
+  kNoNeed = 0,
+  kNeedInputData = 1,
+  kNeedInputShape = 2,
+  kNeedBoth = 3,
+};
+
+struct LoweredOutputNode : public Object {
+  /*! \brief The outputs to the function */
+  tvm::Array<te::Tensor> outputs;
+  /*! \brief The implementation used to compute the output */
+  OpImplementation implementation;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("outputs", &outputs);
+    v->Visit("implementation", &implementation);
+  }
+
+  static constexpr const char* _type_key = "relay.LoweredOutput";
+  TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object);
+};
+
+class LoweredOutput : public ObjectRef {
+ public:
+  TVM_DLL LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl);
+
+  TVM_DEFINE_OBJECT_REF_METHODS(LoweredOutput, ObjectRef, LoweredOutputNode);
+};
+
+class CCacheKey;
+/*! \brief Compile cache key */
+class CCacheKeyNode : public Object {
+ public:
+  /*! \brief The source function to be lowered. */
+  Function source_func;
+  /*! \brief The hardware target.*/
+  Target target;
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("source_func", &source_func);
+    v->Visit("target", &target);
+  }
+  /*! \return The hash value of CCacheKey. */
+  inline size_t Hash() const;
+  /*!
+   * \brief check content equality
+   * \param other The other value.
+   * \return The result of equality check.
+   */
+  inline bool Equal(const CCacheKeyNode* other) const;
+
+  static constexpr const char* _type_key = "relay.CCacheKey";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CCacheKeyNode, tvm::Object);
+
+ private:
+  /*!
+   * \brief internal cached hash value.
+   */
+  mutable size_t hash_{0};
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheKey : public ObjectRef {
+ public:
+  CCacheKey() {}
+  explicit CCacheKey(ObjectPtr<Object> n) : ObjectRef(n) {}
+
+  /*!
+   * \brief The constructor
+   * \param source_func The source function.
+   * \param target The target device.
+   */
+  TVM_DLL CCacheKey(Function source_func, Target target);
+
+  const CCacheKeyNode* operator->() const { return static_cast<const CCacheKeyNode*>(get()); }
+  // comparator
+  inline bool operator==(const CCacheKey& other) const {
+    ICHECK(defined() && other.defined());
+    return (*this)->Equal(other.operator->());
+  }
+  using ContainerType = CCacheKeyNode;
+};
+
+/*! \brief Node container to represent a cached function. */
+struct CachedFuncNode : public Object {
+  /* \brief compiled target */
+  tvm::Target target;
+  /*! \brief Primitive Function Name */
+  GlobalVar prim_fn_var;
+  /* \brief The inputs to the function */
+  tvm::Array<te::Tensor> inputs;
+  /* \brief The outputs to the function */
+  tvm::Array<te::Tensor> outputs;
+  /*! \brief The schedule to the function */
+  te::Schedule schedule;
+  /*! \brief Parameter usage states in the shape function. */
+  tvm::Array<Integer> shape_func_param_states;
+  /*! \brief The lowered functions to support the function. */
+  IRModule funcs = IRModule(Map<GlobalVar, BaseFunc>({}));
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("target", &target);
+    v->Visit("prim_fn_var", &prim_fn_var);
+    v->Visit("inputs", &inputs);
+    v->Visit("outputs", &outputs);
+    v->Visit("schedule", &schedule);
+    v->Visit("funcs", &funcs);
+    v->Visit("shape_func_param_states", &shape_func_param_states);
+  }
+
+  static constexpr const char* _type_key = "relay.CachedFunc";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CachedFuncNode, Object);
+};
+
+class CachedFunc : public ObjectRef {
+ public:
+  CachedFunc(tvm::Target target, GlobalVar prim_fn_name, tvm::Array<te::Tensor> inputs,
+             tvm::Array<te::Tensor> outputs, te::Schedule schedule,
+             tvm::Array<Integer> shape_func_param_states,
+             IRModule funcs = IRModule(Map<GlobalVar, BaseFunc>({})));
+
+ public:
+  TVM_DEFINE_OBJECT_REF_METHODS(CachedFunc, ObjectRef, CachedFuncNode);
+};
+
+/*! \brief Node container for compile cache. */
+class CCacheValueNode : public Object {
+ public:
+  /*! \brief The corresponding function */
+  CachedFunc cached_func;
+  /*! \brief Result of Packed function generated by JIT */
+  PackedFunc packed_func;
+  /*! \brief usage statistics */
+  int use_count{0};
+
+  void VisitAttrs(tvm::AttrVisitor* v) {
+    v->Visit("cached_func", &cached_func);
+    v->Visit("use_count", &use_count);
+  }
+  static constexpr const char* _type_key = "relay.CCacheValue";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CCacheValueNode, tvm::Object);
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheValue : public ObjectRef {
+ public:
+  CCacheValue() {}
+  explicit CCacheValue(ObjectPtr<Object> n) : ObjectRef(n) {}
+  CCacheValueNode* operator->() { return static_cast<CCacheValueNode*>(get_mutable()); }
+  const CCacheValueNode* operator->() const { return static_cast<const CCacheValueNode*>(get()); }
+  using ContainerType = CCacheValueNode;
+};
+
+Array<IndexExpr> GetShape(const Array<IndexExpr>& shape);
+
+/*!
+ * \brief Create schedule for target.
+ * \param source_func The primitive function to be lowered.
+ * \param target The target we want to create schedule for.
+ * \return Pair of schedule and cache.
+ *  The funcs field in cache is not yet populated.
+ */
+CachedFunc PrimFuncFor(const Function& source_func, const Target& target,
+                       std::function<std::string(std::string)> renamer);
+
+CachedFunc ShapeFuncFor(const Function& prim_func, const Target& target,
+                        std::function<std::string(std::string)> renamer);
+
+std::string GetUniqueName(std::string name, std::unordered_map<std::string, int>* name_map);
+
+// implementations
+inline size_t CCacheKeyNode::Hash() const {
+  if (hash_ != 0) return hash_;
+  // do structral hash, avoid 0.
+  hash_ = tvm::StructuralHash()(this->source_func);
+  hash_ = dmlc::HashCombine(hash_, std::hash<std::string>()(target->str()));
+  if (hash_ == 0) hash_ = 1;
+  return hash_;
+}
+
+inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const {
+  if (Hash() != other->Hash()) return false;
+  return this->target->str() == other->target->str() &&
+         tvm::StructuralEqual()(this->source_func, other->source_func);
+}
+
+}  // namespace tec
+}  // namespace relay
+}  // namespace tvm
+
+namespace std {
+// overload hash
+template <>
+struct hash<::tvm::relay::tec::CCacheKey> {
+  size_t operator()(const ::tvm::relay::tec::CCacheKey& key) const {
+    ICHECK(key.defined());
+    return key->Hash();
+  }
+};
+}  // namespace std
+
+#endif  // TVM_RELAY_BACKEND_TE_COMPILER_CACHE_H_
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index 3ea15438fe8fa..f0c543f1244b4 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -39,6 +39,30 @@ StorageInfo::StorageInfo(std::vector<int64_t> storage_ids, std::vector<DLDeviceT
   data_ = std::move(n);
 }
 
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageIds").set_body_typed([](StorageInfo si) {
+  Array<tvm::Integer> ids;
+  for (auto id : si->storage_ids) {
+    ids.push_back(id);
+  }
+  return ids;
+});
+
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoDeviceTypes").set_body_typed([](StorageInfo si) {
+  Array<tvm::Integer> device_types;
+  for (auto id : si->device_types) {
+    device_types.push_back(id);
+  }
+  return device_types;
+});
+
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageSizes").set_body_typed([](StorageInfo si) {
+  Array<tvm::Integer> storage_sizes_in_bytes;
+  for (auto id : si->storage_sizes_in_bytes) {
+    storage_sizes_in_bytes.push_back(id);
+  }
+  return storage_sizes_in_bytes;
+});
+
 TVM_REGISTER_NODE_TYPE(StaticMemoryPlanNode);
 
 StaticMemoryPlan::StaticMemoryPlan(Map<Expr, StorageInfo> expr_to_storage_info) {
@@ -73,6 +97,29 @@ int64_t CalculateRelayExprSizeBytes(const Type& expr_type) {
 
 TVM_REGISTER_NODE_TYPE(FunctionInfoNode);
 
+FunctionInfo::FunctionInfo(Map<Target, Integer> workspace_sizes, Map<Target, Integer> io_sizes,
+                           Map<Target, Integer> constant_sizes,
+                           Map<Target, tir::PrimFunc> tir_primfuncs,
+                           Map<Target, Function> relay_primfuncs) {
+  ObjectPtr<FunctionInfoNode> n = make_object<FunctionInfoNode>();
+  n->workspace_sizes = std::move(workspace_sizes);
+  n->io_sizes = std::move(io_sizes);
+  n->constant_sizes = std::move(constant_sizes);
+  n->tir_primfuncs = std::move(tir_primfuncs);
+  n->relay_primfuncs = std::move(relay_primfuncs);
+  data_ = std::move(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+    .set_dispatch<FunctionInfoNode>([](const ObjectRef& ref, ReprPrinter* p) {
+      auto* node = static_cast<const FunctionInfoNode*>(ref.get());
+      p->stream << "FunctionInfoNode(\n"
+                << "workspace_sizes=" << node->workspace_sizes << ",\n  io_sizes=" << node->io_sizes
+                << ",\n  constant_sizes=" << node->constant_sizes
+                << ",\n  tir_primfuncs=" << node->tir_primfuncs
+                << ",\n  relay_primfuncs=" << node->relay_primfuncs << ")";
+    });
+
 }  // namespace backend
 }  // namespace relay
 }  // namespace tvm
diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h
index 7d7f026c298e8..9294ff4f77950 100644
--- a/src/relay/backend/utils.h
+++ b/src/relay/backend/utils.h
@@ -114,6 +114,10 @@ struct FunctionInfoNode : public Object {
 
 class FunctionInfo : public ObjectRef {
  public:
+  FunctionInfo(Map<Target, Integer> workspace_sizes, Map<Target, Integer> io_sizes,
+               Map<Target, Integer> constant_sizes, Map<Target, tir::PrimFunc> tir_primfuncs,
+               Map<Target, Function> relay_primfuncs);
+
   TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(FunctionInfo, ObjectRef, FunctionInfoNode);
 };
 
@@ -132,11 +136,65 @@ struct LoweredOutput {
   std::string graph_json;
   Map<String, IRModule> lowered_funcs;
   Array<tvm::runtime::Module> external_mods;
-  Map<String, FunctionInfo> function_metadata;
+  Map<String, FunctionInfo>
+      function_metadata;  // TODO(@electriclilies): Why is this a map? seems like it can only every
+                          // have 1 function info in it.
   std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> params;
   runtime::Metadata metadata;
 };
 
+/*!
+ * \brief The static storage information produced by memory planning.
+ */
+class StorageInfoNode : public Object {
+ public:
+  /*! \brief The set of storage ids where the expression is stored. */
+  std::vector<int64_t> storage_ids;
+  /* \brief The type of "virtual devices" these expressions are stored on. */
+  std::vector<DLDeviceType> device_types;
+  /* \brief The sizes of each storage element. */
+  std::vector<int64_t> storage_sizes_in_bytes;
+
+  // TODO(@jroesch): expose the fields
+  void VisitAttrs(AttrVisitor* v) {}
+
+  static constexpr const char* _type_key = "relay.StorageInfo";
+  TVM_DECLARE_FINAL_OBJECT_INFO(StorageInfoNode, Object);
+};
+
+/*! \brief The storage information for a single expression. */
+class StorageInfo : public ObjectRef {
+ public:
+  StorageInfo(std::vector<int64_t> storage_ids, std::vector<DLDeviceType> device_types,
+              std::vector<int64_t> storage_sizes_in_bytes);
+  TVM_DEFINE_OBJECT_REF_METHODS(StorageInfo, ObjectRef, StorageInfoNode);
+};
+
+/*!
+ * \brief The result of static memory planning.
+ */
+class StaticMemoryPlanNode : public Object {
+ public:
+  Map<Expr, StorageInfo> expr_to_storage_info;
+
+  void VisitAttrs(AttrVisitor* v) { v->Visit("expr_to_storage_info", &expr_to_storage_info); }
+
+  static constexpr const char* _type_key = "relay.StaticMemoryPlan";
+  TVM_DECLARE_FINAL_OBJECT_INFO(StaticMemoryPlanNode, Object);
+};
+
+/*! \brief The result of running static memory planning. */
+class StaticMemoryPlan : public ObjectRef {
+ public:
+  explicit StaticMemoryPlan(Map<Expr, StorageInfo> expr_to_storage_info);
+  TVM_DEFINE_OBJECT_REF_METHODS(StaticMemoryPlan, ObjectRef, StaticMemoryPlanNode);
+};
+
+/*!
+ * \brief A helper to plan the graph memory
+ */
+// StaticMemoryPlan GraphPlanMemory(const Function& func);
+
 /*!
  * \brief A helper to expand the params by adding the ones used in a given expression.
  */
@@ -188,6 +246,21 @@ inline void UpdateConstants(Function func,
   }
 }
 
+/*!
+ * \brief A function to update the function metadata with the input and output buffer sizes.
+ * \param func The function whose metadata we need to create
+ * \param metadata The map from function name to metadata, where we'll store the metadata we create
+ */
+inline void UpdateFunctionMetadata(Function func,
+                            Map<String, FunctionInfo> function_metadata) {
+
+  tir::PrimFunc primfunc = Downcast<tir::PrimFunc>(func);
+  auto workspace_byte_alignment =
+          target_host_->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
+
+}
+
+
 /*!
  * \brief A simple wrapper around ExprFunctor for a single argument case.
  *  The result of visit is memoized.
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index c50f2f65f949e..96aa77f286a9a 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -978,7 +978,7 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe
   // update primitive function map
   size_t primitive_index = 0;
   for (const auto& cfunc : context_.cached_funcs) {
-    exec_->primitive_map.insert({cfunc->func_name, primitive_index++});
+    exec_->primitive_map.insert({cfunc->prim_fn_var->name_hint, primitive_index++});
   }
 }
 
@@ -1173,8 +1173,9 @@ void VMCompiler::Codegen() {
 
     if (target->kind->device_type == kDLExtDev) {
       // Collect metadata in functions that are handled by external codegen.
-      ICHECK(mod->ContainGlobalVar(cfunc->func_name));
-      Function func = Downcast<Function>(mod->Lookup(cfunc->func_name));
+      auto name = cfunc->prim_fn_var->name_hint;
+      ICHECK(mod->ContainGlobalVar(name));
+      Function func = Downcast<Function>(mod->Lookup(name));
       backend::UpdateConstants(func, &params_);
     } else if (funcs.count(target) == 0) {
       funcs.Set(target, mod);
diff --git a/src/relay/ir/function.cc b/src/relay/ir/function.cc
index c9920a621b56f..83ac55fce085b 100644
--- a/src/relay/ir/function.cc
+++ b/src/relay/ir/function.cc
@@ -62,9 +62,17 @@ TVM_REGISTER_GLOBAL("relay.ir.Function")
 
 TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
     .set_dispatch<FunctionNode>([](const ObjectRef& ref, ReprPrinter* p) {
-      auto* node = static_cast<const FunctionNode*>(ref.get());
-      p->stream << "FunctionNode(" << node->params << ", " << node->ret_type << ", " << node->body
-                << ", " << node->type_params << ", " << node->attrs << ")";
+      // TODO(@jroesch): previously this had a debug printer, the debug printer
+      // can cause exponential behavior and is currently dangerous, for these
+      // cases we need some kind of de-duping.
+      //
+      // See old implementation:
+      //
+      // auto* node = static_cast<const FunctionNode*>(ref.get());
+      // p->stream << "FunctionNode(" << node->params << ", " << node->ret_type << ", " <<
+      // node->body
+      //           << ", " << node->type_params << ", " << node->attrs << ")";
+      p->stream << PrettyPrint(ref);
     });
 
 }  // namespace relay
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
index edc4119ce859c..d5c03b113dc3a 100644
--- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
@@ -124,7 +124,7 @@ Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) {
       CHECK(f) << "Could not find auto_scheduler.enter_layout_rewrite function.";
       (*f)();
 
-      CreateSchedule(GetRef<Function>(func), Target::Current());
+      PrimFuncFor(GetRef<Function>(func), Target::Current(), [](std::string name) { return name; });
 
       f = runtime::Registry::Get("auto_scheduler.exit_layout_rewrite");
       CHECK(f) << "Could not find ansor.exit_layout_rewrite function.";
diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc
index 03473b7d7455f..a4d26c2b7a4fd 100644
--- a/src/relay/transforms/memory_alloc.cc
+++ b/src/relay/transforms/memory_alloc.cc
@@ -43,6 +43,7 @@
 #include "../backend/compile_engine.h"
 #include "../op/memory/memory.h"
 #include "../op/vm/vm.h"
+#include "./pass_utils.h"
 #include "let_list.h"
 #include "pattern_utils.h"
 
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index 4c60137924260..f29087dcc0493 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -205,8 +205,13 @@ class TypeInferencer : private ExprFunctor<Type(const Expr&)>,
       this->EmitFatal(Diagnostic::Error(op->span) << "Cannot do type inference on global variables "
                                                   << "without a module");
     }
-    relay::Function e = Downcast<Function>(mod_->Lookup(var));
-    return e->checked_type();
+
+    if (mod_->ContainGlobalVar(var->name_hint)) {
+      relay::Function e = Downcast<Function>(mod_->Lookup(var));
+      return e->checked_type();
+    } else {
+      return op->checked_type_;
+    }
   }
 
   Type VisitExpr_(const ConstantNode* op) final { return op->tensor_type(); }
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index 1084b4ee3ec49..65974986e54d3 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -415,6 +415,7 @@ GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector<DLTensor>&
     }
   }
 
+  std::cout << "Executing: " << param.func_name << std::endl;
   if (param.func_name == "__nop") {
     return {[]() {}, arg_ptr};
   } else if (param.func_name == "__copy") {
@@ -423,6 +424,8 @@ GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector<DLTensor>&
     auto fexec = [arg_ptr]() {
       DLTensor* from = static_cast<DLTensor*>(arg_ptr->arg_values[0].v_handle);
       DLTensor* to = static_cast<DLTensor*>(arg_ptr->arg_values[1].v_handle);
+      std::cout << "from: " << from->device.device_type << "to: " << to->device.device_type
+                << std::endl;
       TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr));
     };
     return {fexec, arg_ptr};
diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc
index 24fb3dc958198..9ece234b4444f 100644
--- a/src/target/llvm/llvm_module.cc
+++ b/src/target/llvm/llvm_module.cc
@@ -223,8 +223,12 @@ class LLVMModuleNode final : public runtime::ModuleNode {
         found_linked_params = true;
         continue;
       }
-      ICHECK(kv.second->IsInstance<PrimFuncNode>())
-          << "Can only lower IR Module with PrimFuncs, but got " << kv.second->GetTypeKey();
+      if (!kv.second->IsInstance<PrimFuncNode>()) {
+        // (@jroesch): we relax constraints here, Relay functions will just be ignored.
+        DLOG(INFO) << "Can only lower IR Module with PrimFuncs, but got "
+                   << kv.second->GetTypeKey();
+        continue;
+      }
       auto f = Downcast<PrimFunc>(kv.second);
       auto global_symbol = f->GetAttr<String>(tvm::attr::kGlobalSymbol);
       ICHECK(global_symbol.defined());
@@ -234,7 +238,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
       }
       funcs.push_back(f);
     }
-    ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params));
+    // ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params));
     // TODO(tqchen): remove the entry function behavior as it does not
     // makes sense when we start to use multiple modules.
     cg->Init("TVMMod", tm_.get(), ctx_.get(), system_lib, system_lib, target_c_runtime);
diff --git a/tests/python/relay/test_backend_graph_executor.py b/tests/python/relay/test_backend_graph_executor.py
index 4ec1c21467fc4..e7040f55f6315 100644
--- a/tests/python/relay/test_backend_graph_executor.py
+++ b/tests/python/relay/test_backend_graph_executor.py
@@ -130,22 +130,22 @@ def test_plan_memory():
     mod = relay.transform.FuseOps(0)(mod)
     func = mod["main"]
     mod = relay.transform.InferType()(mod)
-    smap = relay.backend._backend.GraphPlanMemory(func)
+    memory_plan = relay.backend._backend.GraphPlanMemory(func)
     storage_ids = set()
     device_types = set()
     storage_sizes = {}
-    for k, v in smap.items():
-        assert len(v) == 3
-        for x in v[0]:
-            storage_ids.add(x.value)
-            storage_sizes[x.value] = v[2]
-        for x in v[1]:
-            device_types.add(x.value)
+
+    for k, v in memory_plan.expr_to_storage_info.items():
+        for x in v.storage_ids:
+            storage_ids.add(x)
+            storage_sizes[x] = v.storage_sizes
+        for x in v.device_types:
+            device_types.add(x)
 
     # Current rule requires vars have unique storage id
     # because we don't do inplace, we will need another
     # two alternating temporary space.
-    assert len(storage_ids) == 4
+    assert len(storage_ids) == 4, f"found storage_ids: {storage_ids}"
     assert len(device_types) == 1
     assert len(storage_sizes) == 4
 
@@ -288,11 +288,4 @@ def test_graph_executor_nested_tuples():
 
 
 if __name__ == "__main__":
-    test_reshape_nop()
-    test_plan_memory()
-    test_with_params()
-    test_add_op_scalar()
-    test_add_op_tensor()
-    test_add_op_broadcast()
-    test_gru_like()
-    test_compile_nested_tuples()
+    sys.exit(pytest.main([file] + sys.argv[1:]))
diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py
index f0949ab19f9cf..c89c7ae236615 100644
--- a/tests/python/relay/test_pass_annotation.py
+++ b/tests/python/relay/test_pass_annotation.py
@@ -49,6 +49,9 @@ def check_graph_executor(
             device_index = graph_json["attrs"]["device_index"][1]
             assert device_index == expected_index
         mod = graph_executor.create(graph, lib, contexts)
+        import pdb
+
+        pdb.set_trace()
         mod.set_input(**new_params)
         mod.run()
         res = mod.get_output(0).numpy()
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index 2922a3adf48b2..5265cf02ffa3a 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -211,7 +211,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
     "target",
     [
         ("graph", tvm.target.target.micro("host")),
-        ("aot", tvm.target.target.micro("host", options="-executor=aot")),
+        # ("aot", tvm.target.target.micro("host", options="-executor=aot")),
     ],
 )
 def test_export_model_library_format_workspace(target):
@@ -251,6 +251,11 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1
         )
         assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
         assert metadata["target"] == {"1": str(_target)}
+        # print("Metadata is: ", metadata["memory"]["functions"]["main"])
+        # print("Expected metadata: ")
+        import pdb
+
+        pdb.set_trace()
         assert metadata["memory"]["functions"]["main"] == [
             {
                 "constants_size_bytes": 0,