diff --git a/include/tvm/ir/function.h b/include/tvm/ir/function.h
index 13b984d9cb355..5ee719f9964f8 100644
--- a/include/tvm/ir/function.h
+++ b/include/tvm/ir/function.h
@@ -189,6 +189,27 @@ constexpr const char* kTarget = "target";
  * Type: String
  */
 constexpr const char* kGlobalSymbol = "global_symbol";
+
+/*!
+ * \brief The device type which will hold each of the function's parameters.
+ *
+ * Only supported on Relay \p Functions. Generally added by the \p PlanDevices pass, but
+ * may be included as an annotation on user programs.
+ *
+ * Type: Array<Integer> (but interpreted as Array<DLDeviceType>)
+ */
+constexpr const char* kParamDeviceTypes = "param_device_types";
+
+/*!
+ * \brief The device type which will hold the function result.
+ *
+ * Only supported on Relay \p Functions. Generally added by the \p PlanDevices pass, but
+ * may be included as an annotation on user programs.
+ *
+ * Type: Integer (but interpreted as DLDeviceType)
+ */
+constexpr const char* kResultDeviceType = "result_device_type";
+
 }  // namespace attr
 }  // namespace tvm
 #endif  // TVM_IR_FUNCTION_H_
diff --git a/include/tvm/parser/parser.h b/include/tvm/parser/parser.h
index 7673eec2a337f..8c27220509057 100644
--- a/include/tvm/parser/parser.h
+++ b/include/tvm/parser/parser.h
@@ -23,6 +23,7 @@
  * \file parser.h
  * \brief A parser for TVM IR.
  */
+#include
 #include
 #include

@@ -32,8 +33,11 @@
 namespace tvm {
 namespace parser {

-IRModule ParseModule(std::string file_name, std::string file_content,
-                     Optional<IRModule> init_module = Optional<IRModule>());
+using MetaTable = Map<String, Array<ObjectRef>>;
+
+IRModule ParseModule(const std::string& file_name, const std::string& file_content,
+                     const Optional<IRModule>& init_module = Optional<IRModule>(),
+                     const MetaTable& init_meta_table = MetaTable());

 }  // namespace parser
 }  // namespace tvm
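For orientation: the two new attributes surface in Python as ordinary function attributes. A minimal sketch (not part of the diff; it mirrors the updated test_annotation.py further below):

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(5, 7))
    y = relay.var("y", shape=(5, 7))
    f = relay.Function([x, y], relay.add(x, y))
    # Attach "param_device_types" and "result_device_type" as function attributes.
    f = relay.annotation.function_on_device(f, ["cpu", "cuda"], "cuda")
    assert [int(t) for t in f.attrs["param_device_types"]] == [1, 2]  # kDLCPU, kDLCUDA
    assert int(f.attrs["result_device_type"]) == 2  # kDLCUDA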
diff --git a/include/tvm/relay/attrs/function.h b/include/tvm/relay/attrs/function.h
deleted file mode 100644
index f4f94131da1f5..0000000000000
--- a/include/tvm/relay/attrs/function.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file tvm/relay/attrs/function.h
- * \brief Attributes for Relay Functions which don't make sense on PrimFuncs.
- */
-#ifndef TVM_RELAY_ATTRS_FUNCTION_H_
-#define TVM_RELAY_ATTRS_FUNCTION_H_
-
-namespace tvm {
-namespace relay {
-/*!
- * \brief Attributes for Relay function definitions which capture the devices for the
- * function parameters and result.
- *
- * See also OnDeviceAttrs in include/tvm/relay/attrs/annotation.h for the companion "on_device"
- * call attributes.
- */
-struct FunctionOnDeviceAttrs : public tvm::AttrsNode<FunctionOnDeviceAttrs> {
-  /*! \brief Device type on which each of the function's arguments already resides. */
-  Array<Integer> param_device_types;
-  // TODO(mbs): Replace device types with TargetDevice.
-  /*! \brief Device type on which function body should be evaluated. */
-  int result_device_type = kInvalidDeviceType;
-
-  TVM_DECLARE_ATTRS(FunctionOnDeviceAttrs, "relay.attrs.FunctionOnDeviceAttrs") {
-    TVM_ATTR_FIELD(param_device_types)
-        .describe("The type of the virtual device which holds each function parameters.");
-    TVM_ATTR_FIELD(result_device_type)
-        .describe("The type of the virtual device which will hold the function's result.")
-        .set_default(0);
-  }
-};
-
-namespace attr {
-
-/*!
- * \brief Device annotations for function parameters and results.
- *
- * Type: FunctionOnDeviceAttrs
- */
-constexpr static const char* kFunctionAttrsKey = "on_device";
-
-}  // namespace attr
-
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RELAY_ATTRS_FUNCTION_H_
diff --git a/python/tvm/parser/__init__.py b/python/tvm/parser/__init__.py
index 60fcddb17f08b..4b8628e49a571 100644
--- a/python/tvm/parser/__init__.py
+++ b/python/tvm/parser/__init__.py
@@ -26,8 +26,10 @@ def add(self, name, content):
     return _ffi.get_global_func("SourceMapAdd")(self, name, content)


-def parse(source, source_name="from_string"):
-    return _ffi_api.ParseModule(source_name, source)
+def parse(source, source_name="from_string", init_module=None, init_meta_table=None):
+    if init_meta_table is None:
+        init_meta_table = {}
+    return _ffi_api.ParseModule(source_name, source, init_module, init_meta_table)


 def parse_expr(source):
diff --git a/src/ir/diagnostic.cc b/src/ir/diagnostic.cc
index 876113b85f6e4..b9677d198eba0 100644
--- a/src/ir/diagnostic.cc
+++ b/src/ir/diagnostic.cc
@@ -242,10 +242,10 @@ void ReportAt(const DiagnosticContext& context, std::ostream& out, const Span& s
   }
   auto source = (*it).second;
-  DLOG(INFO) << "Source: " << std::endl << source->source;
+  VLOG(1) << "Source: " << std::endl << source->source;

-  DLOG(INFO) << "ReportAt "
-             << "span = " << span << " msg = " << diagnostic->message;
+  VLOG(1) << "ReportAt "
+          << "span = " << span << " msg = " << diagnostic->message;

   auto line_text = source.GetLine(span->line);
diff --git a/src/parser/meta_ref.h b/src/parser/meta_ref.h
index 481f334cb0fe0..483b7f726e073 100644
--- a/src/parser/meta_ref.h
+++ b/src/parser/meta_ref.h
@@ -26,6 +26,7 @@
 #define TVM_PARSER_META_REF_H_

 #include
+#include <tvm/parser/parser.h>
 #include
 #include

@@ -36,8 +37,6 @@ namespace parser {

 using namespace relay;

-using MetaTable = Map<String, Array<ObjectRef>>;
-
 /*!
  * \brief Options for allocating storage.
  */
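To make the new entry point concrete, here is a minimal sketch of seeding the parser with an in-memory constant via init_meta_table (this mirrors test_device_copy below; the merge itself is implemented in parser.cc next):

    import numpy as np
    import tvm
    from tvm import relay

    metatable = {"relay.Constant": [relay.const(np.ones((5, 7), dtype="float32"))]}
    mod = tvm.parser.parse(
        """
        #[version = "0.0.5"]
        def @main(%x: Tensor[(5, 7), float32]) {
          add(%x, meta[relay.Constant][0])
        }
        """,
        "from_string",
        None,       # init_module
        metatable,  # init_meta_table; entries are appended after any #[metadata] section entries
    )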
diff --git a/src/parser/parser.cc b/src/parser/parser.cc
index 93dc687d72f5e..4803bffedd8dc 100644
--- a/src/parser/parser.cc
+++ b/src/parser/parser.cc
@@ -1088,7 +1088,7 @@ class Parser {
     VLOG(0) << "Parser::ParseFunctionDef";
     return WithSpan<Function>([&]() {
       PushScope();
-      PushTypeScope();
+      PushTypeScope();  // TODO(mbs): BUG?

       Array<TypeVar> generics;
       if (Peek()->token_type == TokenType::kLSquare) {
@@ -1444,6 +1444,10 @@ class Parser {
           ICHECK(attr_obj.defined());
           attrs = Downcast<Attrs>(attr_obj);
         }
+      } else {
+        this->diag_ctx.EmitFatal(Diagnostic::Error(op->span)
+                                 << "unable to determine the 'attrs_type_key' with which "
+                                    "to represent the call attributes for this operator");
       }
     }
     return true;
@@ -1867,7 +1871,7 @@ class Parser {
 };

 Parser InitParser(const std::string& file_name, const std::string& file_content,
-                  Optional<IRModule> init_module) {
+                  const Optional<IRModule>& init_module, const MetaTable& init_meta_table) {
   VLOG(0) << "InitParser: file_name: " << file_name << "file_content_size: " << file_content.size();
   SourceName src_name = SourceName::Get(file_name);
   Source source(src_name, file_content);
@@ -1886,19 +1890,33 @@ Parser InitParser(const std::string& file_name, const std::string& file_content,
   auto tokens_and_table = Tokenize(diag_ctx, source);

   auto tokens = tokens_and_table.first;
-  auto meta_data_table = tokens_and_table.second;
+  MetaTable meta_data_table = tokens_and_table.second.ToMetadata();
+
+  // Merge any entries in init_meta_table into anything captured in the #[metadata] section
+  // of the file_content. Metadata references within file_content must use indexes which account
+  // for this ordering.
+  for (const auto& pair : init_meta_table) {
+    Array<ObjectRef> items;
+    if (meta_data_table.count(pair.first)) {
+      items = meta_data_table[pair.first];
+    }
+    for (const auto& obj : pair.second) {
+      items.push_back(obj);
+    }
+    meta_data_table.Set(pair.first, items);
+  }

-  return Parser(module, diag_ctx, source, tokens, DefaultOpTable(), meta_data_table.ToMetadata());
+  return Parser(module, diag_ctx, source, tokens, DefaultOpTable(), std::move(meta_data_table));
 }

-IRModule ParseModule(std::string file_name, std::string file_content,
-                     Optional<IRModule> init_module) {
+IRModule ParseModule(const std::string& file_name, const std::string& file_content,
+                     const Optional<IRModule>& init_module, const MetaTable& init_meta_table) {
   VLOG(0) << "ParseModule";
-  auto parser = InitParser(file_name, file_content, init_module);
+  auto parser = InitParser(file_name, file_content, init_module, init_meta_table);
   auto mod = parser.ParseModule();
   ICHECK(mod.defined()) << "The parser must return a non-null module.";
-  // NB(@jroesch): it is very important that we render any errors before we procede
-  // if there were any errors which allow the parser to procede we must render them
+  // NB(@jroesch): it is very important that we render any errors before we proceed
+  // if there were any errors which allow the parser to proceed we must render them
   // here.
   parser.diag_ctx.Render();

   auto infer_type = tvm::relay::transform::InferType();
@@ -1906,23 +1924,24 @@ IRModule ParseModule(std::string file_name, std::string file_content,
   return infer_type(mod);
 }

-Expr ParseExpr(std::string file_name, std::string file_content) {
+Expr ParseExpr(const std::string& file_name, const std::string& file_content) {
   VLOG(0) << "ParseExpr";
-  auto parser = InitParser(file_name, file_content, Optional<IRModule>());
+  auto parser = InitParser(file_name, file_content, Optional<IRModule>(), MetaTable());
   parser.ParseSemVer(false);
   parser.PushScope();
   auto expr = parser.ParseExpr();
   parser.Match(TokenType::kEndOfFile);
-  // NB(@jroesch): it is very important that we render any errors before we procede
-  // if there were any errors which allow the parser to procede we must render them
+  // NB(@jroesch): it is very important that we render any errors before we proceed
+  // if there were any errors which allow the parser to proceed we must render them
   // here.
   parser.diag_ctx.Render();
   return expr;
 }

 TVM_REGISTER_GLOBAL("parser.ParseModule")
-    .set_body_typed([](tvm::String file_name, tvm::String file_content) {
-      return ParseModule(file_name, file_content);
+    .set_body_typed([](const std::string& file_name, const std::string& file_content,
+                       const Optional<IRModule>& init_module, const MetaTable& init_meta_table) {
+      return ParseModule(file_name, file_content, init_module, init_meta_table);
     });

 TVM_REGISTER_GLOBAL("parser.ParseExpr")
diff --git a/src/parser/source_map.cc b/src/parser/source_map.cc
index 4e79d0e74c592..3c1329670c40e 100644
--- a/src/parser/source_map.cc
+++ b/src/parser/source_map.cc
@@ -60,7 +60,7 @@ Source::Source(SourceName src_name, std::string source) {
 }

 tvm::String Source::GetLine(int line) {
-  DLOG(INFO) << "Source::GetLine: line=" << line;
+  VLOG(1) << "Source::GetLine: line=" << line;
   ICHECK(line - 1 < static_cast<int64_t>((*this)->line_map.size()))
       << "requested line: " << line << "at index: " << (line - 1)
       << "line_map size: " << (*this)->line_map.size() << "source: " << (*this)->source;
@@ -69,10 +69,10 @@ tvm::String Source::GetLine(int line) {
   auto range = (*this)->line_map.at(line - 1);
   int line_start = range.first;
   int line_length = range.second;
-  DLOG(INFO) << "Source::GetLine: line_start=" << line_start << " line_length=" << line_length;
+  VLOG(1) << "Source::GetLine: line_start=" << line_start << " line_length=" << line_length;
   // TODO(@jroesch): expose substring on tvm::String.
   auto line_text = std::string((*this)->source).substr(line_start, line_length);
-  DLOG(INFO) << "Source::GetLine: line_text=" << line_text;
+  VLOG(1) << "Source::GetLine: line_text=" << line_text;
   return line_text;
 }
diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc
index 4eda15937f3a4..d29e1bbfde237 100644
--- a/src/relay/op/annotation/annotation.cc
+++ b/src/relay/op/annotation/annotation.cc
@@ -26,7 +26,6 @@
 #include "./annotation.h"

 #include
-#include <tvm/relay/attrs/function.h>
 #include
 #include
 #include
@@ -92,6 +91,7 @@ RELAY_REGISTER_OP("on_device")
     .add_argument("data", "Tensor", "The input data.")
     .set_support_level(10)
     .add_type_rel("Identity", IdentityRel)
+    .set_attrs_type_key("relay.attrs.OnDeviceAttrs")
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
@@ -128,14 +128,10 @@ OnDeviceProps GetOnDeviceProps(const Expr& expr) {
   return {};
 }

-TVM_REGISTER_NODE_TYPE(FunctionOnDeviceAttrs);
-
 Function FunctionOnDevice(Function function, Array<Integer> param_device_types,
-                          DLDeviceType result_device_type) {
-  auto attrs = make_object<FunctionOnDeviceAttrs>();
-  attrs->param_device_types = std::move(param_device_types);
-  attrs->result_device_type = result_device_type;
-  return WithAttr(std::move(function), attr::kFunctionAttrsKey, Attrs(std::move(attrs)));
+                          Integer result_device_type) {
+  return WithAttr(WithAttr(std::move(function), tvm::attr::kParamDeviceTypes, param_device_types),
+                  tvm::attr::kResultDeviceType, result_device_type);
 }

 Function FunctionOnDevice(Function function, const std::vector<DLDeviceType>& param_device_types,
@@ -143,9 +139,20 @@ Function FunctionOnDevice(Function function, const std::vector<DLDeviceType>& pa
   Array<Integer> arr;
   arr.reserve(param_device_types.size());
   for (const auto device_type : param_device_types) {
-    arr.push_back(static_cast<int>(device_type));
+    arr.push_back(static_cast<int>(device_type));
+  }
+  return FunctionOnDevice(std::move(function), std::move(arr),
+                          static_cast<int>(result_device_type));
+}
+
+Function OptFunctionOnDevice(Function function, const std::vector<DLDeviceType>& param_device_types,
+                             DLDeviceType result_device_type) {
+  if (std::all_of(param_device_types.begin(), param_device_types.end(),
+                  [](DLDeviceType type) { return type == kInvalidDeviceType; }) &&
+      result_device_type == kInvalidDeviceType) {
+    return function;
   }
-  return FunctionOnDevice(function, arr, result_device_type);
+  return FunctionOnDevice(function, param_device_types, result_device_type);
 }

 TVM_REGISTER_GLOBAL("relay.op.annotation._make.function_on_device")
@@ -156,32 +163,26 @@ TVM_REGISTER_GLOBAL("relay.op.annotation._make.function_on_device")
     });

 DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node) {
-  auto opt_attrs = function_node->GetAttr<Attrs>(attr::kFunctionAttrsKey);
-  if (!opt_attrs) {
+  auto opt_integer = function_node->GetAttr<Integer>(tvm::attr::kResultDeviceType);
+  if (!opt_integer) {
     // No annotation.
     return kInvalidDeviceType;
   }
-  const auto* opt_function_on_device_attrs = opt_attrs.value().as<FunctionOnDeviceAttrs>();
-  ICHECK(opt_function_on_device_attrs != nullptr)
-      << "function '" << attr::kFunctionAttrsKey << "' annotation must be a FunctionOnDeviceAttrs";
-  return static_cast<DLDeviceType>(opt_function_on_device_attrs->result_device_type);
+  return static_cast<DLDeviceType>(opt_integer.value()->value);
 }

 DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i) {
   ICHECK_LT(i, function_node->params.size())
       << "param index " << i << " out of range for function of arity "
       << function_node->params.size();
-  auto opt_attrs = function_node->GetAttr<Attrs>(attr::kFunctionAttrsKey);
-  if (!opt_attrs) {
+  auto opt_array = function_node->GetAttr<Array<Integer>>(tvm::attr::kParamDeviceTypes);
+  if (!opt_array) {
     // No annotation.
     return kInvalidDeviceType;
   }
-  const auto* opt_function_on_device_attrs = opt_attrs.value().as<FunctionOnDeviceAttrs>();
-  ICHECK(opt_function_on_device_attrs != nullptr)
-      << "function '" << attr::kFunctionAttrsKey << "' annotation must be a FunctionOnDeviceAttrs";
-  ICHECK_EQ(opt_function_on_device_attrs->param_device_types.size(), function_node->params.size())
+  ICHECK_EQ(opt_array.value().size(), function_node->params.size())
       << "annotation parameters do not match function arity";
-  return static_cast<DLDeviceType>(opt_function_on_device_attrs->param_device_types[i]->value);
+  return static_cast<DLDeviceType>(opt_array.value()[i]->value);
 }

 Expr StopFusion(Expr data) {
diff --git a/src/relay/op/annotation/annotation.h b/src/relay/op/annotation/annotation.h
index e3a4aea4708c4..7a4516bcd5813 100644
--- a/src/relay/op/annotation/annotation.h
+++ b/src/relay/op/annotation/annotation.h
@@ -83,24 +83,30 @@ OnDeviceProps GetOnDeviceProps(const Expr& expr);

 inline bool IsOnDeviceCall(const Expr& expr) { return GetOnDeviceProps(expr).body.defined(); }

 /*!
- * \brief Returns \p function annotated with "on_device" attributes capturing parameter and result
- * devices types. However returns \p function directly if all device types are \p
- * kInvalidDeviceType.
+ * \brief Returns \p function annotated with "param_device_types" and "result_device_type"
+ * attributes capturing parameter and result device types respectively.
  */
 Function FunctionOnDevice(Function function, Array<Integer> param_device_types,
-                          DLDeviceType body_device_type);
+                          Integer body_device_type);

 Function FunctionOnDevice(Function function, const std::vector<DLDeviceType>& param_device_types,
                           DLDeviceType body_device_type);

+/*!
+ * \brief As for \p FunctionOnDevice, but returns \p function unchanged if all parameters and
+ * result device types are \p kInvalidDeviceType.
+ */
+Function OptFunctionOnDevice(Function function, const std::vector<DLDeviceType>& param_device_types,
+                             DLDeviceType result_device_type);
+
 /*!
  * \brief Returns the device type for the resut of \p function_node, or \p kInvalidDeviceType
- * if function does not have "on_device" annotation.
+ * if function does not have "result_device_type" annotation.
  */
 DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node);

 /*!
  * \brief Returns the device type for the \p i'th parameter of \p function_node, or
- * \p kInvalidDeviceType if function does not have "on_device" annotation.
+ * \p kInvalidDeviceType if function does not have "param_device_types" annotation.
  */
 DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i);
diff --git a/src/relay/op/memory/device_copy.cc b/src/relay/op/memory/device_copy.cc
index b94caac2c3d99..dce89aa91b65a 100644
--- a/src/relay/op/memory/device_copy.cc
+++ b/src/relay/op/memory/device_copy.cc
@@ -76,6 +76,7 @@ on different devices.
     .add_argument("data", "Tensor", "The input data.")
     .set_support_level(10)
     .add_type_rel("Identity", IdentityRel)
+    .set_attrs_type_key("relay.attrs.DeviceCopyAttrs")
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
     .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc
index 68a83ebba1fe6..5339d48e3a2f1 100644
--- a/src/relay/op/memory/memory.cc
+++ b/src/relay/op/memory/memory.cc
@@ -86,6 +86,7 @@ RELAY_REGISTER_OP("memory.alloc_storage")
     .add_argument("size", "Tensor", "The size of the storage to allocate.")
     .add_argument("alignment", "Tensor", "The alignment of the storage.")
     .add_type_rel("AllocStorage", AllocStorageRel)
+    .set_attrs_type_key("relay.attrs.AllocStorageAttrs")
     .set_support_level(10)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
@@ -200,6 +201,7 @@ RELAY_REGISTER_OP("memory.alloc_tensor")
     .add_argument("offset", "Tensor", "The offset into the backing storage.")
     .add_argument("shape", "Tensor", "The shape of the tensor to allocate.")
     .add_type_rel("AllocTensor", AllocTensorRel)
+    .set_attrs_type_key("relay.attrs.AllocTensorAttrs")
     .set_support_level(10)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
diff --git a/src/relay/op/vm/vm.cc b/src/relay/op/vm/vm.cc
index a74a259a114f6..be31b54829379 100644
--- a/src/relay/op/vm/vm.cc
+++ b/src/relay/op/vm/vm.cc
@@ -50,6 +50,7 @@ RELAY_REGISTER_OP("vm.shape_of")
     .set_num_inputs(1)
     .add_argument("tensor", "Tensor", "The input tensor")
     .add_type_rel("ShapeOf", ShapeOfRel)
+    .set_attrs_type_key("relay.attrs.ShapeOfAttrs")
     .set_support_level(10)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
@@ -131,6 +132,7 @@ RELAY_REGISTER_OP("vm.shape_func")
     .add_argument("func", "Function", "The operation to call")
     .add_argument("ins", "Tuple", "The input tensors.")
     .add_argument("outs", "Tuple", "The output tensors.")
+    .set_attrs_type_key("relay.attrs.ShapeFuncAttrs")
     .add_type_rel("ShapeFuncRel", ShapeFuncRel)
     .set_support_level(10)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
@@ -214,6 +216,7 @@ RELAY_REGISTER_OP("vm.reshape_tensor")
     .add_argument("data", "Tensor", "The input tensor")
     .add_argument("shape", "Tensor", "The output shape tensor")
     .add_type_rel("ReshapeTensor", ReshapeTensorRel)
+    .set_attrs_type_key("relay.attrs.ReshapeTensorAttrs")
     .set_support_level(10)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
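The set_attrs_type_key registrations above pair with the parser change earlier in this diff: without a registered attrs type key the parser now fails fast instead of silently mishandling call attributes. A hedged round-trip sketch of what these registrations enable (illustrative only, not taken from the PR):

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(5, 7))
    # "on_device" calls carry OnDeviceAttrs; re-parsing printed module text relies
    # on the op's registered attrs type key to reconstruct those attributes.
    f = relay.Function([x], relay.annotation.on_device(relay.abs(x), tvm.device("cpu")))
    mod = relay.transform.InferType()(tvm.IRModule.from_expr(f))
    reparsed = tvm.parser.parse(mod.astext())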
diff --git a/src/relay/transforms/device_planner.cc b/src/relay/transforms/device_planner.cc
index cdbfcc78c5662..9f959a3bc1d9f 100644
--- a/src/relay/transforms/device_planner.cc
+++ b/src/relay/transforms/device_planner.cc
@@ -256,7 +256,6 @@
 #include
 #include
 #include
-#include <tvm/relay/attrs/function.h>
 #include
 #include
 #include
@@ -697,6 +696,22 @@ class DeviceDomains {
       args_and_result.emplace_back(free_domain);
     }
     args_and_result.emplace_back(free_domain);
+  } else if (call->op->IsInstance<ConstructorNode>()) {
+    // <constructor>(arg1, ..., argn)
+    // : fn(?x1?, ..., ?xn?):?xr?
+    // where we force all possibly higher-order ?xi? to be collapsed to the first-order ?xr?.
+    // TODO(mbs): This assumes we've eta-expanded constructors, thus all constructors appear
+    // in callee positions.
+    const auto* func_type_node = call->op->checked_type().as<FuncTypeNode>();
+    ICHECK_NOTNULL(func_type_node);
+    ICHECK_EQ(func_type_node->arg_types.size(), call->args.size());
+    auto result_domain = Free(func_type_node->ret_type);  // first-order
+    for (const auto& arg_type : func_type_node->arg_types) {
+      auto param_domain = Free(arg_type);           // possibly higher-order
+      UnifyCollapsed(result_domain, param_domain);  // collapse if required
+      args_and_result.emplace_back(param_domain);
+    }
+    args_and_result.emplace_back(result_domain);
   } else {
     // Defer to normal case where op can be an arbitrary expression.
     return DomainFor(call->op);
@@ -1038,7 +1053,8 @@ class DeviceAnalyzer : public ExprVisitor {
     VLOG(1) << "initial call function domain:" << std::endl
             << domains_->ToString(func_domain) << std::endl
             << "and implied domain:" << std::endl
-            << domains_->ToString(implied_domain) << "for call:" << std::endl
+            << domains_->ToString(implied_domain) << std::endl
+            << "for call:" << std::endl
             << PrettyPrint(call);

     // The above must match.
@@ -1113,9 +1129,7 @@ class DeviceAnalyzer : public ExprVisitor {
     // If the function already has an "on_device" attribute then we can further
     // constrain the function's domain to match it.
-    Optional<Attrs> opt_attrs =
-        function_node->GetAttr<Attrs>(attr::kFunctionAttrsKey);
-    if (opt_attrs) {
+    if (GetFunctionResultDeviceType(function_node) != kInvalidDeviceType) {
       std::vector<DeviceDomainPtr> args_and_result;
       for (size_t i = 0; i < function_node->params.size(); ++i) {
         args_and_result.emplace_back(
@@ -1213,8 +1227,8 @@ class DeviceAnalyzer : public ExprVisitor {
   }

   void VisitExpr_(const ConstructorNode* constructor_node) final {
-    // Probably needs to be device polymorphic.
-    domains_->DomainFor(GetRef<Constructor>(constructor_node));
+    // no-op, constructors are handled at their call-sites.
+    // TODO(mbs): Assumes eta-expansion
   }

   void VisitExpr_(const IfNode* if_node) final {
@@ -1396,7 +1410,7 @@ class DeviceCapturer : public ExprMutator {
   }

  private:
-  // Nothing interesting for VarNode, ConstantNode, GlobalVarNode and OpNode.
+  // Nothing interesting for VarNode, ConstantNode, GlobalVarNode, OpNode and ConstructorNode.

   Expr VisitExpr_(const TupleNode* tuple_node) final {
     auto tuple = GetRef<Tuple>(tuple_node);
@@ -1573,13 +1587,6 @@ class DeviceCapturer : public ExprMutator {
     return RefWrite(ref, value, ref_write_node->span);
   }

-  Expr VisitExpr_(const ConstructorNode* constructor_node) final {
-    auto constructor = GetRef<Constructor>(constructor_node);
-    // check we have a device type.
-    (void)GetDeviceType(constructor);
-    return constructor;
-  }
-
   Expr VisitExpr_(const MatchNode* match_node) final {
     auto match = GetRef<Match>(match_node);
     Expr data = VisitChild(match, match_node->data);
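Since device_planner.cc is the consumer of everything above, a minimal sketch of driving the pass from Python (the tests below wrap exactly this sequence in rewrite_and_assert):

    import tvm
    from tvm import relay

    mod = tvm.parser.fromtext(
        """
        #[version = "0.0.5"]
        def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
          add(%a, %b)
        }
        """
    )
    mod = relay.transform.InferType()(mod)
    # Plan with the GPU (device_type=2) as the default device; the planned module
    # carries "param_device_types"/"result_device_type" as function attributes.
    mod = relay.transform.PlanDevices(tvm.device("cuda"))(mod)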
diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc
index 6c2371716b167..ebdf1fed2fab5 100644
--- a/src/relay/transforms/type_infer.cc
+++ b/src/relay/transforms/type_infer.cc
@@ -824,7 +824,6 @@ Pass InferType() {
   auto pass_info = PassInfo(0, "InferType", {});
   return tvm::transform::CreateModulePass(
       [=](IRModule mod, const PassContext& pass_ctx) {
-        DLOG(INFO) << "tvm::relay::transform::InferType";
         // Execute the pass function and return a new module.
         IRModule updated_mod = mod->ShallowCopy();
diff --git a/tests/python/relay/op/annotation/test_annotation.py b/tests/python/relay/op/annotation/test_annotation.py
index 51daa9aaa06a4..58e559eb96809 100644
--- a/tests/python/relay/op/annotation/test_annotation.py
+++ b/tests/python/relay/op/annotation/test_annotation.py
@@ -54,13 +54,10 @@ def test_function_on_device():
     f = relay.Function([x, y], relay.add(x, y))
     func = relay.annotation.function_on_device(f, ["cpu", "cuda"], "cuda")
     assert isinstance(func, relay.Function)
-    assert len(func.attrs["on_device"].param_device_types) == 2
-    assert func.attrs["on_device"].param_device_types[0] == 1
-    # ie kDLCPU
-    assert func.attrs["on_device"].param_device_types[1] == 2
-    # ie kDLCUDA
-    assert func.attrs["on_device"].result_device_type == 2
-    # ie KDLCUDA
+    assert len(func.attrs["param_device_types"]) == 2
+    assert func.attrs["param_device_types"][0] == 1  # ie kDLCPU
+    assert func.attrs["param_device_types"][1] == 2  # ie kDLCUDA
+    assert func.attrs["result_device_type"] == 2  # ie kDLCUDA


 if __name__ == "__main__":
diff --git a/tests/python/relay/test_pass_plan_devices.py b/tests/python/relay/test_pass_plan_devices.py
index 6c3d2e266b8d2..e06ad333aa3d5 100644
--- a/tests/python/relay/test_pass_plan_devices.py
+++ b/tests/python/relay/test_pass_plan_devices.py
@@ -18,23 +18,21 @@
 """Unit tests for the PlanDevices pass. We check:
     - The pass alone given the expected AST, though we need to manually run InferTypes.
-    - The pass is idempotent."""
-
-# TODO(mbs): All the input/expected programs should be directly quoted using @script
-# TODO(mbs): Not testing Match and Constructor since not supported by Python bindings?
-# TODO(mbs): Add back reference implementation tests once VM is ready.
+    - The pass is idempotent.
+    - Execution on the VM backend yields the correct result."""

 import tvm
 from tvm import relay
 import tvm.testing
 import numpy as np

-N = 5
-M = 7
 CPU = tvm.device("cpu")  # device_type=1
 GPU = tvm.device("cuda")  # device_type=2
 DEFAULT = GPU

+core = tvm.IRModule()
+core.import_from_std("core.rly")
+

 def rewrite_and_assert(in_mod, expected_mod):
     """Manually run the pass and assert it's structurally equals to the expected."""
@@ -42,7 +40,7 @@ def rewrite_and_assert(in_mod, expected_mod):
     actual_mod = relay.transform.PlanDevices(DEFAULT)(actual_mod)
     actual_mod = relay.transform.InferType()(actual_mod)
     expected_mod = relay.transform.InferType()(expected_mod)
-    if not tvm.ir.structural_equal(actual_mod, expected_mod):
+    if not tvm.ir.structural_equal(actual_mod, expected_mod, True):
         # Print everything in full so we can see what's going on when things fail.
         print("Input module:")
         print(in_mod)
         print("Expected module:")
         print(expected_mod)
         print("Actual module:")
         print(actual_mod)
         # Assert again so as to see the actual disagreeing sub-expressions.
-        tvm.ir.assert_structural_equal(actual_mod, expected_mod)
+        tvm.ir.assert_structural_equal(actual_mod, expected_mod, True)
+
+
+def eval_and_assert(in_mod: tvm.IRModule, reference_func, args):
+    """Test the standard compilation flow gives us a function which agrees with the Numpy
+    reference implementation."""
+    if not tvm.runtime.enabled(GPU):
+        print("Not evaluating since device %s is not enabled" % GPU)
+        return
+    with tvm.transform.PassContext(opt_level=3):
+        compiled = relay.create_executor("vm", mod=in_mod, device=GPU, target="cuda").evaluate()
+        actual = compiled(*args).numpy()
+        expected = reference_func(*args)
+        tvm.testing.assert_allclose(actual, expected)


 def rand(shape):
@@ -68,455 +79,371 @@ def exercise(in_mod: tvm.IRModule, expected_mod: tvm.IRModule, reference_func, a
     rewrite_and_assert(in_mod, expected_mod)
     # Idempotence
     rewrite_and_assert(expected_mod, expected_mod)
-    # TODO(mbs): Add back compiling and comparing to reference implementation once VM is ready.
-
-
-#
-# Annotation shorthands
-#
-
-
-def on_cpu(expr: relay.Expr):
-    return relay.annotation.on_device(expr, CPU)
-
-
-def on_gpu(expr: relay.Expr):
-    return relay.annotation.on_device(expr, GPU)
-
-
-def cpu_to_gpu(expr: relay.Expr):
-    return relay.op.device_copy(expr, CPU, GPU)
-
-
-def gpu_to_cpu(expr: relay.Expr):
-    return relay.op.device_copy(expr, GPU, CPU)
-
-
-def fixed_cpu(expr: relay.Expr):
-    return relay.annotation.on_device(expr, CPU, True)
-
-
-def fixed_gpu(expr: relay.Expr):
-    return relay.annotation.on_device(expr, GPU, True)
+    # The VM can compile and possibly even run the module
+    # TODO(mbs): Disabled until VM supports new device planning.
+    # if not (reference_func is None) and not (args is None):
+    #     eval_and_assert(in_mod, reference_func, args)


 def test_plain():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-
-    # def @main(a, b, c, d) { subtract(add(a, b), add(c, d)) }
+    # Everything defaults to GPU
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function([a, b, c, d], relay.subtract(relay.add(a, b), relay.add(c, d)))
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = add(%c, %d);
+              subtract(%0, %1)
+            }
+            """
         )

-    # def @main(a, b, c, d, on_device={param_device_types=[2,2,2,2], result_device_type=2}) {
-    #   subtract(add(a, b), add(c, d))
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([a, b, c, d], relay.subtract(relay.add(a, b), relay.add(c, d))),
-                [GPU, GPU, GPU, GPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[2, 2, 2, 2], result_device_type=2) {
+              %0 = add(%a, %b);
+              %1 = add(%c, %d);
+              subtract(%0, %1)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_left_add_on_cpu():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-
-    # def @main(a, b, c, d) { subtract(on_cpu(add(a, b)), add(c, d)) }
+    # Force some args to be on CPU, rest default to GPU.
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function([a, b, c, d], relay.subtract(on_cpu(relay.add(a, b)), relay.add(c, d)))
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1);
+              %2 = add(%c, %d);
+              subtract(%1, %2)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,2,2], result_device_type=2}) {
-    #   subtract(cpu_to_gpu(fixed_cpu(add(a, b)), add(c, d))
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.subtract(cpu_to_gpu(fixed_cpu(relay.add(a, b))), relay.add(c, d)),
-                ),
-                [CPU, CPU, GPU, GPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2, 2], result_device_type=2) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %3 = add(%c, %d);
+              subtract(%2, %3)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_left_add_on_cpu_via_copy():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-
-    # def @main(a, b, c, d) { subtract(cpu_to_gpu(add(a, b)), add(c, d)) }
+    # As for test_left_add_on_cpu, but with an explicit device_copy.
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d], relay.subtract(cpu_to_gpu(relay.add(a, b)), relay.add(c, d))
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = device_copy(%0, src_dev_type=1, dst_dev_type=2);
+              %2 = add(%c, %d);
+              subtract(%1, %2)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,2,2], result_device_type=2}) {
-    #   subtract(cpu_to_gpu(fixed_cpu(add(a, b)), add(c, d))
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.subtract(cpu_to_gpu(fixed_cpu(relay.add(a, b))), relay.add(c, d)),
-                ),
-                [CPU, CPU, GPU, GPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2, 2], result_device_type=2) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %3 = add(%c, %d);
+              subtract(%2, %3)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_both_adds_on_cpu():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-
-    # def @main(a, b, c, d) { subtract(on_cpu(add(a, b)), on_cpu(add(c, d))) }
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d], relay.subtract(on_cpu(relay.add(a, b)), on_cpu(relay.add(c, d)))
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = add(%c, %d);
+              %2 = on_device(%0, device_type=1);
+              %3 = on_device(%1, device_type=1);
+              subtract(%2, %3)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,1,1], result_device_type=2}) {
-    #   subtract(cpu_to_gpu(fixed_cpu(add(a, b)), cpu_to_gpu(fixed_cpu(add(c, d))))
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.subtract(
-                        cpu_to_gpu(fixed_cpu(relay.add(a, b))),
-                        cpu_to_gpu(fixed_cpu(relay.add(c, d))),
-                    ),
-                ),
-                [CPU, CPU, CPU, CPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 1, 1], result_device_type=2) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = add(%c, %d);
+              %3 = on_device(%2, device_type=1, is_fixed=True);
+              %4 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2);
+              subtract(%4, %5)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_sharing():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-
-    # def @main(a, b) {
-    #   %0 = add(a, b)
-    #   subtract(on_cpu(%0), %0) }
+    # The same add sub-expression is annotated twice.
    def input():
-        add = relay.add(a, b)
-        return tvm.IRModule.from_expr(
-            relay.Function([a, b], relay.subtract(on_cpu(add), on_cpu(add)))
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1);
+              %2 = on_device(%0, device_type=1);
+              subtract(%1, %2)
+            }
+            """
        )

-    # def @main(a, b, on_device={param_device_types=[1,1], result_device_type=2}) {
-    #   %0 = add(a, b)
-    #   subtract(cpu_to_gpu(fixed_cpu(%0), cpu_to_gpu(fixed_cpu(%0)))
    def expected():
-        add = relay.add(a, b)
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b], relay.subtract(cpu_to_gpu(fixed_cpu(add)), cpu_to_gpu(fixed_cpu(add)))
-                ),
-                [CPU, CPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1], result_device_type=2) {
+              %0 = add(%a, %b);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = on_device(%0, device_type=1, is_fixed=True);
+              %3 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %4 = device_copy(%2, src_dev_type=1, dst_dev_type=2);
+              subtract(%3, %4)
+            }
+            """
        )

     def ref(a, b):
         x = np.add(a, b)
         return np.subtract(x, x)

-    exercise(input(), expected(), ref, rands(shape, 2))
+    exercise(input(), expected(), ref, rands((5, 7), 2))


 def test_let_on_cpu():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-    l = relay.Var("l")
-    r = relay.Var("r")
-
-    # def @main(a, b, c, d) {
-    #   let l = add(a, b);
-    #   let r = add(c, d);
-    #   subtract(on_cpu(l), r)
-    # }
+    # The device for a let-bound expression can flow from uses of the let-bound var.
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d],
-                relay.Let(
-                    l, relay.add(a, b), relay.Let(r, relay.add(c, d), relay.subtract(on_cpu(l), r))
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              let %l = add(%a, %b);
+              let %r = add(%c, %d);
+              %0 = on_device(%l, device_type=1);
+              subtract(%0, %r)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,2,2], result_device_type=2}) {
-    #   let l = fixed_cpu(add(a, b));
-    #   let r = add(c, d);
-    #   subtract(cpu_to_gpu(l), r)
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.Let(
-                        l,
-                        fixed_cpu(relay.add(a, b)),
-                        relay.Let(r, relay.add(c, d), relay.subtract(cpu_to_gpu(l), r)),
-                    ),
-                ),
-                [CPU, CPU, GPU, GPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2, 2], result_device_type=2) {
+              %0 = add(%a, %b);
+              let %l = on_device(%0, device_type=1, is_fixed=True);
+              let %r = add(%c, %d);
+              %1 = device_copy(%l, src_dev_type=1, dst_dev_type=2);
+              subtract(%1, %r)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_func_param_on_cpu():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-    f = relay.Var("f")
-    x = relay.Var("x")
-    y = relay.Var("y")
-
-    # def @main(a, b, c, d) {
-    #   let f = fn(x, y) { on_cpu(add(x, y)) }  -- forces both body and result on CPU
-    #   subtract(f(a, b), add(c, d))
-    # }
+    # Devices for function parameters flow to call sites.
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d],
-                relay.Let(
-                    f,
-                    relay.Function([x, y], on_cpu(relay.add(x, y))),
-                    relay.subtract(relay.Call(f, [a, b]), relay.add(c, d)),
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              let %f = fn (%x, %y) {
+                %0 = add(%x, %y);
+                on_device(%0, device_type=1)
+              };
+              %1 = %f(%a, %b);
+              %2 = add(%c, %d);
+              subtract(%1, %2)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,1,1], result_device_type=1}) {
-    #   let f = fn(x, y, on_device={param_device_types[1,1], result_device_type=1}) {
-    #     add(x, y)
-    #   };
-    #   subtract(f(a, b), add(c, d))
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.Let(
-                        f,
-                        relay.annotation.function_on_device(
-                            relay.Function([x, y], relay.add(x, y)), [CPU, CPU], CPU
-                        ),
-                        relay.subtract(relay.Call(f, [a, b]), relay.add(c, d)),
-                    ),
-                ),
-                [CPU, CPU, CPU, CPU],
-                CPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 1, 1], result_device_type=1) {
+              let %f = fn (%x, %y, param_device_types=[1, 1], result_device_type=1) {
+                add(%x, %y)
+              };
+              %0 = %f(%a, %b);
+              %1 = add(%c, %d);
+              subtract(%0, %1)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_func_result_on_cpu():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)
-    f = relay.Var("f")
-    x = relay.Var("x")
-    y = relay.Var("y")
-
-    # def @main(a, b, c, d) {
-    #   let f = fn(x, y) { add(x, y) }
-    #   subtract(on_cpu(f(a, b)), add(c, d))
-    # }
+    # Devices for call sites flow to function results.
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d],
-                relay.Let(
-                    f,
-                    relay.Function([x, y], relay.add(x, y)),
-                    relay.subtract(on_cpu(relay.Call(f, [a, b])), relay.add(c, d)),
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              let %f = fn (%x, %y) {
+                add(%x, %y)
+              };
+              %0 = %f(%a, %b);
+              %1 = on_device(%0, device_type=1);
+              %2 = add(%c, %d);
+              subtract(%1, %2)
+            }
+            """
        )

-    # def @main(a, b, c, d, on_device={param_device_types=[1,1,2,2], result_device_type=2}) {
-    #   let f = fixed_cpu(fn(x, y, on_device={param_device_types=[1,1], result_device_type=1}) {
-    #     add(x, y)
-    #   });
-    #   subtract(cpu_to_gpu(fixed_cpu(f(a, b))), add(c, d))
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.Let(
-                        f,
-                        fixed_cpu(
-                            relay.annotation.function_on_device(
-                                relay.Function([x, y], relay.add(x, y)), [CPU, CPU], CPU
-                            )
-                        ),
-                        relay.subtract(
-                            cpu_to_gpu(fixed_cpu(relay.Call(f, [a, b]))), relay.add(c, d)
-                        ),
-                    ),
-                ),
-                [CPU, CPU, GPU, GPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2, 2], result_device_type=2) {
+              %0 = fn (%x, %y, param_device_types=[1, 1], result_device_type=1) {
+                add(%x, %y)
+              };
+              let %f = on_device(%0, device_type=1, is_fixed=True);
+              %1 = %f(%a, %b);
+              %2 = on_device(%1, device_type=1, is_fixed=True);
+              %3 = device_copy(%2, src_dev_type=1, dst_dev_type=2);
+              %4 = add(%c, %d);
+              subtract(%3, %4)
+            }
+            """
        )

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.add(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_higher_order():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    f = relay.Var("f")
-    g = relay.Var("g")
-    a = relay.Var("a")
-    h = relay.Var("h")
-    b = relay.Var("b")
-
-    # The constraint on a flows back to y via f and h
-    # def @main(x, y) {
-    #   let f = fn(g) { fn(a) { add(g(on_cpu(a)), x) } }
-    #   let h = fn(b) { relu(b) }
-    #   subtract(x, f(h)(y))
-    # }
+    # The constraint on %a flows back to %y via %f and %h
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x, y],
-                relay.Let(
-                    f,
-                    relay.Function(
-                        [g], relay.Function([a], relay.add(relay.Call(g, [on_cpu(a)]), x))
-                    ),
-                    relay.Let(
-                        h,
-                        relay.Function([b], relay.negative(b)),
-                        relay.subtract(x, relay.Call(relay.Call(f, [h]), [y])),
-                    ),
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
+              let %f = fn (%g) {
+                fn (%a) {
+                  %0 = on_device(%a, device_type=1);
+                  %1 = %g(%0);
+                  add(%1, %x)
+                }
+              };
+              let %h = fn (%b) {
+                negative(%b)
+              };
+              %2 = %f(%h);
+              %3 = %2(%y);
+              subtract(%x, %3)
+            }
+            """
        )

-    # def @main(x, y, on_device={param_device_types=[GPU, CPU], result_device_type=GPU}) {
-    #   let f = fn(g, on_device={param_device_types=[GPU], result_device_type=GPU}) {
-    #     fn(a, on_device={param_device_types=[CPU], result_device_type=GPU}) {
-    #       add(g(cpu_to_gpu(a)), x)
-    #     }
-    #   }
-    #   let h = fn(b, on_device={param_device_types=[GPU], result_device_type=GPU}) { negative(b) }
-    #   subtract(x, f(h)(y))
-    # }
    def expected():
        # Yeah, this is illegible.
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y],
-                    relay.Let(
-                        f,
-                        relay.annotation.function_on_device(
-                            relay.Function(
-                                [g],
-                                relay.annotation.function_on_device(
-                                    relay.Function(
-                                        [a], relay.add(relay.Call(g, [cpu_to_gpu(a)]), x)
-                                    ),
-                                    [CPU],
-                                    GPU,
-                                ),
-                            ),
-                            [GPU],
-                            GPU,
-                        ),
-                        relay.Let(
-                            h,
-                            relay.annotation.function_on_device(
-                                relay.Function([b], relay.negative(b)), [GPU], GPU
-                            ),
-                            relay.subtract(x, relay.Call(relay.Call(f, [h]), [y])),
-                        ),
-                    ),
-                ),
-                [GPU, CPU],
-                GPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32],
+                      param_device_types=[2, 1], result_device_type=2) {
+              let %f = fn (%g, param_device_types=[2], result_device_type=2) {
+                fn (%a, param_device_types=[1], result_device_type=2) {
+                  %0 = device_copy(%a, src_dev_type=1, dst_dev_type=2);
+                  %1 = %g(%0);
+                  add(%1, %x)
+                }
+              };
+              let %h = fn (%b, param_device_types=[2], result_device_type=2) {
+                negative(%b)
+              };
+              %2 = %f(%h);
+              %3 = %2(%y);
+              subtract(%x, %3)
+            }
+            """
        )

     def ref(x, y):
@@ -528,157 +455,125 @@ def h(b):
         return np.subtract(x, f(h)(y))

-    exercise(input(), expected(), ref, rands(shape, 2))
+    exercise(input(), expected(), ref, rands((5, 7), 2))


 def test_function_in_tuple():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    y = relay.var("y", shape=shape)
-    f = relay.Var("f")
-    t = relay.Var("t")
-
-    # Since f end up in a tuple its argument and result is forced to be on the CPU
-    # def @main(x, y) {
-    #   let f = fn(a, b) { add(a, on_cpu(b)) }
-    #   let t = (f, x)
-    #   t.0(t.1, y)
-    # }
+    # Since %f ends up in a tuple its argument and result is forced to be on the CPU
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x, y],
-                relay.Let(
-                    f,
-                    relay.Function([a, b], relay.add(a, on_cpu(b))),
-                    relay.Let(
-                        t,
-                        relay.Tuple([f, x]),
-                        relay.Call(relay.TupleGetItem(t, 0), [relay.TupleGetItem(t, 1), y]),
-                    ),
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
+              let %f = fn (%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) {
+                %0 = on_device(%b, device_type=1);
+                add(%a, %0)
+              };
+              let %t = (%f, %x);
+              %1 = %t.1;
+              %2 = %t.0;
+              %2(%1, %y)
+            }
+            """
        )

-    # def @main(x, y, on_device={param_device_types=[1,1], result_device_type=1}) {
-    #   let f = fn(a, b, on_device={param_device_types=[1,1], result_device_type=1}) { add(a, b) }
-    #   let t = (f, x)
-    #   t.0(t.1, y)
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y],
-                    relay.Let(
-                        f,
-                        relay.annotation.function_on_device(
-                            relay.Function([a, b], relay.add(a, b)), [CPU, CPU], CPU
-                        ),
-                        relay.Let(
-                            t,
-                            relay.Tuple([f, x]),
-                            relay.Call(relay.TupleGetItem(t, 0), [relay.TupleGetItem(t, 1), y]),
-                        ),
-                    ),
-                ),
-                [CPU, CPU],
-                CPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1], result_device_type=1) {
+              let %f = fn (%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                           param_device_types=[1, 1], result_device_type=1) {
+                add(%a, %b)
+              };
+              let %t = (%f, %x);
+              %0 = %t.1;
+              %1 = %t.0;
+              %1(%0, %y)
+            }
+            """
        )

     def ref(x, y):
         return np.add(x, y)

-    exercise(input(), expected(), ref, rands(shape, 2))
+    exercise(input(), expected(), ref, rands((5, 7), 2))


 def test_device_copy():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    const = relay.const(rand(shape))
+    const = rand((5, 7))
+    metatable = {"relay.Constant": [relay.const(const)]}

-    # def @main(x) { add(cpu_to_gpu(x), const) }
    def input():
-        return tvm.IRModule.from_expr(relay.Function([x], relay.add(cpu_to_gpu(x), const)))
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32]) {
+              %0 = device_copy(%x, src_dev_type=1, dst_dev_type=2);
+              add(%0, meta[relay.Constant][0])
+            }
+            """,
+            "from_string",
+            None,
+            metatable,
+        )

-    # def @main(x, on_device={param_device_types=[1], result_device_type=2}) {
-    #   add(cpu_to_gpu(x), constant)
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([x], relay.add(cpu_to_gpu(x), const)), [CPU], GPU
-            )
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], param_device_types=[1], result_device_type=2) {
+              %0 = device_copy(%x, src_dev_type=1, dst_dev_type=2);
+              add(%0, meta[relay.Constant][0])
+            }
+            """,
+            "from_string",
+            None,
+            metatable,
        )

     def ref(x):
-        return np.add(x, const.data.numpy())
+        return np.add(x, const)

-    exercise(input(), expected(), ref, rands(shape, 1))
+    exercise(input(), expected(), ref, rands((5, 7), 1))


 def test_shape_func():
-    p = relay.var("p")
-    data_shape = (relay.Any(),)
-    x = relay.var("x", shape=data_shape)
-    y = relay.var("y", shape=data_shape)
-    s = relay.var("s", shape=(1,), dtype="int64")
-
-    # def @main(x, s) {
-    #   let p = fixed_gpu(fn(y) { relu(y) })  -- simulates a primitive post FuseOps
-    #   shape_func(p,
-    #              (shape_of(fixed_gpu(x)),),  -- shape of primitive input tensor
-    #              (s,),                       -- space for output shape
-    #              [False])                    -- calling with input shapes not tensors
-    # }
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x, s],
-                relay.Let(
-                    p,
-                    fixed_gpu(relay.Function([y], relay.nn.relu(y))),
-                    relay.op.vm.shape_func(
-                        p,
-                        relay.Tuple([relay.op.vm.shape_of(fixed_gpu(x))]),
-                        relay.Tuple([s]),
-                        [False],
-                    ),
-                ),
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64]) {
+              %0 = fn (%y: Tensor[(?), float32]) {
+                nn.relu(%y)
+              };
+              let %p = on_device(%0, device_type=2, is_fixed=True);
+              %1 = on_device(%x, device_type=2, is_fixed=True);
+              %2 = vm.shape_of(%1, dtype="int64");
+              %3 = (%2,);
+              %4 = (%s,);
+              vm.shape_func(%p, %3, %4, is_input=[False])
+            }
+            """
        )

-    # def @main(x, s, on_device={param_device_types=[2,1], result_device_type=1}) {
-    #   let p = fixed_gpu(fn(y, param_device_types=[2], result_device_type=2) { relu(y) })
-    #   shape_func(p,
-    #              (shape_of(x),),
-    #              (s,),
-    #              [False])
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, s],
-                    relay.Let(
-                        p,
-                        fixed_gpu(
-                            relay.annotation.function_on_device(
-                                relay.Function([y], relay.nn.relu(y)), [GPU], GPU
-                            )
-                        ),
-                        relay.op.vm.shape_func(
-                            p, relay.Tuple([relay.op.vm.shape_of(x)]), relay.Tuple([s]), [False]
-                        ),
-                    ),
-                ),
-                [GPU, CPU],
-                CPU,
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64],
+                      param_device_types=[2, 1], result_device_type=1) {
+              %0 = fn (%y: Tensor[(?), float32], param_device_types=[2], result_device_type=2) {
+                nn.relu(%y)
+              };
+              let %p = on_device(%0, device_type=2, is_fixed=True);
+              %1 = vm.shape_of(%x, dtype="int64");
+              %2 = (%1,);
+              %3 = (%s,);
+              vm.shape_func(%p, %2, %3, is_input=[False])
+            }
+            """
        )

     # Don't try to execute, too fiddly to setup.
     exercise(input(), expected(), None, None)


 def test_shape_of():
-    compiletime_shape = (relay.Any(), relay.Any())
-    runtime_shape = (N, M)
-    x = relay.var("x", shape=compiletime_shape)
-
-    # We need to use fixed_gpu since the result of on_gpu will default to the result device of @main which is cpu,
-    # which then forces a copy.
+    # We need to use is_fixed=True in the on_device call so that the tensor will be on the GPU. Otherwise the
+    # result defaults to the result device for @main which is the CPU, thus forcing a copy.
     # TODO(mbs): Perhaps the defaulting heuristics are being too clever?
-    # def @main(x) { shape_of(fixed_gpu(x)) }
    def input():
-        return tvm.IRModule.from_expr(relay.Function([x], relay.op.vm.shape_of(fixed_gpu(x))))
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(?, ?), float32]) {
+              %0 = on_device(%x, device_type=2, is_fixed=True);
+              vm.shape_of(%0, dtype="int64")
+            }
+            """
        )

-    # def @main(x, on_device={param_device_types=[2], result_dev_type=1}) {
-    #   shape_of(x)
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([x], relay.op.vm.shape_of(x)), [GPU], CPU
-            )
+        return tvm.parser.fromtext(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(?, ?), float32], param_device_types=[2], result_device_type=1) {
+              vm.shape_of(%x, dtype="int64")
+            }
+            """
        )

     def ref(x):
         return x.shape

-    exercise(input(), expected(), ref, rands(runtime_shape, 1))
+    exercise(input(), expected(), ref, rands((5, 7), 1))


 def test_alloc_storage():
-    size = relay.Var("size", relay.scalar_type("int64"))
-    alignment = relay.Var("alignment", relay.scalar_type("int64"))
-    main = relay.GlobalVar("main")
-    stdlib = tvm.IRModule()
-    stdlib.import_from_std("core.rly")
-
-    # def @main(size, alignment) { alloc_storage(size, alignment, GPU) }
    def input():
-        mod = tvm.IRModule()
-        mod.update(stdlib)
-        mod[main] = relay.Function(
-            [size, alignment], relay.op.memory.alloc_storage(size, alignment, GPU)
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%size: int64, %alignment: int64) {
+              memory.alloc_storage(%size, %alignment, device_id=0, device_type=2)
+            }
+            """,
+            "from_string",
+            core,
        )
-        return mod

-    # def @main(size, alignment, on_device={param_device_types=[1,1], result_device_type=2}) {
-    #   alloc_storage(size, alignment, GPU)
-    # }
    def expected():
-        mod = tvm.IRModule()
-        mod.update(stdlib)
-        mod[main] = relay.annotation.function_on_device(
-            relay.Function([size, alignment], relay.op.memory.alloc_storage(size, alignment, GPU)),
-            [CPU, CPU],
-            GPU,
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%size: int64, %alignment: int64, param_device_types=[1, 1], result_device_type=2) {
+              memory.alloc_storage(%size, %alignment, device_id=0, device_type=2)
+            }
+            """,
+            "from_string",
+            core,
        )
-        return mod

     # Don't try to execute, too fiddly to setup.
     exercise(input(), expected(), None, None)


 def test_alloc_tensor():
-    stdlib = tvm.IRModule()
-    stdlib.import_from_std("core.rly")
-    sto_type = relay.TypeCall(stdlib.get_global_type_var("Storage"), [])
-    sto = relay.Var("sto", sto_type)
-    main = relay.GlobalVar("main")
-    shape = relay.const(np.array([3, 2]), dtype="int64")
-
-    # def @main(sto) { alloc_tensor(sto, 0, [3, 2]) }
+    shape = np.array([3, 2])
+    metatable = {"relay.Constant": [relay.const(shape, dtype="int64")]}
+
    def input():
-        mod = tvm.IRModule()
-        mod.update(stdlib)
-        mod[main] = relay.Function(
-            [sto], relay.op.memory.alloc_tensor(sto, relay.const(0, dtype="int64"), shape)
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%sto: Storage[]) {
+              memory.alloc_tensor(%sto, 0, meta[relay.Constant][0],
+                                  const_shape=meta[relay.Constant][0], assert_shape=[])
+            }
+            """,
+            "from_string",
+            core,
+            metatable,
        )
-        return mod

-    # def @main(sto, on_device={param_device_types=[2], result_device_type=2}) {
-    #   alloc_tensor(sto, fixed_cpu(0), fixed_cpu([3, 2]))
-    # }
    def expected():
-        mod = tvm.IRModule()
-        mod.update(stdlib)
-        mod[main] = relay.annotation.function_on_device(
-            relay.Function(
-                [sto],
-                relay.op.memory.alloc_tensor(
-                    sto, fixed_cpu(relay.const(0, dtype="int64")), fixed_cpu(shape)
-                ),
-            ),
-            [GPU],
-            GPU,
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%sto: Storage[], param_device_types=[2], result_device_type=2) {
+              %0 = on_device(0, device_type=1, is_fixed=True);
+              %1 = on_device(meta[relay.Constant][0], device_type=1, is_fixed=True);
+              memory.alloc_tensor(%sto, %0, %1, const_shape=meta[relay.Constant][0], assert_shape=[])
+            }
+            """,
+            "from_string",
+            core,
+            metatable,
        )
-        return mod

     # Don't try to execute, too fiddly to setup.
     exercise(input(), expected(), None, None)


 def test_reshape_tensor():
-    shape = (2, 8)
-    x = relay.var("x", shape=shape, dtype="float32")
-    newshape_expr = relay.const([2, 4, 2], dtype="int64")
-    newshape_prim = [2, 4, 2]
+    newshape = [2, 4, 2]
+    metatable = {"relay.Constant": [relay.const(newshape)]}

-    # def @main(x) { reshape_tensor(x, shape, newshape=[2,4,2]) }
    def input():
-        return tvm.IRModule.from_expr(
-            relay.Function([x], relay.op.vm.reshape_tensor(x, newshape_expr, newshape_prim))
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(2, 8), float32]) {
+              vm.reshape_tensor(%x, meta[relay.Constant][0], newshape=[2, 4, 2])
+            }
+            """,
+            "from_string",
+            None,
+            metatable,
        )

-    # def @main(x, on_device={param_device_types=[2], result_device_type=2}) {
-    #   reshape_tensor(x, fixed_cpu(shape), newshape=[2,4,2])
-    # }
    def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x], relay.op.vm.reshape_tensor(x, fixed_cpu(newshape_expr), newshape_prim)
-                ),
-                [GPU],
-                GPU,
-            )
+        return tvm.parser.parse(
+            """
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(2, 8), float32], param_device_types=[2], result_device_type=2) {
+              %0 = on_device(meta[relay.Constant][0], device_type=1, is_fixed=True);
+              vm.reshape_tensor(%x, %0, newshape=[2, 4, 2])
+            }
+            """,
+            "from_string",
+            None,
+            metatable,
        )

     def ref(x):
-        return np.reshape(x, newshape_prim)
+        return np.reshape(x, newshape)

-    exercise(input(), expected(), ref, rands(shape, 1))
+    exercise(input(), expected(), ref, rands((2, 8), 1))


 def test_dynamic_input():
-    compiletime_shape = (relay.Any(), relay.Any())
-    runtime_shape = (N, M)
-    x0 = relay.var("x0", shape=compiletime_shape)
-    x1 = relay.var("x1", shape=compiletime_shape)
-
-    # def @main(x0, x1) { add(x0, x1) }
 def test_dynamic_input():
-    compiletime_shape = (relay.Any(), relay.Any())
-    runtime_shape = (N, M)
-    x0 = relay.var("x0", shape=compiletime_shape)
-    x1 = relay.var("x1", shape=compiletime_shape)
-
-    # def @main(x0, x1) { add(x0, x1) }
+    # There's nothing special about inferring devices for partially unknown types.
     def input():
-        return tvm.IRModule.from_expr(relay.Function([x0, x1], relay.add(x0, x1)))
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x0: Tensor[(?, ?), float32], %x1: Tensor[(?, ?), float32]) {
+              add(%x0, %x1)
+            }
+        """)

-    # def @main(x0, x1), on_device={param_device_types=[2,2], result_device_type=2}) {
-    #   add(x0, x1)
-    # }
     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([x0, x1], relay.add(x0, x1)), [GPU, GPU], GPU
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x0: Tensor[(?, ?), float32], %x1: Tensor[(?, ?), float32],
+                      param_device_types=[2, 2], result_device_type=2) {
+              add(%x0, %x1)
+            }
+        """)

     def ref(x0, x1):
         return np.add(x0, x1)

-    exercise(input(), expected(), ref, rands(runtime_shape, 2))
+    exercise(input(), expected(), ref, rands((5, 7), 2))


 def test_redundant_annotation():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    z = relay.var("z", shape=shape)
-
-    # def @main(x, y, z) {
-    #   %0 = add(x, y)
-    #   add(subtract(on_cpu(%0), z), on_cpu(%0))
-    # }
     def input():
-        a = relay.add(x, y)
-        return tvm.IRModule.from_expr(
-            relay.Function([x, y, z], relay.add(relay.subtract(on_cpu(a), z), on_cpu(a)))
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=1);
+              %2 = subtract(%1, %z);
+              %3 = on_device(%0, device_type=1);
+              add(%2, %3)
+            }
+        """)

-    # def @main(x, y, z, on_device={param_device_types=[1,1,2], result_device_type=2}) {
-    #   %0 = add(x, y)
-    #   add(subtract(cpu_to_gpu(fixed_cpu(%0)), z), cpu_to_gpu(fixed_cpu(%0)))
-    # }
     def expected():
-        a = relay.add(x, y)
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y, z],
-                    relay.add(
-                        relay.subtract(cpu_to_gpu(fixed_cpu(a)), z), cpu_to_gpu(fixed_cpu(a))
-                    ),
-                ),
-                [CPU, CPU, GPU],
-                GPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2], result_device_type=2) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %3 = on_device(%0, device_type=1, is_fixed=True);
+              %4 = subtract(%2, %z);
+              %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2);
+              add(%4, %5)
+            }
+        """)

     def ref(x, y, z):
         a = np.add(x, y)
         return np.add(np.subtract(a, z), a)

-    exercise(input(), expected(), ref, rands(shape, 3))
+    exercise(input(), expected(), ref, rands((5, 7), 3))
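(For reference, the `exercise` helper every test calls is defined earlier in this file, outside
this hunk. The structural half of its check is roughly the sketch below; the PlanDevices
invocation and the default-device argument here are illustrative guesses, not the helper's
exact code:

    import tvm
    from tvm import relay

    def exercise_sketch(in_mod, expected_mod, default_device_type=2):
        # Hypothetical invocation: plan devices with GPU (device type 2) as the
        # default, then compare structurally, matching free vars by position.
        # The real helper also runs the modules against a NumPy reference.
        actual_mod = relay.transform.PlanDevices(default_device_type)(in_mod)
        tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True)
)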
 def test_annotate_expr():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    z = relay.var("z", shape=shape)
-
-    # def @main(x, y, z) { on_cpu(subtract(on_gpu(add(x, y)), z)) } -- forces function result also on cpu
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function([x, y, z], on_cpu(relay.subtract(on_gpu(relay.add(x, y)), z)))
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=2);
+              %2 = subtract(%1, %z);
+              on_device(%2, device_type=1)
+            }
+        """)

-    # def @main(x, y, z, on_device={param_device_types=[2,2,1], result_device_type=1}) {
-    #   subtract(gpu_to_cpu(fixed_gpu(add(x, y))), z)
-    # }
     def expected():
-        add = relay.add(x, y)
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y, z], relay.subtract(gpu_to_cpu(fixed_gpu(relay.add(x, y))), z)
-                ),
-                [GPU, GPU, CPU],
-                CPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32],
+                      param_device_types=[2, 2, 1], result_device_type=1) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=2, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=2, dst_dev_type=1);
+              subtract(%2, %z)
+            }
+        """)

     def ref(x, y, z):
         return np.subtract(np.add(x, y), z)

-    exercise(input(), expected(), ref, rands(shape, 3))
+    exercise(input(), expected(), ref, rands((5, 7), 3))


 def test_annotate_all():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    z = relay.var("z", shape=shape)
-
-    # def @main(x, y, z) { on_cpu(subtract(on_cpu(add(x, y)), z)) } -- top-level also forces result to be CPU
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function([x, y, z], on_cpu(relay.subtract(on_cpu(relay.add(x, y)), z)))
-        )
+        return tvm.parser.parse("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=1);
+              %2 = subtract(%1, %z);
+              on_device(%2, device_type=1)
+            }
+        """)

-    # def @main(x, y, z, on_device={param_device_types=[CPU, CPU, CPU], result_device_type=CPU}) {
-    #   subtract(add(x, y), z)
-    # }
     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([x, y, z], relay.subtract(relay.add(x, y), z)), [CPU, CPU, CPU], CPU
-            )
-        )
+        return tvm.parser.parse("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 1], result_device_type=1) {
+              %0 = add(%x, %y);
+              subtract(%0, %z)
+            }
+        """)

     def ref(x, y, z):
         return np.subtract(np.add(x, y), z)

-    exercise(input(), expected(), ref, rands(shape, 3))
+    exercise(input(), expected(), ref, rands((5, 7), 3))
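(Throughout these programs the `device_type` integers follow DLPack's DLDeviceType numbering:
1 is kDLCPU and 2 is kDLGPU (kDLCUDA in newer DLPack), which is why `param_device_types=[1, 1, 1]`
pins everything to CPU above. A quick check of the encoding:

    import tvm

    # The numeric encoding behind device_type=1 / device_type=2 in the tests.
    assert tvm.device("cpu").device_type == 1
    assert tvm.device("cuda").device_type == 2
)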
 def test_conv_network():
@@ -959,97 +845,87 @@ def test_conv_network():
     |          <--- CPU
     """
-    batch_size = 1
-    dshape = (batch_size, 64, 56, 56)
-    wshape = (64, 64, 3, 3)
-    weight = relay.var("weight", shape=wshape)
-    data1 = relay.var("data1", shape=dshape)
-    data2 = relay.var("data2", shape=dshape)

     def input():
-        conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        conv2d_2 = relay.nn.conv2d(data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        add = relay.add(on_cpu(conv2d_1), on_cpu(conv2d_2))
-        conv2d_3 = relay.nn.conv2d(
-            on_gpu(add), weight, channels=64, kernel_size=(3, 3), padding=(1, 1)
-        )
-        return tvm.IRModule.from_expr(relay.Function([data1, data2, weight], on_cpu(conv2d_3)))
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 56), float32],
+                      %weight: Tensor[(64, 64, 3, 3), float32]) {
+              %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
+              %1 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
+              %2 = on_device(%0, device_type=1);
+              %3 = on_device(%1, device_type=1);
+              %4 = add(%2, %3);
+              %5 = on_device(%4, device_type=2);
+              %6 = nn.conv2d(%5, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
+              on_device(%6, device_type=1)
+            }
+        """)

     def expected():
-        conv2d_1 = relay.nn.conv2d(data1, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        conv2d_2 = relay.nn.conv2d(data2, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
-        add = relay.add(cpu_to_gpu(fixed_cpu(conv2d_1)), cpu_to_gpu(fixed_cpu(conv2d_2)))
-        conv2d_3 = relay.nn.conv2d(
-            gpu_to_cpu(fixed_gpu(add)), weight, channels=64, kernel_size=(3, 3), padding=(1, 1)
-        )
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function([data1, data2, weight], conv2d_3), [CPU, CPU, CPU], CPU
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 56), float32],
+                      %weight: Tensor[(64, 64, 3, 3), float32], param_device_types=[1, 1, 1], result_device_type=1) {
+              %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]);
+              %3 = on_device(%2, device_type=1, is_fixed=True);
+              %4 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2);
+              %6 = add(%4, %5);
+              %7 = on_device(%6, device_type=2, is_fixed=True);
+              %8 = device_copy(%7, src_dev_type=2, dst_dev_type=1);
+              nn.conv2d(%8, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3])
+            }
+        """)

     # Don't try to execute, we don't have a reference conv2d
     exercise(input(), expected(), None, None)
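(A cheap sanity check on expectations like the one above is to count the `device_copy` calls the
pass should introduce: for this network, two CPU-to-GPU copies feeding the add plus one GPU-to-CPU
copy after it, i.e. three in total. A sketch, not part of the test file:

    import tvm
    from tvm import relay

    def count_device_copies(mod):
        """Counts calls to the device_copy op in @main."""
        count = [0]

        def visit(expr):
            if isinstance(expr, relay.Call) and isinstance(expr.op, tvm.ir.Op):
                if expr.op.name == "device_copy":
                    count[0] += 1

        relay.analysis.post_order_visit(mod["main"], visit)
        return count[0]

    # For the expected module above: count_device_copies(expected()) == 3
)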
 def test_tuple_get_item():
-    shape = (3, 3, 4)
-    x = relay.Var("x", relay.ty.TensorType(shape, "float32"))
-    t = relay.Var("t")
-
-    # We'll device copy after projection, not before.
-    # def @main(x) {
-    #   let t = split(x, 3);
-    #   subtract(on_cpu(t).0, on_cpu(t).1)
-    # }
+    # Note that the device copy should be placed after projection rather than before. This is handled by
+    # a heuristic in the pass.
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x],
-                relay.Let(
-                    t,
-                    relay.op.split(x, 3).astuple(),
-                    on_gpu(
-                        relay.subtract(
-                            relay.TupleGetItem(on_cpu(t), 0), relay.TupleGetItem(on_cpu(t), 1)
-                        )
-                    ),
-                ),
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(3, 3, 4), float32]) {
+              let %t = split(%x, indices_or_sections=3);
+              %0 = on_device(%t, device_type=1);
+              %1 = on_device(%t, device_type=1);
+              %2 = %0.0;
+              %3 = %1.1;
+              %4 = subtract(%2, %3);
+              on_device(%4, device_type=2)
+            }
+        """)

-    # def @main(x, on_device={param_device_type=[1], result_device_type=2}) {
-    #   let t = fixed_cpu(split(x, 3))
-    #   subtract(cpu_to_gpu(fixed_cpu(t.0)), cpu_to_gpu(fixed_cpu(t.1)))
-    # }
     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x],
-                    relay.Let(
-                        t,
-                        fixed_cpu(relay.op.split(x, 3).astuple()),
-                        relay.subtract(
-                            cpu_to_gpu(fixed_cpu(relay.TupleGetItem(t, 0))),
-                            cpu_to_gpu(fixed_cpu(relay.TupleGetItem(t, 1))),
-                        ),
-                    ),
-                ),
-                [CPU],
-                GPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(3, 3, 4), float32], param_device_types=[1], result_device_type=2) {
+              %0 = split(%x, indices_or_sections=3);
+              let %t = on_device(%0, device_type=1, is_fixed=True);
+              %1 = %t.0;
+              %2 = on_device(%1, device_type=1, is_fixed=True);
+              %3 = %t.1;
+              %4 = on_device(%3, device_type=1, is_fixed=True);
+              %5 = device_copy(%2, src_dev_type=1, dst_dev_type=2);
+              %6 = device_copy(%4, src_dev_type=1, dst_dev_type=2);
+              subtract(%5, %6)
+            }
+        """)

     def ref(x):
         t = np.split(x, 3)
         return np.subtract(t[0], t[1])

-    exercise(input(), expected(), ref, rands(shape, 1))
+    exercise(input(), expected(), ref, rands((3, 3, 4), 1))
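(The reference uses np.split, which mirrors Relay's `split(%x, indices_or_sections=3)` along
axis 0: a (3, 3, 4) input yields three (1, 3, 4) pieces. For example:

    import numpy as np

    x = np.arange(36, dtype="float32").reshape(3, 3, 4)
    t = np.split(x, 3)  # three (1, 3, 4) slices along axis 0
    assert np.subtract(t[0], t[1]).shape == (1, 3, 4)
)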
 def test_propogation():
-    R""" The network and devices are as follows:
+    r""" The network and devices are as follows:
                x           <--- CPU
                |
               log          <--- CPU
              /   \
           log2   log10     <--- GPU
              \   /
               add          <--- GPU
                |
               tan          <--- GPU
                |
                            <--- CPU
    """
@@ -1062,36 +938,52 @@ def test_propogation():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)

     def input():
-        log = relay.log(x)
-        log2 = relay.log2(on_cpu(log))
-        log10 = relay.log10(on_cpu(log))
-        add = relay.add(on_gpu(log2), on_gpu(log10))
-        tan = relay.tan(on_gpu(add))
-        return tvm.IRModule.from_expr(relay.Function([x], on_cpu(tan)))
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32]) {
+              %0 = log(%x);
+              %1 = on_device(%0, device_type=1);
+              %2 = log2(%1);
+              %3 = on_device(%0, device_type=1);
+              %4 = log10(%3);
+              %5 = on_device(%2, device_type=2);
+              %6 = on_device(%4, device_type=2);
+              %7 = add(%5, %6);
+              %8 = on_device(%7, device_type=2);
+              %9 = tan(%8);
+              on_device(%9, device_type=1)
+            }
+        """)

     def expected():
-        log = relay.log(x)
-        log2 = relay.log2(cpu_to_gpu(fixed_cpu(log)))
-        log10 = relay.log10(cpu_to_gpu(fixed_cpu(log)))
-        add = relay.add(log2, log10)
-        tan = relay.tan(gpu_to_cpu(fixed_gpu(add)))
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(relay.Function([x], tan), [CPU], CPU)
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], param_device_types=[1], result_device_type=1) {
+              %0 = log(%x);
+              %1 = on_device(%0, device_type=1, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2);
+              %3 = on_device(%0, device_type=1, is_fixed=True);
+              %4 = device_copy(%3, src_dev_type=1, dst_dev_type=2);
+              %5 = log2(%2);
+              %6 = log10(%4);
+              %7 = add(%5, %6);
+              %8 = on_device(%7, device_type=2, is_fixed=True);
+              %9 = device_copy(%8, src_dev_type=2, dst_dev_type=1);
+              tan(%9)
+            }
+        """)

     def ref(x):
         y = np.log(x)
         return np.tan(np.add(np.log2(y), np.log10(y)))

-    exercise(input(), expected(), ref, rands(shape, 1))
+    exercise(input(), expected(), ref, rands((5, 7), 1))


 def test_fusible_network():
-    R""" The network is as follows:
+    r""" The network is as follows:
           x    y       <--- GPU
           \   /
            add         <--- GPU
          /     \
     negative    \      <--- CPU
          \      \
           \   negative <--- GPU
            \   /
             add        <--- GPU
              |
           negative     <--- GPU
              |
                        <--- CPU
    """
@@ -1106,33 +998,46 @@ def test_fusible_network():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)

     def input():
-        add = relay.add(x, y)
-        sqrt = relay.negative(on_gpu(add))
-        log = relay.negative(add)
-        subtract = relay.add(on_cpu(sqrt), log)
-        exp = relay.negative(on_gpu(subtract))
-        return tvm.IRModule.from_expr(relay.Function([x, y], on_cpu(exp)))
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=2);
+              %2 = negative(%1);
+              %3 = on_device(%2, device_type=1);
+              %4 = negative(%0);
+              %5 = add(%3, %4);
+              %6 = on_device(%5, device_type=2);
+              %7 = negative(%6);
+              on_device(%7, device_type=1)
+            }
+        """)

     def expected():
-        add = relay.add(x, y)
-        sqrt = relay.negative(gpu_to_cpu(fixed_gpu(add)))
-        log = relay.negative(add)
-        subtract = relay.add(cpu_to_gpu(fixed_cpu(sqrt)), log)
-        exp = relay.negative(gpu_to_cpu(fixed_gpu(subtract)))
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(relay.Function([x, y], exp), [GPU, GPU], CPU)
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], param_device_types=[2, 2], result_device_type=1) {
+              %0 = add(%x, %y);
+              %1 = on_device(%0, device_type=2, is_fixed=True);
+              %2 = device_copy(%1, src_dev_type=2, dst_dev_type=1);
+              %3 = negative(%2);
+              %4 = on_device(%3, device_type=1, is_fixed=True);
+              %5 = device_copy(%4, src_dev_type=1, dst_dev_type=2);
+              %6 = negative(%0);
+              %7 = add(%5, %6);
+              %8 = on_device(%7, device_type=2, is_fixed=True);
+              %9 = device_copy(%8, src_dev_type=2, dst_dev_type=1);
+              negative(%9)
+            }
+        """)

     def ref(x, y):
         z = np.add(x, y)
         return np.negative(np.add(np.negative(z), np.negative(z)))

-    exercise(input(), expected(), ref, rands(shape, 2))
+    exercise(input(), expected(), ref, rands((5, 7), 2))
 def test_unpropagatable_graph():
@@ -1149,109 +1054,82 @@ def test_unpropagatable_graph():
     |              <--- CPU
     """
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    c = relay.var("c", shape=shape)
-    d = relay.var("d", shape=shape)

     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [a, b, c, d],
-                on_cpu(relay.subtract(on_cpu(relay.add(a, b)), on_gpu(relay.multiply(c, d)))),
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) {
+              %0 = add(%a, %b);
+              %1 = multiply(%c, %d);
+              %2 = on_device(%0, device_type=1);
+              %3 = on_device(%1, device_type=2);
+              %4 = subtract(%2, %3);
+              on_device(%4, device_type=1)
+            }
+        """)

     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [a, b, c, d],
-                    relay.subtract(relay.add(a, b), gpu_to_cpu(fixed_gpu(relay.multiply(c, d)))),
-                ),
-                [CPU, CPU, GPU, GPU],
-                CPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                      %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 2, 2], result_device_type=1) {
+              %0 = multiply(%c, %d);
+              %1 = on_device(%0, device_type=2, is_fixed=True);
+              %2 = add(%a, %b);
+              %3 = device_copy(%1, src_dev_type=2, dst_dev_type=1);
+              subtract(%2, %3)
+            }
+        """)

     def ref(a, b, c, d):
         return np.subtract(np.add(a, b), np.multiply(c, d))

-    exercise(input(), expected(), ref, rands(shape, 4))
+    exercise(input(), expected(), ref, rands((5, 7), 4))


 def test_conditional():
-    shape = (N, M)
-    x = relay.Var("x", relay.ty.scalar_type("bool"))
-    y = relay.var("y", shape=shape)
-    z = relay.var("z", shape=shape)
-    f = relay.Var("f")
-    g = relay.Var("g")
-    h = relay.Var("h")
-    a1 = relay.Var("a")
-    a2 = relay.Var("a")
-
-    # def @main(x, y, z) {
-    #   let f = fn(a) { add(a, fixed_cpu(y)) }
-    #   let g = fn(a) { subtract(a, y) }
-    #   let h = if (x) {
-    #     f
-    #   } else {
-    #     g
-    #   }
-    #   h(z)
-    # }
+    # The conditional is over a function type, thus exercising the first-order/higher-order
+    # domain handling.
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x, y, z],
-                relay.Let(
-                    f,
-                    relay.Function([a1], relay.add(a1, fixed_cpu(y))),
-                    relay.Let(
-                        g,
-                        relay.Function([a2], relay.subtract(a2, y)),
-                        relay.Let(h, relay.If(x, f, g), relay.Call(h, [z])),
-                    ),
-                ),
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) {
+              let %f = fn (%a) {
+                %0 = on_device(%y, device_type=1, is_fixed=True);
+                add(%a, %0)
+              };
+              let %g = fn (%a1) {
+                subtract(%a1, %y)
+              };
+              let %h = if (%x) {
+                %f
+              } else {
+                %g
+              };
+              %h(%z)
+            }
+        """)

-    # def @main(x, y, z, on_device={param_device_types=[1,1,1], result_device_type=1}) {
-    #   let f = fn(a, on_device={param_device_types=[1], result_device_type=1}) { add(a, y) }
-    #   let g = fn(a, on_device={param_device_types=[1], result_device_type=1}) { subtract(a, y) }
-    #   let h = if (x) {
-    #     f
-    #   } else {
-    #     g
-    #   }
-    #   h(z)
-    # }
     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y, z],
-                    relay.Let(
-                        f,
-                        relay.annotation.function_on_device(
-                            relay.Function([a1], relay.add(a1, y)), [CPU], CPU
-                        ),
-                        relay.Let(
-                            g,
-                            relay.annotation.function_on_device(
-                                relay.Function([a2], relay.subtract(a2, y)), [CPU], CPU
-                            ),
-                            relay.Let(h, relay.If(x, f, g), relay.Call(h, [z])),
-                        ),
-                    ),
-                ),
-                [CPU, CPU, CPU],
-                CPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32],
+                      param_device_types=[1, 1, 1], result_device_type=1) {
+              let %f = fn (%a, param_device_types=[1], result_device_type=1) {
+                add(%a, %y)
+              };
+              let %g = fn (%a1, param_device_types=[1], result_device_type=1) {
+                subtract(%a1, %y)
+              };
+              let %h = if (%x) {
+                %f
+              } else {
+                %g
+              };
+              %h(%z)
+            }
+        """)

     def ref(x, y, z):
         def f(a):
@@ -1263,47 +1141,37 @@ def g(a):
         h = f if x else g
         return h(z)

-    exercise(input(), expected(), ref, [True, rand(shape), rand(shape)])
+    exercise(input(), expected(), ref, [True, rand((5, 7)), rand((5, 7))])
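(As the conditional test shows, local function expressions carry their own
`param_device_types`/`result_device_type` annotations, and after parsing these should be ordinary
function attributes. A sketch of inspecting them, assuming `mod` is one of the expected modules
above (illustrative; attribute access details may differ):

    # Planned devices round-trip as plain function attributes.
    main = mod["main"]
    print(main.attrs["param_device_types"])  # e.g. [1, 1, 1]
    print(main.attrs["result_device_type"])  # e.g. 1
)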
 def test_global():
-    shape = (N, M)
-    a = relay.var("a", shape=shape)
-    b = relay.var("b", shape=shape)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    f = relay.GlobalVar("f")
-    main = relay.GlobalVar("main")
-
-    # def @f(a, b) { add(a, on_cpu(b)) }
-    # def @main(x, y) { @f(y, x) }
     def input():
-        mod = tvm.IRModule()
-        mod[f] = relay.Function(
-            [a, b], relay.add(a, on_cpu(b)), relay.ty.TensorType(shape, "float32")
-        )
-        mod[main] = relay.Function(
-            [x, y], relay.Call(f, [y, x]), relay.ty.TensorType(shape, "float32")
-        )
-        return mod
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @f(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+              %0 = on_device(%b, device_type=1);
+              add(%a, %0)
+            }
+
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] {
+              @f(%y, %x)
+            }
+        """)

-    # def @f(a, b, on_device={param_device_types=[2,1], result_device_type=2}) { add(a, on_cpu(b)) }
-    # def @main(x, y, on_device={param_device_types=[1,2], result_device_type=2}) { @f(y, x) }
     def expected():
-        mod = tvm.IRModule()
-        mod[f] = relay.annotation.function_on_device(
-            relay.Function(
-                [a, b], relay.add(a, cpu_to_gpu(b)), relay.ty.TensorType(shape, "float32")
-            ),
-            [GPU, CPU],
-            GPU,
-        )
-        mod[main] = relay.annotation.function_on_device(
-            relay.Function([x, y], relay.Call(f, [y, x]), relay.ty.TensorType(shape, "float32")),
-            [CPU, GPU],
-            GPU,
-        )
-        return mod
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @f(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32],
+                   param_device_types=[2, 1], result_device_type=2) -> Tensor[(5, 7), float32] {
+              %0 = device_copy(%b, src_dev_type=1, dst_dev_type=2);
+              add(%a, %0)
+            }
+
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32],
+                      param_device_types=[1, 2], result_device_type=2) -> Tensor[(5, 7), float32] {
+              @f(%y, %x)
+            }
+        """)

     def ref(x, y):
         def f(a, b):
@@ -1311,69 +1179,88 @@ def f(a, b):
         return f(x, y)

-    exercise(input(), expected(), ref, rands(shape, 2))
-
-
-# Note that match and ADTs don't appear to be supported for direct AST
-# construction.
+    exercise(input(), expected(), ref, rands((5, 7), 2))


 def test_ref():
-    shape = (N, M)
-    x = relay.var("x", shape=shape)
-    y = relay.var("y", shape=shape)
-    r = relay.var("r")
-    dummy = relay.var("dummy")
-
-    # def @main(x, y) {
-    #   r = ref(x)
-    #   ref_write(r, on_cpu(y))
-    #   add(x, ref_read(r))
-    # }
     def input():
-        return tvm.IRModule.from_expr(
-            relay.Function(
-                [x, y],
-                relay.Let(
-                    r,
-                    relay.RefCreate(x),
-                    relay.Let(dummy, relay.RefWrite(r, on_cpu(y)), relay.add(x, relay.RefRead(r))),
-                ),
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) {
+              let %r = ref(%x);
+              %0 = on_device(%y, device_type=1);
+              ref_write(%r, %0);
+              %1 = ref_read(%r);
+              add(%x, %1)
+            }
+        """)

-    # def @main(x, y, on_device={param_device_types=[GPU, CPU], result_device_type=GPU}) {
-    #   r = ref(x)
-    #   ref_write(r, cpu_to_gpu(y))
-    #   add(x, ref_read(r))
-    # }
     def expected():
-        return tvm.IRModule.from_expr(
-            relay.annotation.function_on_device(
-                relay.Function(
-                    [x, y],
-                    relay.Let(
-                        r,
-                        relay.RefCreate(x),
-                        relay.Let(
-                            dummy, relay.RefWrite(r, cpu_to_gpu(y)), relay.add(x, relay.RefRead(r))
-                        ),
-                    ),
-                ),
-                [GPU, CPU],
-                GPU,
-            )
-        )
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32],
+                      param_device_types=[2, 1], result_device_type=2) {
+              let %r = ref(%x);
+              %0 = device_copy(%y, src_dev_type=1, dst_dev_type=2);
+              ref_write(%r, %0);
+              %1 = ref_read(%r);
+              add(%x, %1)
+            }
+        """)

     def ref(x, y):
         r = {"value": x}
         r["value"] = y
         return np.add(x, r["value"])

-    # Don't try to execute, no backend currently supports both cross-devices and references.
+    # Don't try to execute, no backend currently supports both heterogeneous devices and references.
     exercise(input(), expected(), None, None)


+def test_adt():
+    def input():
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            type List[A] {
+              Cons(A, List[A]),
+              Nil,
+            }
+            def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32]) {
+              %0 = on_device(%y, device_type=1, is_fixed=True);
+              %1 = Nil;
+              %2 = Cons(%0, %1);
+              let %l = Cons(%x, %2);
+              match? (%l) {
+                Cons(%z, _) => %z
+              }
+            }
+        """)
+
+    def expected():
+        return tvm.parser.fromtext("""
+            #[version = "0.0.5"]
+            type List[A] {
+              Cons(A, List[A]),
+              Nil,
+            }
+            def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32],
+                      param_device_types=[1, 1], result_device_type=1) {
+              %0 = Nil;
+              %1 = Cons(%y, %0);
+              let %l = Cons(%x, %1);
+              match? (%l) {
+                Cons(%z, _) => %z
+              }
+            }
+        """)
+
+    def ref(x, y):
+        l = [x, y]
+        return l[0]
+
+    exercise(input(), expected(), ref, rands((5, 7), 2))
+
+
 if __name__ == "__main__":
     test_plain()
     test_left_add_on_cpu()
@@ -1403,3 +1290,4 @@ def ref(x, y):
     test_conditional()
     test_global()
     test_ref()
+    test_adt()