diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4fa87539c..ba119097d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -11,7 +11,6 @@ test: &test
         key: v1.03-libhcl-
     - run: make build-python
     - run: pip install --user pytest 
-    - run: pip install --user future 
     - run: python -m pytest tests
     - run: pip install --user mxnet
     - run: python -m pytest samples
diff --git a/.gitignore b/.gitignore
index 65f3dfcf8..a70651d15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,8 @@ tags
 docs/source/samples
 docs/source/tutorials
 soda_*
+*.cpp
+*.h
 out
 
 # Downloaded files
diff --git a/HISTORY b/HISTORY
deleted file mode 100644
index e08d564bc..000000000
--- a/HISTORY
+++ /dev/null
@@ -1,11 +0,0 @@
-### 2019-12-09 
-  * fixed issue of zc706 simulation 
-    * remove kernel-name variable allocation before KernelDef
-    * change multi-dimension array access to row-major single-dimension access
-    * create local buffer for each on-device variable
-    * updated the `KernelUpdater` class (using position index instead of name)
-    * added `stream_arg_pos` map in `CodeGenC` to facilitate codegen with streaming
-  * fixed test cases 
-    * changed tvm `build` function to support legacy string type target 
-    * fixed opencl aocl data type mismatching issue
-    * fixed kernel def data type conversion issue
diff --git a/Makefile b/Makefile
index 9508b9171..88c653d77 100644
--- a/Makefile
+++ b/Makefile
@@ -12,15 +12,15 @@ build-tvm: build-pkgs
 
 build-hcl: build-tvm
 	cd python; \
-	python setup.py develop --user; \
+	python setup.py install --user; \
   cd ../hlib/python; \
-	python setup.py develop --user;
+	python setup.py install --user;
 
 build-python:
 	cd python; \
-	python setup.py develop --user; \
+	python setup.py install --user; \
   cd ../hlib/python; \
-	python setup.py develop --user;
+	python setup.py install --user;
 
 clean:
 	rm -rf build
diff --git a/Makefile.config b/Makefile.config
index 60d1cfd3e..2060d201c 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -12,9 +12,6 @@ CMAKE_OK = no
 # set whether to use vivado hls runtime
 USE_VIVADO_HLS = 1
 
-# set whether to use sdaccel opencl runtime
-USE_SDACCEL_HLS = 1
-
 # Specify current directory level with respect to CLAY_ROOT
 ifndef LEVEL
 	LEVEL := .
diff --git a/hlib/python/hlib/nn.py b/hlib/python/hlib/nn.py
index 8f1c4d0e8..c8fa146a8 100644
--- a/hlib/python/hlib/nn.py
+++ b/hlib/python/hlib/nn.py
@@ -32,17 +32,6 @@ def _pad(*indices):
         return data[tuple(index_tuple)]
     return hcl.compute(out_shape, _pad, name='pad')
 
-def conv2d_nchw_imp(Input, Filter, Output, stride=[1,1], padding=[[0,0],[0,0]]):
-    with hcl.for_(0,Output.shape[0]) as n:
-      with hcl.for_(0,Output.shape[1]) as c:
-        with hcl.for_(0,Output.shape[2]) as h:
-          with hcl.for_(0,Output.shape[3]) as w:
-            partial = hcl.scalar(0)
-            with hcl.for_(0,Filter.shape[-2]) as x:
-              with hcl.for_(0,Filter.shape[-1]) as y:
-                partial.v += Input[n][c][h+x][w+y] * Filter[0][0][x][y] 
-            Output[n,c,h,w] = partial
-
 def conv2d_nchw(Input, Filter, name="conv2d", stride=[1,1], padding=[[0,0],[0,0]]):
     out_dtype = Input.dtype
     batch, in_channel, in_height, in_width = Input.shape
diff --git a/hlib/rocc-ppac b/hlib/rocc-ppac
deleted file mode 160000
index 40d323d0c..000000000
--- a/hlib/rocc-ppac
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 40d323d0c81e2f64dbfb63afb5eb5d6ccf7c5e48
diff --git a/python/heterocl/__init__.py b/python/heterocl/__init__.py
index 4b90160f0..588196177 100644
--- a/python/heterocl/__init__.py
+++ b/python/heterocl/__init__.py
@@ -3,7 +3,6 @@
 from .compute_api import *
 from .dsl import *
 from .types import *
-from .devices import *
 from .nparray import *
 from .debug import hcl_excepthook
 from .tvm.intrin import *
diff --git a/python/heterocl/api.py b/python/heterocl/api.py
index f3e2151c8..4da52786f 100644
--- a/python/heterocl/api.py
+++ b/python/heterocl/api.py
@@ -53,7 +53,7 @@ def app2(A, B, C):
         # execute f2
     """
     # set the configurations
-    config.init_dtype  = init_dtype
+    config.init_dtype = init_dtype
     # initialize global variables
     Schedule.stage_ops = []
     Schedule.last_stages = OrderedSet([])
@@ -90,7 +90,7 @@ def placeholder(shape, name=None, dtype=None):
     """
     name = util.get_name("placeholder", name)
     dtype = util.get_dtype(dtype)
-    
+
     if shape == ():
         return Scalar(tvm_api._Var(name, dtype))
     tensor = Tensor(shape, dtype, name)
diff --git a/python/heterocl/debug.py b/python/heterocl/debug.py
index a885d2e0b..cba313e23 100644
--- a/python/heterocl/debug.py
+++ b/python/heterocl/debug.py
@@ -45,11 +45,6 @@ class TensorError(HCLError):
     def __init__(self, msg):
         HCLError.__init__(self, msg, "\33[1;31m[Tensor]\33[0m ")
 
-class DeviceError(HCLError):
-    """A subclass for specifying device related exception"""
-    def __init__(self, msg):
-        HCLError.__init__(self, msg, "\33[1;31m[Device]\33[0m ")
-
 def hcl_excepthook(etype, value, tb):
     """Customized excepthook
 
diff --git a/python/heterocl/devices.py b/python/heterocl/devices.py
deleted file mode 100644
index a5d81df86..000000000
--- a/python/heterocl/devices.py
+++ /dev/null
@@ -1,278 +0,0 @@
-"""Define HeteroCL device types"""
-#pylint: disable=too-few-public-methods, too-many-return-statements
-from .debug import DeviceError
-from .tools import option_table, model_table
-from future.utils import with_metaclass
-
-class tooling(type):
-    def __getattr__(cls, key):
-        if key in option_table:
-           return cls(key, *option_table[key])
-        else: # unsupported device
-           raise DeviceError("not supported")
-
-class tool(with_metaclass(tooling, object)):
-    """The base class for all device tooling
-
-    mode (sim/impl) is decided by tool configuration
-    e.g. run sw emulation by passing gcc / vivado_hls arg
-    and actual impl by passing sdaccel / aocl arg 
-
-    Parameters
-    ----------
-    types: str
-        Device of device to place data
-    model: str
-        Model of device to place date
-    """
-    def __init__(self, name, mode, kwargs):
-        self.name = name
-        self.mode = mode
-        self.options = kwargs
-
-    def __getattr__(self, entry):
-        return self.mapping[entry] 
-
-    def __call__(self, mode, setting={}):
-        self.mode = mode
-        self.options = setting
-        return self
-
-    def __str__(self):
-        return str(self.name) + "-" + \
-               str(self.mode) + ":\n" + \
-               str(self.options)
-
-    def __repr__(self):
-        return str(self.name) + "-" + \
-               str(self.mode) + ":\n" + \
-               str(self.options)
-
-tool_table = {
-  "aws_f1"      : tool("sdaccel", *option_table["sdaccel"]),
-  "zc706"       : tool("vivado_hls", *option_table["vivado_hls"]),
-  "ppac"        : tool("rocket", *option_table["rocket"]),
-  "stratix10_sx": tool("aocl", *option_table["aocl"]),
-  "llvm"        : tool("llvm", *option_table["llvm"])
-}
-
-class Device(object):
-    """The base class for all device types
-
-    The default data placement is on CPU.
-
-    Parameters
-    ----------
-    types: str
-        Device of device to place data
-    model: str
-        Model of device to place date
-    """
-    def __init__(self, types, vendor, 
-                 model, **kwargs):
-        self.vendor = vendor
-        self.types = types
-        self.model = model
-        self.impls = {"lang": ""}
-        for key, value in kwargs.items(): 
-            self.impls[key] = value
-
-    def __getattr__(self, key):
-        """ device hierarchy """
-        return self.impls[key] 
-
-    def set_lang(self, lang):
-        assert lang in \
-            ["opencl", "hlsc", "c", "opengl", "merlinc", "cuda", "metal"], \
-            "unsupported lang sepc " + lang
-        self.impls["lang"] = lang
-        return self
-
-class CPU(Device):
-    """cpu device with different models"""
-    def __init__(self, vendor, model, **kwargs):
-        if vendor not in ["riscv", "arm", "intel", "sparc", "powerpc"]: 
-            raise DeviceError(vendor + " not supported yet")
-        assert "cpu_" + model in model_table[vendor], \
-            model + " not supported yet"
-        super(CPU, self).__init__("CPU", vendor, model, **kwargs)
-    def __repr__(self):
-        return "cpu-" + self.vendor + "-" + str(self.model) + \
-               ":" + self.impls["lang"]
-
-class FPGA(Device):
-    """fpga device with different models"""
-    def __init__(self, vendor, model, **kwargs):
-        if vendor not in ["xilinx", "intel"]: 
-            raise DeviceError(vendor + " not supported yet")
-        assert "fpga_" + model in model_table[vendor], \
-            model + " not supported yet"
-        super(FPGA, self).__init__("FPGA", vendor, model, **kwargs)
-    def __repr__(self):
-        return "fpga-" + self.vendor + "-" + str(self.model) + \
-               ":" + self.impls["lang"]
-
-class GPU(Device):
-    """gpu device with different models"""
-    def __init__(self, vendor, model, **kwargs):
-        if vendor not in ["nvidia", "amd"]: 
-            raise DeviceError(vendor + " not supported yet")
-        assert "gpu_" + model in model_table[vendor], \
-            model + " not supported yet"
-        super(GPU, self).__init__("GPU", vendor, model, **kwargs)
-    def __repr__(self):
-        return "gpu-" + self.vendor + "-" + str(self.model) + \
-               ":" + self.impls["lang"]
-
-class PIM(Device):
-    """cpu device with different models"""
-    def __init__(self, vendor, model, **kwargs):
-        if model not in ["ppac"]: 
-            raise DeviceError(model + " not supported yet")
-        super(PIM, self).__init__("PIM", vendor, model, **kwargs)
-    def __repr__(self):
-        return "pim-" + str(self.model)
-
-dev_table = {
-  "aws_f1" : [CPU("intel", "e5"), FPGA("xilinx", "xcvu19p")],
-  "zc706" : [CPU("arm", "a9"), FPGA("xilinx", "xc7z045")],
-  "rocc-ppac" : [CPU("riscv", "riscv"), PIM("ppac", "ppac")],
-  "stratix10_sx": [CPU("arm", "a53"), FPGA("intel", "stratix10_gx")]
-}
-
-class env(type):
-    """The platform class for compute environment setups
-    
-     serves as meta-class for attr getting
-     default platform: aws_f1, zynq, ppac
-
-    Parameters
-    ----------
-    host: str
-        Device of device to place data
-    model: str
-        Model of device to place date
-    """
-    def __getattr__(cls, key):
-        if key == "aws_f1":
-            devs = dev_table[key]
-            host = devs[0].set_lang("opencl")
-            xcel = devs[1].set_lang("hlsc")
-        elif key == "zc706":
-            devs = dev_table[key]
-            host = devs[0].set_lang("hlsc")
-            xcel = devs[1].set_lang("hlsc")
-        elif key == "llvm":
-            devs = None 
-            host = None 
-            xcel = None 
-        elif key == "ppac":
-            devs = dev_table["rocc-ppac"]
-            host = devs[0].set_lang("c")
-            xcel = None 
-        else: # unsupported device
-            raise DeviceError("not supported")
-        tool = tool_table[key]
-        return cls(key, devs, host, xcel, tool)
-           
-class platform(with_metaclass(env, object)):
-    def __init__(self, name, devs, host, xcel, tool):
-        self.name = name
-        self.devs = devs
-        self.host = host
-        self.xcel = xcel
-        self.tool = tool
-
-        if isinstance(host, CPU):
-            self.cpu = host
-        if isinstance(xcel, FPGA):
-            self.fpga = xcel
-        elif isinstance(xcel, PIM) and \
-             xcel.model == "ppac":
-            self.ppac = xcel
-
-    def __getattr__(self, key):
-        """ return tool options """
-        return self.tool.__getattr__(key)
-   
-    def __call__(self, tooling=None):
-        if tooling: # check and update
-            assert isinstance(tooling, tool)
-            self.tool = tooling
-        return self
-
-    def __str__(self):
-        return str(self.name) + "(" + \
-               str(self.host) + " : " + \
-               str(self.xcel) + ")"
-
-    def __repr__(self):
-        return str(self.name) + "(" + \
-               str(self.host) + " : " + \
-               str(self.xcel) + ")"
-
-def device_to_str(dtype):
-    """Convert a device type to string format.
-
-    Parameters
-    ----------
-    dtype : Device or str
-        The device type to be converted
-
-    Returns
-    -------
-    str
-        The converted device type in string format.
-    """
-    if isinstance(dtype, Device):
-        if isinstance(dtype, CPU):
-            return "cpu_" + str(dtype.model)
-        elif isinstance(dtype, FPGA):
-            return "fpga_" + str(dtype.model)
-    else:
-        if not isinstance(dtype, str):
-            raise DeviceError("Unsupported device type format")
-        return dtype
-
-def device_to_hcl(dtype):
-    """Convert a device type to Heterocl type.
-
-    Parameters
-    ----------
-    dtype : Device or str
-        The device type to be converted
-
-    Returns
-    -------
-    Device
-    """
-    if isinstance(dtype, Device):
-        return dtype
-    elif isinstance(dtype, str):
-        device, model = dtype.split("_") 
-        if device == "cpu":
-            return CPU(model)
-        elif device == "gpu":
-            return GPU(model)
-        elif device == "fpga":
-            return FPGA(model)
-        else:
-            raise DeviceError("Unrecognized device type")
-    else:
-        raise DeviceError("Unrecognized device type format")
-
-def get_model(dtype):
-    """Get the model of a given device type.
-
-    Parameters
-    ----------
-    dtype : Device or str
-        The given device type
-
-    Returns
-    -------
-    str
-    """
-    dtype = dtype_to_hcl(dtype)
-    return dtype.types, dtype.model
-
diff --git a/python/heterocl/dsl.py b/python/heterocl/dsl.py
index b226cb0ab..6d42031f1 100644
--- a/python/heterocl/dsl.py
+++ b/python/heterocl/dsl.py
@@ -405,7 +405,6 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n
                     raise APIError("The number of data types does not match the of arguments")
                 for (name_, dtype_) in zip(new_names, dtypes):
                     dtypes.append(util.get_dtype(dtype_, name_))
-                dtypes = dtypes[int(len(dtypes)/2):]
             else:
                 dtype = util.get_dtype(dtypes)
                 dtypes = []
@@ -415,20 +414,15 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n
             # prepare inputs for IR generation
             inputs = []
             inputs_tvm = []
-            arg_shapes, arg_dtypes = [], []
             for shape, name_, dtype in zip(shapes, new_names, dtypes):
                 if shape == ():
                     var_ = placeholder((), name_, dtype)
                     inputs.append(var_)
                     inputs_tvm.append(var_.var)
-                    arg_shapes.append([1])
-                    arg_dtypes.append(dtype)
-                else: # tensor inputs (new bufs)
+                else:
                     placeholder_ = placeholder(shape, name_, dtype)
                     inputs.append(placeholder_)
                     inputs_tvm.append(placeholder_.buf.data)
-                    arg_shapes.append(list(shape))
-                    arg_dtypes.append(dtype)
 
             s.ret_dtype = ret_dtype
             fmodule(*inputs)
@@ -441,8 +435,7 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n
             ret_void = _make.UIntImm("uint1", 0) if s.has_return else _make.UIntImm("uint1", 1)
             body = s.pop_stmt()
             s.stmt_stack.append([])
-            s.emit(_make.KernelDef(inputs_tvm, arg_shapes, arg_dtypes, 
-                                   body, ret_void, ret_dtype, name, []))
+            s.emit(_make.KernelDef(inputs_tvm, body, ret_void, ret_dtype, name))
             for name_, i in zip(names, inputs):
                 s.var_dict[name_] = i
             s.input_stages.clear()
diff --git a/python/heterocl/mutator.py b/python/heterocl/mutator.py
index 7d49f1e76..88ca42788 100644
--- a/python/heterocl/mutator.py
+++ b/python/heterocl/mutator.py
@@ -77,8 +77,6 @@ def mutate(self, node):
                     return self.mutate_SetSlice(node)
                 elif isinstance(node, _expr.KernelExpr):
                     return self.mutate_KernelExpr(node)
-                elif isinstance(node, _expr.StreamExpr):
-                    return self.mutate_StreamExpr(node)
                 else:
                     return node
         elif isinstance(node, _stmt.Stmt):
@@ -114,8 +112,6 @@ def mutate(self, node):
                 return self.mutate_Break(node)
             elif isinstance(node, _stmt.While):
                 return self.mutate_While(node)
-            elif isinstance(node, _stmt.StreamStmt):
-                return self.mutate_StreamStmt(node)
             else:
                 return node
         elif isinstance(node, tuple):
@@ -252,10 +248,6 @@ def mutate_KernelExpr(self, node):
         args = self.mutate(node.args)
         return _make.KernelExpr(node.dtype, args, node.name)
 
-    def mutate_StreamExpr(self, node):
-        args = self.mutate(node.args)
-        return _make.StreamExpr(node.dtype, args, node.name)
-
     # statements
     def mutate_LetStmt(self, node):
         var = self.mutate(node.var)
@@ -328,10 +320,6 @@ def mutate_KernelStmt(self, node):
         args = self.mutate(node.args)
         return _make.KernelStmt(args, node.name)
 
-    def mutate_StreamStmt(self, node):
-        args = self.mutate(node.args)
-        return _make.StreamStmt(node.dtype, args, node.name)
-
     def mutate_Return(self, node):
         value = self.mutate(node.value)
         return _make.Return(value)
diff --git a/python/heterocl/schedule.py b/python/heterocl/schedule.py
index 03af1cf3e..abd74acdc 100644
--- a/python/heterocl/schedule.py
+++ b/python/heterocl/schedule.py
@@ -5,7 +5,6 @@
 from ordered_set import OrderedSet
 from .tvm import make as _make
 from .tvm import stmt as _stmt
-from .tvm import expr as _expr
 from .tvm import api as tvm_api
 from .tvm import _api_internal
 from .tvm._api_internal import _ExternOp
@@ -135,42 +134,6 @@ def reuse_at(self, target, parent, axis, name=None):
             name = target.name + ".reuse"
         return self.sch.reuse_at(target, parent, axis, name)
 
-    def to(self, tensors, dst, src=None,
-           stream_type=_expr.StreamExpr.Channel, depth=10, name=None):
-        """Stream a list of Tensors to dst devices 
-        
-        Parameters
-        ----------
-        tensors : list of Tensor
-            The tensors to be moved
-
-        dst : device or module 
-            The tensors to be moved
-
-        stream_type : {FIFO, Channel, Burst}, optional
-            The stream type
-        """
-        if stream_type > 2:
-            raise APIError("Invalid channel type")
-        rets = []
-        if not isinstance(tensors, list):
-            tensors = [tensors]
-        for tensor in tensors: 
-            try:
-                target = tensor.tensor
-            except (AttributeError, ValueError):
-                try:
-                    target = tensor._op
-                except AttributeError:
-                    target = tensor
-            if name is None:
-                name = target.name + ".stream"
-            ret = self.sch.to(target, dst, src, 
-                              stream_type, depth, name)
-            name = None
-            rets.append(ret)
-        return rets
-
     def partition(self, target, partition_type=_stmt.Partition.Complete, dim=0, factor=0):
         """Partition a Tensor into smaller Tensors or even registers
 
@@ -339,7 +302,7 @@ def __exit__(self, ptype, value, trace):
         # create the output operation
         input_ops = [i._op for i in self.input_stages]
         input_bufs = [i._buf for i in self.input_stages]
-        output_bufs = [self._buf] 
+        output_bufs = [self._buf]
         body = self.pop_stmt()
         Stage._current.pop()
         op = _ExternOp(self.name, "", self.axis_list, input_ops,
@@ -368,7 +331,8 @@ def __exit__(self, ptype, value, trace):
             superstage.var_dict[self.name] = self
             # update prefix
             self.name_with_prefix = superstage.name_with_prefix + "." + self.name
-        else: # otherwise update the list of stages globally
+        # Otherwise update the list of stages globally
+        else:
             Schedule.stage_ops.append(self)
             Schedule.last_stages.add(self)
             Schedule.last_stages -= self.input_stages
diff --git a/python/heterocl/tools.py b/python/heterocl/tools.py
deleted file mode 100644
index bf47753fa..000000000
--- a/python/heterocl/tools.py
+++ /dev/null
@@ -1,108 +0,0 @@
-"""Define HeteroCL default tool settings"""
-#pylint: disable=too-few-public-methods, too-many-return-statements
-
-model_table = {
-  "xilinx" : ["fpga_xc7z045", "fpga_xcvu19p"],
-  "intel"  : ["cpu_e5", "cpu_i7", "fpga_stratix10_gx", 
-              "fpga_stratix10_dx", "fpga_stratix10_mx"],
-  "arm"    : ["cpu_a7", "cpu_a9", "cpu_a53"],
-  "riscv"  : ["cpu_riscv"]
-}
-
-option_table = {
-  "llvm"    : ("llvm_sim", {"version" : "6.0.0"}),
-  "sdaccel" : ("sw_emu", {"version" : "2017.1", "clock" : "1"}),
-  "vivado_hls" : ("csim", {"version" : "2017.1"}),
-  "rocket"     : ("source", {"RISCV" : ""}),
-
-  # refer to xilinx2016_1/ug904-vivado-implementation.pdf
-  "vivado"     : ("pnr",
-    {"version" : "2017.1",
-     "logic" : ["Default", "Explore", "ExploreSequentialArea", "AddRemap", "ExploreArea"],
-     "placement" : ["Default", "Explore", "ExtraNetDelay_high", "ExtraNetDelay_medium", "ExtraNetDelay_low", "ExtraPostPlacementOpt", "WLDrivenBlockPlacement", "LateBlockPlacement", "AltSpreadLogic_low", "AltSpreadLogic_medium", "AltSpreadLogic_high"],
-     "routing" : ["Default", "Explore", "HigherDelayCost"],
-     "fanout_opt" : ["on", "off"],
-     "placement_opt" : ["on", "off"],
-     "critical_cell_opt" : ["on", "off"],
-     "critical_pin_opt" : ["on", "off"],
-     "retime" : ["on", "off"],
-     "rewire" : ["on", "off"],
-    }),
-
-  "quartus"    : ("pnr", 
-    {"version" : "17.1",
-    "auto_dsp_recognition" : ['On', 'Off'],
-    "disable_register_merging_across_hierarchies": ['On', 'Off', 'Auto'],
-    "mux_restructure" : ['On', 'Off', 'Auto'],
-    "optimization_technique" : ['Area', 'Speed', 'Balanced'],
-    "synthesis_effort" : ['Auto', 'Fast'],
-    "synth_timing_driven_synthesis" : ['On', 'Off'],
-    "fitter_aggressive_routability_optimization" : ['Always', 'Automatically', 'Never'],
-    "fitter_effort" : ['Standard Fit', 'Auto Fit'],
-    "remove_duplicate_registers" : ['On', 'Off'],
-    "physical_synthesis" : ['On', 'Off'],
-    "adv_netlist_opt_synth_wysiwyg_remap" : ['On', 'Off'],
-    "allow_any_ram_size_for_recognition" : ['On', 'Off'],
-    "allow_any_rom_size_for_recognition" : ['On', 'Off'],
-    "allow_any_shift_register_size_for_recognition" : ['On', 'Off'],
-    "allow_power_up_dont_care" : ['On', 'Off'],
-    "allow_shift_register_merging_across_hierarchies" : ["Always", "Auto", "Off"],
-    "allow_synch_ctrl_usage" : ['On', 'Off'],
-    "auto_carry_chains" : ['On', 'Off'],
-    "auto_clock_enable_recognition" : ['On', 'Off'],
-    "auto_dsp_recognition" : ['On', 'Off'],
-    "auto_enable_smart_compile" : ['On', 'Off'],
-    "auto_open_drain_pins" : ['On', 'Off'],
-    "auto_ram_recognition" : ['On', 'Off'],
-    "auto_resource_sharing" : ['On', 'Off'],
-    "auto_rom_recognition" : ['On', 'Off'],
-    "auto_shift_register_recognition" : ["Always", "Auto", "Off"],
-    "disable_register_merging_across_hierarchies" : ["Auto", "On", "Off"],
-    "enable_state_machine_inference" : ['On', 'Off'],
-    "force_synch_clear" : ['On', 'Off'],
-    "ignore_carry_buffers" : ['On', 'Off'],
-    "ignore_cascade_buffers" : ['On', 'Off'],
-    "ignore_max_fanout_assignments" : ['On', 'Off'],
-    "infer_rams_from_raw_logic" : ['On', 'Off'],
-    "mux_restructure" : ["Auto", "On", "Off"],
-    "optimization_technique" : ["Area", "Balanced", "Speed"],
-    "optimize_power_during_synthesis" : ["Extra effort", "Normal compilation", "Off"],
-    "remove_duplicate_registers" : ['On', 'Off'],
-    "shift_register_recognition_aclr_signal" : ['On', 'Off'],
-    "state_machine_processing" : 
-        ["Auto", "Gray", "Johnson, Minimal Bits", "One-Hot", "Sequential", "User-Encoded"],
-    "strict_ram_recognition" : ['On', 'Off'],
-    "synthesis_effort" : ["Auto", "Fast"],
-    "synthesis_keep_synch_clear_preset_behavior_in_unmapper" : ['On', 'Off'],
-    "synth_resource_aware_inference_for_block_ram" : ['On', 'Off'],
-    "synth_timing_driven_synthesis" : ['On', 'Off'],
-    "alm_register_packing_effort" : ["High", "Low", "Medium"],
-    "auto_delay_chains" : ['On', 'Off'],
-    "auto_delay_chains_for_high_fanout_input_pins" : ["On", "Off"],
-    "eco_optimize_timing" : ["On", "Off"],
-    "final_placement_optimization" : ["Always", "Automatically", "Never"],
-    "fitter_aggressive_routability_optimization" : ["Always", "Automatically", "Never"],
-    "fitter_effort" : ["Standard Fit", "Auto Fit"],
-    "optimize_for_metastability" : ["On", "Off"],
-    "optimize_hold_timing" : ["All Paths", "IO Paths and Minimum TPD Paths", "Off"],
-    "optimize_ioc_register_placement_for_timing" : 
-        ["Normal", "Off", "Pack All IO Registers"],
-    "optimize_multi_corner_timing" : ['On', 'Off'],
-    "optimize_power_during_fitting" : ["Extra effort", "Normal compilation", "Off"],
-    "physical_synthesis" : ['On', 'Off'],
-    "placement_effort_multiplier" : [0.2, 0.5, 1.0, 2.0, 3.0, 4.0],
-    "programmable_power_technology_setting" : ["Automatic", "Force All Tiles with Failing Timing Paths to High Speed", "Force All Used Tiles to High Speed", "Minimize Power Only"],
-    "qii_auto_packed_registers" : ["Auto", "Minimize Area", "Minimize Area with Chains", "Normal", "Off", "Sparse", "Sparse Auto"],
-    "router_clocking_topology_analysis" : ['On', 'Off'],
-    "router_lcell_insertion_and_logic_duplication" : ["Auto", "On", "Off"],
-    "router_register_duplication" : ["Auto", "On", "Off"],
-    "router_timing_optimization_level" : ["MINIMUM", "Normal", "MAXIMUM"],
-    "seed" : (1, 5),
-    "tdc_aggressive_hold_closure_effort" : ['On', 'Off'],
-    "allow_register_retiming" : ['On', 'Off']}),
-
-  "aocl" : ("emu", {"version" : "17.0",
-                    "clokc" : 1.5,
-                    })
-}
-
diff --git a/python/heterocl/tvm/build_module.py b/python/heterocl/tvm/build_module.py
index 47b4e31ae..c8dcc91f2 100755
--- a/python/heterocl/tvm/build_module.py
+++ b/python/heterocl/tvm/build_module.py
@@ -6,10 +6,8 @@
 from __future__ import absolute_import as _abs
 import warnings
 import types
-import os
 
 from ._ffi.node import NodeBase, register_node
-from ._ffi.function import register_func
 from ._ffi.base import _RUNTIME_ONLY
 from . import api
 from . import tensor
@@ -23,48 +21,6 @@
 from . import ndarray
 from . import target as _target
 from . import make
-from ..devices import platform
-
-# test build sim
-@register_func
-def tvm_callback_syn_postproc(code):
-    return "test" 
-
-@register_func
-def get_util_path(platform):
-    if platform == "aws_f1":
-        return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/sdaccel/" 
-    elif platform == "rocket":
-        ppac = "/work/zhang-x1/users/sx233/heterocl/hlib/rocc-ppac" 
-        emulator = os.path.join(ppac, "rocket/emulator/emulator-freechips." + \
-                                      "rocketchip.system-RoccExampleConfig-debug")
-        # build emulator if not exist
-        if not os.path.isfile(emulator):
-            cmd = "cd " + ppac + ";"
-            cmd += "cp src/Ppac.v rocket/src/main/resources/vsrc;" + \
-                   "cp src/PpacRoCC.scala rocket/src/main/scala/tile;" + \
-                   "cd rocket && git apply ../src/rocc-ppac.patch;" + \
-                   "cd emulator && make CONFIG=RoccExampleConfig debug"
-            # create subprocess to check
-            subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait()
-             
-        # re-build proxy kernel 
-        if not os.path.isfile(ppac + "/rocket/riscv-pk/build/pk"):
-            cmd = "cd " + ppac + "/rocket/riscv-pk;"
-            cmd += "git apply ../../tests/patches/riscv-pk.patch;"
-            cmd += "mkdir build; cd build;"
-            cmd += " ../configure --prefix=$RISCV/riscv64-unknown-elf --host=riscv64-unknown-elf;"
-            cmd += "make -j8; make install"
-            subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait()
-        # return util folder needed to compile generated test files
-        return "/work/zhang-x1/users/sx233/heterocl/rocc-ppac/tests" 
-
-    # copy tcl and testbench  
-    elif platform == "vivado_hls":
-        return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/vivado" 
-
-    else: # unrecognized platform
-        assert False, "unsupported platform"
 
 class DumpIR(object):
     """
@@ -384,7 +340,6 @@ def lower(sch,
         stmt = f(stmt)
     # Phase 1
     stmt = ir_pass.StorageFlatten(stmt, binds, 64)
-    stmt = ir_pass.InferStream(stmt, 32)
     #stmt = ir_pass.CanonicalSimplify(stmt) #TODO: SOLVE THIS!!
     stmt = ir_pass.LiftAllocateAttrs(stmt)
     if cfg.generate_reuse_buffer:
@@ -423,7 +378,7 @@ def lower(sch,
     else:
         return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
 
-def build_fpga_kernel(sch, args, target, name="default_function"):
+def build_fpga_kernel(sch, args, target_name, name="default_function"):
     """Build an FPGA kernel.
 
     Parameters
@@ -452,66 +407,20 @@ def build_fpga_kernel(sch, args, target, name="default_function"):
     if args is None:
         raise ValueError("args must be given for build from schedule")
 
-    # generate host (device) code / function 
-    if target == "merlinc":
+    if target_name == "merlinc":
         BuildConfig.current = build_config(generate_reuse_buffer=False)
     else:
         BuildConfig.current = build_config()
-
     flist = lower(sch, args, kernel_only=True, name=name)
     if isinstance(flist, container.LoweredFunc):
         flist = [flist]
-    fdevice = [ir_pass.LowerIntrin(x, str(target)) for x in flist]
-
-    if isinstance(target, str): # string type
-        builder = getattr(codegen, "build_{0}".format(target))
-        ret = builder(fdevice)
-        if isinstance(ret, str):
-            decl = ret[:ret.find("{device}")]
-            start = ret.find("{host}")
-            end = ret.rfind("{host}")
-            ret = decl + "\n" + ret[start+6:end]
-            ret = ret.strip("\n").lstrip("\n") + "\n\n" 
-        return ret
-
-    try: # generate and split code
-        host, xcel = None, None
-        if target.tool.name == "sdaccel":
-            host = target.host.lang.replace("opencl", "aocl")
-            xcel = target.xcel.lang.replace("hlsc", "vhls")
-        elif target.tool.name == "vivado_hls":
-            host = target.host.lang.replace("hlsc", "vhls")
-            xcel = target.xcel.lang.replace("hlsc", "vhls")
-        elif target.tool.name == "rocket":
-            host = target.host.lang.replace("c", "rv64_ppac")
-   
-        # return simulation built function
-        mode = str(target.tool.mode)
-        if "emu" in mode or "sim" in mode:
-            builder = getattr(codegen, "build_{0}".format("sim"))
-            keys = [k for k in target.tool.options.keys()]
-            vals = [v for v in target.tool.options.values()]
-            keys.insert(0, "name")
-            vals.insert(0, target.tool.name)
-            return builder(fdevice, keys, vals)
-        elif mode != "debug": # impl mode
-            pass
-        else: # return source code only
-            host_code, xcel_code = "", ""
-            if host: # src mode generate host code 
-                builder = getattr(codegen, "build_{0}".format(host))
-                host_code = builder(fdevice)
-                findex, rindex = host_code.find("{host}"), host_code.rfind("{host}")
-                host_code = host_code[findex + 6 : rindex]
-            if xcel: # src mode generate xcel code
-                builder = getattr(codegen, "build_{0}".format(xcel))
-                xcel_code = builder(fdevice)
-                findex, rindex = xcel_code.find("{device}"), xcel_code.rfind("{device}")
-                xcel_code = xcel_code[findex + 8 : rindex]
-            return xcel_code + host_code 
+    fdevice = [ir_pass.LowerIntrin(x, target_name) for x in flist]
 
+    try:
+        builder = getattr(codegen, "build_{0}".format(target_name))
+        return builder(fdevice)
     except AttributeError:
-        raise AttributeError("Cannot find the target builder %s" % target)
+        raise AttributeError("Cannot find the target builder %s" % target_name)
     return None
 
 def build(sch,
@@ -559,13 +468,11 @@ def build(sch,
     ----
     See the note on :any:`tvm.target` on target string format.
     """
-    if isinstance(target, platform):
-        return build_fpga_kernel(sch, args, target, name=name)
-    else: # default string type target
-        target = _target.current_target() if target is None else target
-        target = _target.create(target) if target else _target.create("llvm")
-        if "fpga" in target.keys:
-            return build_fpga_kernel(sch, args, target.target_name, name=name)
+    target = _target.current_target() if target is None else target
+    target = _target.create(target) if target else _target.create("llvm")
+
+    if "fpga" in target.keys:
+        return build_fpga_kernel(sch, args, target.target_name, name=name)
     BuildConfig.current = build_config()
 
     if isinstance(sch, schedule._Schedule):
diff --git a/python/heterocl/tvm/expr.py b/python/heterocl/tvm/expr.py
index d1ea4ae75..d71307e8f 100644
--- a/python/heterocl/tvm/expr.py
+++ b/python/heterocl/tvm/expr.py
@@ -382,9 +382,3 @@ class Quantize(Expr):
 @register_node
 class KernelExpr(Expr):
   pass
-
-@register_node
-class StreamExpr(Expr):
-    Channel = 0
-    Pipe = 1
-    FIFO = 2
diff --git a/python/heterocl/tvm/schedule.py b/python/heterocl/tvm/schedule.py
index 36ead39de..21905b443 100644
--- a/python/heterocl/tvm/schedule.py
+++ b/python/heterocl/tvm/schedule.py
@@ -3,7 +3,6 @@
 from ._ffi.base import string_types
 from ._ffi.node import NodeBase, register_node
 from ._ffi.function import _init_api
-from ..devices import Device
 from . import _api_internal
 from . import tensor as _tensor
 from . import expr as _expr
@@ -333,53 +332,6 @@ def reuse_at(self, target, parent, axis, name):
     def partition(self, target, partition_type, dim, factor):
         return _api_internal._SchedulePartition(self, target, dim, factor, partition_type)
 
-    def to(self, tensor, dst, src, 
-           types=_expr.StreamExpr.Channel, 
-           depth=1, name=None):
-        """ Stream data to devices or on-chip module 
-
-        Parameters
-        ----------
-        tensor : list of Tensors
-            Tensor to be streamed.
-        dst : hcl device or dst stage
-            The device or module for streaming 
-        type : channel type
-            The streaming type (e.g. fifo or pipe)
-
-        Returns
-        -------
-        outer : IterVar
-            The outer variable of iteration.
-        """ 
-        # create producer and consumer for stream
-        if isinstance(dst, Device): 
-            dst = 1 if 'fpga' in str(dst) else 0
-            return _api_internal._ScheduleMove(self, tensor, dst,
-                                               types, depth, name)
-        else: # connect kernel
-            assert isinstance(dst, _Stage), "dst not a stage "
-            if src: # remove buffer between kernels 
-                assert isinstance(src, _Stage), \
-                       "destination should be a stage but " + str(type(src)) 
-                try: 
-                    self.remove_args.append(tensor.op.output(0))
-                except:
-                    self.remove_args = []
-                    self.remove_args.append(tensor.op.output(0))
-                _api_internal._ScheduleStream(self, tensor, dst, src, 
-                                              types, depth, name)
-            else: # from externop buffer to kernel
-                shape = [_.value for _ in tensor.shape]
-                index, match = 0, []
-                for s in dst.op.body.api_args:
-                    arg_shape = [_.value for _ in s]
-                    if shape == arg_shape: match.append(index)
-                    index = index + 1
-                assert len(match) > 0, "wrong kernel or tensor (shape not matching)"
-                _api_internal._ScheduleMoveToStage(self, tensor, dst, match[0], 
-                                                   types, depth, name)
-
 @register_node("Stage")
 class _Stage(NodeBase):
     """A Stage represents schedule for one operation.
@@ -702,7 +654,7 @@ def pragma(self, var, pragma_type):
         - **parallel_stride_pattern**
 
           Hint parallel loop to execute in strided pattern.
-          :code:`for (int i = task_id; i < end; i += num_task)`          
+          :code:`for (int i = task_id; i < end; i += num_task)`
 
         """
         _api_internal._StagePragma(self, var, pragma_type)
diff --git a/python/heterocl/tvm/stmt.py b/python/heterocl/tvm/stmt.py
index d5c2d0a18..4db84970f 100644
--- a/python/heterocl/tvm/stmt.py
+++ b/python/heterocl/tvm/stmt.py
@@ -112,7 +112,3 @@ class Partition(Stmt):
 @register_node
 class Stencil(Stmt):
     pass
-
-@register_node
-class StreamStmt(Stmt):
-    pass
diff --git a/python/heterocl/tvm/target.py b/python/heterocl/tvm/target.py
index 5687953ca..12235d95d 100644
--- a/python/heterocl/tvm/target.py
+++ b/python/heterocl/tvm/target.py
@@ -1,3 +1,43 @@
+"""Target management API of TVM.
+
+TVM's target string is in format ``<target_name> [-option=value]...``.
+
+Note
+----
+The list of options include:
+
+- **-device=<device name>**
+
+   The device name.
+
+- **-mtriple=<target triple>** or **-target**
+
+   Specify the target triple, which is useful for cross
+   compilation.
+
+- **-mcpu=<cpuname>**
+
+   Specify a specific chip in the current architecture to
+   generate code for. By default this is inferred from the
+   target triple and autodetected to the current architecture.
+
+- **-mattr=a1,+a2,-a3,...**
+
+   Override or control specific attributes of the target,
+   such as whether SIMD operations are enabled or not. The
+   default set of attributes is set by the current CPU.
+
+- **-system-lib**
+
+   Build TVM system library module. System lib is a global module that contains
+   self registered functions in program startup. User can get the module using
+   :any:`tvm.module.system_lib`.
+   It is useful in environments where dynamic loading api like dlopen is banned.
+   The system lib will be available as long as the result code is linked by the program.
+
+We can use :any:`tvm.target.create` to create a tvm.target.Target from the target string.
+We can also use other specific functions in this module to create specific targets.
+"""
 from __future__ import absolute_import
 
 import warnings
@@ -10,8 +50,7 @@
     if _LIB_NAME != "libhcl_runtime.so":
         raise err_msg
 
-FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim', 
-                'opencl', 'sdaccel', 'sdaccel_csim', 'aocl', 'aocl_csim', 'rv64_ppac']
+FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim']
 
 def _merge_opts(opts, new_opts):
     """Helper function to merge options"""
@@ -29,7 +68,7 @@ class Target(object):
 
     Parameters
     ----------
-    target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev", "rv64_ppac"}
+    target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"}
         The major target name.
 
                   {"merlinc", "soda", "soda_xhls", "vhls"}
diff --git a/python/heterocl/util.py b/python/heterocl/util.py
index 704b774cb..996201105 100644
--- a/python/heterocl/util.py
+++ b/python/heterocl/util.py
@@ -4,7 +4,6 @@
 from .tvm.expr import Var, Call
 from .tvm.api import _IterVar, decl_buffer
 from . import types
-from . import devices
 from . import config
 from .scheme import Scheme
 from .debug import DTypeError
diff --git a/samples/conv/conv.py b/samples/conv/conv.py
deleted file mode 100644
index ca41a50a1..000000000
--- a/samples/conv/conv.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import heterocl as hcl
-import hlib
-import numpy as np
-from PIL import Image
-from urllib.request import urlopen
-
-batch_size = 1
-hcl.init(hcl.UInt(32))
-dtype = hcl.UInt(32)
-image_size = ()
-kernel_size = 3
-
-# setup target using vivado 
-tool = hcl.tool.vivado("csim")
-target = hcl.platform.zc706
-
-def conv():
-    image = hcl.placeholder((batch_size, 1, 256, 256), "input_image")
-    k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1")
-    k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2")
-
-    def kernel(input_image, kernel_1, kernel_2):
-
-        # return tensor required (cannot do def_())
-        interm_shape = (1,1,254,254)
-        output_shape = (1,1,252,252)
-
-        # make compute wrapped in hcl def
-        module1 = hcl.def_([input_image.shape, kernel_1.shape, interm_shape], name="conv1")(hlib.nn.conv2d_nchw_imp)
-        module2 = hcl.def_([interm_shape, kernel_2.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp)
-        conv1 = hcl.compute(interm_shape, lambda *args: 0)  
-        conv2 = hcl.compute(output_shape, lambda *args: 0)  
-        module1(input_image, kernel_1, conv1)
-        module2(conv1, kernel_2, conv2)
-
-        # derivative module for normalization 
-        return hcl.compute(output_shape, lambda *args: conv2[args], name="derv")
-
-    s = hcl.create_schedule([image, k1, k2], kernel)
-
-    # data moved to local  
-    i0, k10, k20 = s.to([image, k1, k2], target.fpga)
-    # s.to([i0, k10], s[kernel.conv1])
-    # s.to([k20], s[kernel.conv2])
-    s.to(kernel.derv, target.cpu)
-
-    # create stream channel between modules 
-    print(type(target.fpga), hcl.lower(s))
-    return hcl.build(s, target)
-
-# Load sample data
-img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif'))
-kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
-kernel_y = np.flip(kernel_x.T.T, axis=0)
-img = np.array(img)
-
-img = img[np.newaxis, ...]
-img = img[np.newaxis, ...]
-kernel_x = kernel_x[np.newaxis, ...]
-kernel_x = kernel_x[np.newaxis, ...]
-kernel_y = kernel_y[np.newaxis, ...]
-kernel_y = kernel_y[np.newaxis, ...]
-
-hcl_input  = hcl.asarray(img, dtype)    
-kernel_x   = hcl.asarray(kernel_x, dtype)
-kernel_y   = hcl.asarray(kernel_y, dtype)
-hcl_output = hcl.asarray(np.zeros((1,1,254,254)), dtype)    
-
-f = conv()
-f(hcl_input, kernel_x, kernel_y, hcl_output)
diff --git a/samples/digitrec/digitrec_stream.py b/samples/digitrec/digitrec_stream.py
deleted file mode 100644
index 4c0da096a..000000000
--- a/samples/digitrec/digitrec_stream.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import heterocl as hcl
-import time
-import numpy as np
-import math
-from digitrec_data import read_digitrec_data
-
-N = 8 * 8
-max_bit = int(math.ceil(math.log(N, 2)))
-test_size = (180, )
-data_size = (10, 1800)
-
-dtype_image = hcl.UInt(N)
-dtype_knnmat = hcl.UInt(max_bit)
-
-setting = {
-  "version" : "2019.1",
-  "clock"   : "10"
-}
-tool = hcl.tool.vivado("csim", setting)
-target = hcl.platform.aws_f1
-
-def knn(test_images, train_images):
-
-    def popcount(num):
-        out = hcl.local(0, "out")
-        with hcl.for_(0, train_images.type.bits) as i:
-            out.v += num[i]
-        return out.v
-
-    def update_knn(dist, knn_mat, i, j):
-        max_id = hcl.local(0, "max_id")
-        with hcl.for_(0, 3) as k:
-            with hcl.if_(knn_mat[i][k] > knn_mat[i][max_id.v]):
-                max_id.v = k
-        with hcl.if_(dist[i][j] < knn_mat[i][max_id.v]):
-            knn_mat[i][max_id.v] = dist[i][j]
-
-    def sort_knn(knn_mat, i, j):
-        val = hcl.local(0, "val")
-        with hcl.if_( j == 1 ):
-            with hcl.if_( knn_mat[i][1] > knn_mat[i][2] ):
-                val.v = knn_mat[i][1] 
-                knn_mat[i][1] = knn_mat[i][2]
-                knn_mat[i][2] = val.v
-        with hcl.else_():
-            with hcl.if_( knn_mat[i][0] > knn_mat[i][1] ):
-                val.v = knn_mat[i][0] 
-                knn_mat[i][0] = knn_mat[i][1]
-                knn_mat[i][1] = val.v
-
-    def knn_vote(knn_mat, j):
-        id0 = hcl.local(0, "id0")
-        id1 = hcl.local(0, "id1")
-        id2 = hcl.local(0, "id2")
-        count = hcl.local(0, "count")
-        with hcl.for_(0, 10) as n:
-            with hcl.if_(knn_mat[n][0] < knn_mat[id0.v][0]):
-                id0.v = n
-        with hcl.for_(0, 10) as m:
-            with hcl.if_(knn_mat[m][0] < knn_mat[id1.v][0]):
-                id1.v = m
-        with hcl.for_(0, 10) as k:
-            with hcl.if_(knn_mat[k][0] < knn_mat[id2.v][0]):
-                id2.v = k
-        with hcl.if_(j == id0.v):
-            count.v += 1 
-        with hcl.elif_(j == id1.v):
-            count.v += 1 
-        with hcl.elif_(j == id2.v):
-            count.v += 1 
-        with hcl.else_():
-            count.v += 0 
-        return count.v
-
-    # support hcl.compute in hcl def 
-    @hcl.def_([(), data_size, (10,3)])
-    def knn_dist(test_image, train_images, pred_matrix):
-        pass
-
-    with hcl.for_(0, 180) as index:
-        test_image = test_images[index] 
-        diff = hcl.compute(train_images.shape,
-                           lambda x, y: train_images[x][y] ^ test_image,
-                           "diff")
-        dist = hcl.compute(diff.shape,
-                           lambda x, y: popcount(diff[x][y]),
-                           "dist")
-        knn_mat = hcl.compute((10, 3), lambda x, y: 50, "knn_mat")
-        hcl.mutate(dist.shape,
-                        lambda x, y: update_knn(dist, knn_mat, x, y),
-                        "knn_update")
-        hcl.mutate((10, 3), lambda x, y: sort_knn(knn_mat, x, y), "sort")
-        knn_new = hcl.compute(knn_mat.shape, 
-                              lambda x, y: knn_mat[x][y], "copy")
-        knn_pred = hcl.compute((10,), 
-                               lambda x: knn_vote(knn_mat, x), "vote")
-    return knn_pred
-
-test_image = hcl.placeholder(test_size, "test_image", dtype_image)
-train_images = hcl.placeholder(data_size, "train_images", dtype_image)
-
-scheme = hcl.create_scheme([test_image, train_images], knn)
-scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat], dtype_knnmat)
-
-s = hcl.create_schedule_from_scheme(scheme)
-
-diff = knn.diff
-dist = knn.dist
-vote = knn.copy
-knn_update = knn.knn_update
-
-s.to([test_images, train_images], target.xcel)
-s.to(vote, target.host)
-
-# merge loop nests
-s[diff].compute_at(s[dist], dist.axis[1])
-s[dist].compute_at(s[knn_update], knn_update.axis[1])
-
-# reorder loop to expose more parallelism
-s[knn_update].reorder(knn_update.axis[1], knn_update.axis[0])
-
-# parallel outer loop and pipeline inner loop
-s[knn_update].parallel(knn_update.axis[1])
-s[knn_update].pipeline(knn_update.axis[0])
-
-# at the end, we build the whole offloaded function.
-# print(hcl.lower(s))
-f = hcl.build(s, target)
-
-train_images, _, test_images, test_labels = read_digitrec_data()
-total = len(test_images)
-total_time = 0
-
-# read returned prediction from streaming pipe
-hcl_train_images = hcl.asarray(train_images, dtype_image)
-hcl_knn_pred = hcl.asarray(np.zeros((total, 10)), dtype_knnmat)
-
-start = time.time()
-f(test_images, hcl_train_images, hcl_knn_pred)
-total_time = total_time + (time.time() - start)
-
-knn_result = hcl_knn_pred.asnumpy()
-
-correct = 0.0
-for i in range(total):
-    if np.argmax(knn_result[i]) == test_labels[i]:
-        correct += 1
-
-print("Average kernel time (s): {:.2f}".format(total_time/total))
-print("Accuracy (%): {:.2f}".format(100*correct/1))
diff --git a/samples/digitrec/kernel.cpp b/samples/digitrec/kernel.cpp
new file mode 100644
index 000000000..21b550c8b
--- /dev/null
+++ b/samples/digitrec/kernel.cpp
@@ -0,0 +1,38 @@
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#pragma ACCEL kernel
+void default_function(unsigned long test_image, unsigned long* train_images, unsigned char* knn_mat) {
+  for (int x = 0; x < 10; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      knn_mat[(y + (x * 3))] = (unsigned char)50;
+    }
+  }
+  unsigned long knn_update;
+#pragma ACCEL parallel
+  for (int y1 = 0; y1 < 1800; ++y1) {
+#pragma ACCEL pipeline
+    for (int x1 = 0; x1 < 10; ++x1) {
+      unsigned char dist;
+      unsigned long diff;
+      diff = (train_images[(y1 + (x1 * 1800))] ^ test_image);
+      unsigned char out;
+      out = (unsigned char)0;
+      for (int i = 0; i < 49; ++i) {
+        out = ((unsigned char)(((unsigned long)out) + ((unsigned long)((diff & (1L << i)) >> i))));
+      }
+      dist = out;
+      unsigned long max_id;
+      max_id = (unsigned long)0;
+      for (int i1 = 0; i1 < 3; ++i1) {
+        if (knn_mat[(((long)max_id) + ((long)(x1 * 3)))] < knn_mat[(i1 + (x1 * 3))]) {
+          max_id = ((unsigned long)i1);
+        }
+      }
+      if (dist < knn_mat[(((long)max_id) + ((long)(x1 * 3)))]) {
+        knn_mat[(((long)max_id) + ((long)(x1 * 3)))] = dist;
+      }
+    }
+  }
+}
+
diff --git a/samples/gemm/common/common.mk b/samples/gemm/common/common.mk
deleted file mode 100644
index 3409e4aa5..000000000
--- a/samples/gemm/common/common.mk
+++ /dev/null
@@ -1,55 +0,0 @@
-SHELL = /bin/bash
-VPATH = ./
-CC = xcpp
-CLCC = xocc
-ifeq ($(XDEVICE_REPO_PATH),)
-    DEVICE_REPO_OPT = 
-else
-DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}
-endif
-HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2
-HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread
-CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}
-ifeq (${KEEP_TEMP},1)
-    CLCC_OPT += -s
-endif
-ifeq (${KERNEL_DEBUG},1)
-    CLCC_OPT += -g
-endif
-CLCC_OPT += --kernel ${KERNEL_NAME}
-OBJECTS := $(HOST_SRCS:.cpp=.o)
-.PHONY: all
-all: run
-host: ${HOST_EXE_DIR}/${HOST_EXE}
-xbin_cpu_em:
-    make SDA_FLOW=cpu_emu xbin -f sdaccel.mk
-xbin_hw_em:
-    make SDA_FLOW=hw_emu xbin -f sdaccel.mk
-xbin_hw :
-    make SDA_FLOW=hw xbin -f sdaccel.mk
-xbin: ${XCLBIN}
-run_cpu_em: 
-    make SDA_FLOW=cpu_emu run_em -f sdaccel.mk
-run_hw_em: 
-    make SDA_FLOW=hw_emu run_em -f sdaccel.mk
-run_hw : 
-    make SDA_FLOW=hw run_hw_int -f sdaccel.mk
-run_em: xconfig host xbin
-    XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-run_hw_int : host xbin_hw
-    source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-estimate : 
-    ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}
-xconfig : emconfig.json
-emconfig.json :
-    emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .
-${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}
-    ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@
-${XCLBIN}:
-    ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}
-%.o: %.cpp
-    ${CC} ${HOST_CFLAGS} -c $< -o $@
-clean:
-    ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil
-cleanall: clean
-    ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou
diff --git a/samples/gemm/gemm_aocl.cl b/samples/gemm/gemm_aocl.cl
deleted file mode 100644
index 198757823..000000000
--- a/samples/gemm/gemm_aocl.cl
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "ihc_apint.h"
-__kernel void default_function(__global int* restrict placeholder0, __global int* restrict placeholder1, __global int* restrict matrix_3) {
-  for (int x = 0; x < 10; ++x) {
-    for (int y = 0; y < 10; ++y) {
-      int sum;
-      sum = 0;
-      for (int k = 0; k < 10; ++k) {
-        sum = ((int)(((int64_t)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((int64_t)sum)));
-      }
-      matrix_3[(y + (x * 10))] = sum;
-    }
-  }
-}
-
diff --git a/samples/gemm/gemm_main.py b/samples/gemm/gemm_main.py
index 4796bf2fb..fb05a094d 100644
--- a/samples/gemm/gemm_main.py
+++ b/samples/gemm/gemm_main.py
@@ -52,6 +52,5 @@ def time_gemm(dtype, m=1024, n=1024, k=1024, target=None):
 ###############################################################################
 # Test the algorithm with different data types
 dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)]
-
-# for dtype in dtypes:
-# time_gemm(hcl.Float(), 10, 10, 10, 'sdaccel')
+for dtype in dtypes:
+    time_gemm(dtype)
diff --git a/samples/gemm/gemm_runtime.py b/samples/gemm/gemm_runtime.py
deleted file mode 100644
index 49947fa4c..000000000
--- a/samples/gemm/gemm_runtime.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Yang.Bai
-# yb269@cornell.edu
-
-import heterocl as hcl
-import numpy as np
-
-hcl.init()
-
-# matrix_size = (16, 16)
-# def add_compute(A, B):
-#     C = hcl.compute(A.shape, lambda x, y: A[x, y] + B[x, y], "C")
-#     return C
-
-# def add_compute_2(A, B):
-#     C = hcl.compute(A.shape, lambda x: A[x] + B[x], "C")
-#     return C
-
-# A = hcl.placeholder(matrix_size, "A")
-# B = hcl.placeholder(matrix_size, "B")
-
-# s = hcl.create_schedule([A, B], add_compute)
-# # f2 = hcl.build(s, target='sdaccel')
-# f2 = hcl.build(s, target='aocl')
-# print (f2)
-
-# hcl_A = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float())
-# hcl_B = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float())
-# hcl_C = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float())
-# hcl_C2 = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float())
-# f3 = hcl.build(s)
-
-# A = hcl.placeholder((10, ), "A")
-# B = hcl.placeholder((10, ), "B")
-# s = hcl.create_schedule([A, B], add_compute_2)
-# f4 = hcl.build(s, target='sdaccel')
-# print (f4)
-# print (hcl_A, hcl_B, hcl_C)
-
-matrix_1_size = (10, 10)
-matrix_2_size = (10, 10)
-matrix_3_size = (matrix_1_size[0], matrix_2_size[1])
-
-def gemm_compute(matrix_1, matrix_2):
-    m = matrix_1.shape[0];
-    k = matrix_1.shape[1];
-    n = matrix_2.shape[1];
-    r = hcl.reduce_axis(0, k, 'k')
-    temp = hcl.compute((m, n), 
-            lambda x, y: hcl.sum(matrix_1[x, r] * matrix_2[r, y], 
-            axis = r), name='matrix_3')
-    return temp
-
-matrix_1 = hcl.placeholder(matrix_1_size)
-matrix_2 = hcl.placeholder(matrix_2_size)
-
-s = hcl.create_schedule([matrix_1, matrix_2], gemm_compute)
-f = hcl.build(s, target='sdaccel_csim')
-code = hcl.build(s, target='aocl')
-with open('gemm_aocl.cl', 'w') as fin:
-    fin.write(code)
-
-code2 = hcl.build(s, target='sdaccel')
-with open('gemm_sdaccel.cl', 'w') as fin2:
-    fin2.write(code2)
-
-
-matrix_1_np = np.random.randint(10, size=matrix_1_size)
-matrix_2_np = np.random.randint(10, size=matrix_2_size)
-matrix_3_np = np.random.randint(10, size=matrix_3_size)
-
-hcl_matrix_1 = hcl.asarray(matrix_1_np)
-hcl_matrix_2 = hcl.asarray(matrix_2_np)
-hcl_matrix_3 = hcl.asarray(matrix_3_np)
-
-# f(hcl_matrix_1, hcl_matrix_2, hcl_matrix_3)
-
-
-
-
-
-# with open('sdaccel.cl', 'w') as f:
-#     f.write(code)
-
-
-
-
diff --git a/samples/gemm/gemm_sdaccel.cl b/samples/gemm/gemm_sdaccel.cl
deleted file mode 100644
index f46a88426..000000000
--- a/samples/gemm/gemm_sdaccel.cl
+++ /dev/null
@@ -1,13 +0,0 @@
-__kernel void default_function(__global int* placeholder0, __global int* placeholder1, __global int* matrix_3) {
-  for (int x = 0; x < 10; ++x) {
-    for (int y = 0; y < 10; ++y) {
-      __local int sum;
-      sum = 0;
-      for (int k = 0; k < 10; ++k) {
-        sum = ((int)(((long)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((long)sum)));
-      }
-      matrix_3[(y + (x * 10))] = sum;
-    }
-  }
-}
-
diff --git a/samples/gemm/gemm_sdaccel.py b/samples/gemm/gemm_sdaccel.py
deleted file mode 100644
index 85c318120..000000000
--- a/samples/gemm/gemm_sdaccel.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import heterocl as hcl
-import numpy as np
-from gemm_main import *
-
-#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)]
-#for dtype in dtypes:
-time_gemm(hcl.Int(32), 15, 15, 15, 'sdaccel_sw_emu')
-# time_gemm(hcl.Float(), 100, 100, 100, 'sdaccel_sw_emu')
diff --git a/samples/gemm/gemm_vhls.py b/samples/gemm/gemm_vhls.py
index 8edd84bdd..e27fa155e 100644
--- a/samples/gemm/gemm_vhls.py
+++ b/samples/gemm/gemm_vhls.py
@@ -2,6 +2,6 @@
 import numpy as np
 from gemm_main import *
 
-#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)]
-#for dtype in dtypes:
-time_gemm(hcl.Int(32), 10, 10, 10, 'vhls_csim')
+dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)]
+for dtype in dtypes:
+    time_gemm(dtype, 10, 10, 10, 'vhls_csim')
diff --git a/samples/gemm/host.cpp b/samples/gemm/host.cpp
deleted file mode 100644
index 914b2aa26..000000000
--- a/samples/gemm/host.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-#define CL_HPP_CL_1_2_DEFAULT_BUILD
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
-#include <CL/cl2.hpp>
-#include <fstream>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <cstring>
-#include <iostream>
-#include <iomanip>
-#include <cmath>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#pragma once
-
-
-
-
-int main(void) { 
-#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)
-  #define STR_VALUE(arg) #arg
-  #define GET_STRING(name) STR_VALUE(name)
-  #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)
-#endif
-    char* xclbinFilename = argv[1];
-
-    std::vector<int> source_0(6 * 2);
-    std::vector<int> source_1(2 * 7);
-    std::vector<int> source_2(6 * 7);
-
-    size_t vector_size_bytes_0 = sizeof(int) * 6 * 2;
-    size_t vector_size_bytes_1 = sizeof(int) * 2 * 7;
-    size_t vector_size_bytes_2 = sizeof(int) * 6 * 7;
-
-    int* arg_0 = (int*)shmat(4849666, nullptr, 0);
-    for (size_t i0 = 0; i0 < 6; i0++) {
-      for (size_t i1 = 0; i1 < 2; i1++) {
-        source_0[i1 + i0*2] = arg_0[i1 + i0*2];
-      }
-    }
-    int* arg_1 = (int*)shmat(7667712, nullptr, 0);
-    for (size_t i0 = 0; i0 < 2; i0++) {
-      for (size_t i1 = 0; i1 < 7; i1++) {
-        source_1[i1 + i0*7] = arg_1[i1 + i0*7];
-      }
-    }
-    int* arg_2 = (int*)shmat(7667713, nullptr, 0);
-    for (size_t i0 = 0; i0 < 6; i0++) {
-      for (size_t i1 = 0; i1 < 7; i1++) {
-        source_2[i1 + i0*7] = arg_2[i1 + i0*7];
-      }
-    }
-    std::vector<cl::Platform> platforms;
-    cl::Platform::get(&platforms);
-    cl::Platform platform = platforms[0];
-
-    std::vector<cl::Device> devices;
-    platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
-    cl::Device device = devices[0];
-
-    cl::Context context(device);
-    cl::CommandQueue q(context, device);
-
-    std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
-    bin_file.seekg (0, bin_file.end);
-    unsigned nb = bin_file.tellg();
-    bin_file.seekg (0, bin_file.beg);
-    char *buf = new char [nb];
-    bin_file.read(buf, nb);
-
-    cl::Program::Binaries bins;
-    bins.push_back({buf,nb});
-    devices.resize(1);
-    cl::Program program(context, devices, bins);
-
-    int err1;
-    cl::Kernel kernel(program, "default_function", &err1);
-    auto default_function = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(kernel);
-
-    cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0);
-    cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1);
-    cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2);
-
-    q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data());
-    q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data());
-    q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data());
-
-    default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2);
-    q.finish();
-
-    q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data());
-    q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data());
-    q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data());
-
-    for (size_t i0 = 0; i0 < 6; i0++) {
-      for (size_t i1 = 0; i1 < 2; i1++) {
-        arg_0[i1 + i0*2] = source_0[i1 + i0*2];
-      }
-    }
-    shmdt(arg_0);
-    for (size_t i0 = 0; i0 < 2; i0++) {
-      for (size_t i1 = 0; i1 < 7; i1++) {
-        arg_1[i1 + i0*7] = source_1[i1 + i0*7];
-      }
-    }
-    shmdt(arg_1);
-    for (size_t i0 = 0; i0 < 6; i0++) {
-      for (size_t i1 = 0; i1 < 7; i1++) {
-        arg_2[i1 + i0*7] = source_2[i1 + i0*7];
-      }
-    }
-    shmdt(arg_2);
-}
diff --git a/samples/gemm/sdaccel.mk b/samples/gemm/sdaccel.mk
deleted file mode 100644
index 9cf0dafd7..000000000
--- a/samples/gemm/sdaccel.mk
+++ /dev/null
@@ -1,33 +0,0 @@
-ifndef XILINX_SDX
-$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)
-endif
-SDA_FLOW = cpu_emu
-HOST_SRCS = host.cpp
-HOST_EXE_DIR=.
-HOST_EXE = host
-HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL
-HOST_LFLAGS = 
-KERNEL_SRCS = default_function.cl
-KERNEL_NAME = default_function
-KERNEL_DEFS =
-KERNEL_INCS =
-XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0
-XDEVICE_REPO_PATH=
-KEEP_TEMP=1
-KERNEL_DEBUG=
-XCLBIN_NAME=bin_krnl
-HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\"
-BOARD_SETUP_FILE=setup.sh
-ifeq (${SDA_FLOW},cpu_emu)
-    CLCC_OPT += -t sw_emu
-    XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin
-else ifeq (${SDA_FLOW},hw_emu)
-    CLCC_OPT += -t hw_emu
-    XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin
-else ifeq (${SDA_FLOW},hw)
-    XCLBIN = ${XCLBIN_NAME}_hw.xclbin
-CLCC_OPT += -t hw
-endif
-HOST_ARGS = ${XCLBIN}
-COMMON_DIR = ./common
-include ${COMMON_DIR}/common.mk
diff --git a/samples/kmeans/kmeans_aocl.cl b/samples/kmeans/kmeans_aocl.cl
deleted file mode 100644
index e64b116f4..000000000
--- a/samples/kmeans/kmeans_aocl.cl
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "ihc_apint.h"
-__kernel void default_function(__global int* restrict placeholder2, __global int* restrict placeholder3, __global int* restrict compute3) {
-  for (int x = 0; x < 32; ++x) {
-    compute3[x] = 0;
-  }
-  int main_loop;
-  for (int _1 = 0; _1 < 10; ++_1) {
-    #pragma ii 1
-    for (int N = 0; N < 32; ++N) {
-      int local2;
-      local2 = 100000;
-      for (int i = 0; i < 6; ++i) {
-        int local3;
-        local3 = 0;
-        for (int i1 = 0; i1 < 3; ++i1) {
-          local3 = ((int)(((int64_t)local3) + ((int64_t)(((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))]))) * ((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))])))))));
-        }
-        if (local3 < local2) {
-          local2 = local3;
-          compute3[N] = i;
-        }
-      }
-    }
-    int compute4[6];
-    for (int x1 = 0; x1 < 6; ++x1) {
-      compute4[x1] = 0;
-    }
-    int compute5[18];
-    for (int x2 = 0; x2 < 6; ++x2) {
-      for (int y = 0; y < 3; ++y) {
-        compute5[(y + (x2 * 3))] = 0;
-      }
-    }
-    int calc_sum;
-    #pragma unroll
-    for (int n = 0; n < 32; ++n) {
-      compute4[compute3[n]] = (compute4[compute3[n]] + 1);
-      for (int i2 = 0; i2 < 3; ++i2) {
-        compute5[(i2 + (compute3[n] * 3))] = ((int)(((int33_t)compute5[(i2 + (compute3[n] * 3))]) + ((int33_t)placeholder2[(i2 + (n * 3))])));
-      }
-    }
-    int update_mean;
-    #pragma unroll
-    for (int k_d_fused = 0; k_d_fused < 18; ++k_d_fused) {
-      placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 3)]);
-    }
-  }
-}
-
diff --git a/samples/kmeans/kmeans_sdaccel.py b/samples/kmeans/kmeans_sdaccel.py
deleted file mode 100644
index c204c592e..000000000
--- a/samples/kmeans/kmeans_sdaccel.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import numpy as np
-import random
-import heterocl as hcl
-from kmeans_main import top
-
-K = 16
-N = 320
-dim = 32
-
-f1 = top('sdaccel_sw_emu')
-#f2 = top()
-points_np = np.random.randint(100, size=(N, dim))
-labels_np = np.zeros(N)
-means_np = points_np[random.sample(range(N), K),:]
-
-hcl_points1 = hcl.asarray(points_np)
-hcl_means1 = hcl.asarray(means_np)
-hcl_labels1 = hcl.asarray(labels_np)
-
-hcl_points2 = hcl.asarray(points_np)
-hcl_means2 = hcl.asarray(means_np)
-hcl_labels2 = hcl.asarray(labels_np)
-
-f1(hcl_points1, hcl_means1, hcl_labels1)
-#f2(hcl_points2, hcl_means2, hcl_labels2)
-
-#assert np.array_equal(hcl_labels1.asnumpy(), hcl_labels2.asnumpy())
diff --git a/samples/kmeans/merlinc_code.cl b/samples/kmeans/merlinc_code.cl
deleted file mode 100644
index ea672313d..000000000
--- a/samples/kmeans/merlinc_code.cl
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#pragma ACCEL kernel
-void default_function(int* placeholder2, int* placeholder3, int* compute3) {
-  for (int x = 0; x < 320; ++x) {
-    compute3[x] = 0;
-  }
-  int main_loop;
-  for (int _1 = 0; _1 < 200; ++_1) {
-#pragma ACCEL pipeline
-    for (int N = 0; N < 320; ++N) {
-      int local2;
-      local2 = 100000;
-      for (int i = 0; i < 16; ++i) {
-        int local3;
-        local3 = 0;
-        for (int i1 = 0; i1 < 32; ++i1) {
-          local3 = ((int)(((long)local3) + ((long)(((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))]))) * ((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))])))))));
-        }
-        if (local3 < local2) {
-          local2 = local3;
-          compute3[N] = i;
-        }
-      }
-    }
-    int compute4[16];
-    for (int x1 = 0; x1 < 16; ++x1) {
-      compute4[x1] = 0;
-    }
-    int compute5[512];
-    for (int x2 = 0; x2 < 16; ++x2) {
-      for (int y = 0; y < 32; ++y) {
-        compute5[(y + (x2 * 32))] = 0;
-      }
-    }
-    int calc_sum;
-#pragma ACCEL parallel flatten
-    for (int n = 0; n < 320; ++n) {
-      compute4[compute3[n]] = (compute4[compute3[n]] + 1);
-      for (int i2 = 0; i2 < 32; ++i2) {
-        compute5[(i2 + (compute3[n] * 32))] = ((int)(((long)compute5[(i2 + (compute3[n] * 32))]) + ((long)placeholder2[(i2 + (n * 32))])));
-      }
-    }
-    int update_mean;
-#pragma ACCEL parallel flatten
-    for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) {
-      placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 32)]);
-    }
-  }
-}
-
diff --git a/samples/kmeans/sdaccel_code.cl b/samples/kmeans/sdaccel_code.cl
deleted file mode 100644
index 196f96257..000000000
--- a/samples/kmeans/sdaccel_code.cl
+++ /dev/null
@@ -1,48 +0,0 @@
-__kernel void default_function(__global int* placeholder4, __global int* placeholder5, __global int* compute6) {
-  for (int x = 0; x < 320; ++x) {
-    compute6[x] = 0;
-  }
-  __local int main_loop;
-  for (int _1 = 0; _1 < 200; ++_1) {
-    __attribute__((xcl_pipeline_loop(1)))
-    for (int N = 0; N < 320; ++N) {
-      __local int local4;
-      local4 = 100000;
-      for (int i = 0; i < 16; ++i) {
-        __local int local5;
-        local5 = 0;
-        for (int i1 = 0; i1 < 32; ++i1) {
-          local5 = ((int)(((long)local5) + ((long)(((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))]))) * ((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))])))))));
-        }
-        if (local5 < local4) {
-          local4 = local5;
-          compute6[N] = i;
-        }
-      }
-    }
-    __local int compute7[16];
-    for (int x1 = 0; x1 < 16; ++x1) {
-      compute7[x1] = 0;
-    }
-    __local int compute8[512];
-    for (int x2 = 0; x2 < 16; ++x2) {
-      for (int y = 0; y < 32; ++y) {
-        compute8[(y + (x2 * 32))] = 0;
-      }
-    }
-    __local int calc_sum;
-    
-    for (int n = 0; n < 320; ++n) {
-      compute7[compute6[n]] = (compute7[compute6[n]] + 1);
-      for (int i2 = 0; i2 < 32; ++i2) {
-        compute8[(i2 + (compute6[n] * 32))] = ((int)(((long)compute8[(i2 + (compute6[n] * 32))]) + ((long)placeholder4[(i2 + (n * 32))])));
-      }
-    }
-    __local int update_mean;
-    
-    for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) {
-      placeholder5[k_d_fused] = (compute8[k_d_fused] / compute7[(k_d_fused / 32)]);
-    }
-  }
-}
-
diff --git a/samples/kmeans/submit.sh b/samples/kmeans/submit.sh
deleted file mode 100644
index a4345a542..000000000
--- a/samples/kmeans/submit.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-unset DISPLAY
-aoc -board=a10gx -time time.out -time-passes -regtest_mode -v -fpc -fp-relaxed --opt-arg -nocaching -regtest_mode -report -I $INTELFPGAOCLSDKROOT/include/kernel_headers kmeans_aocl.cl
-
diff --git a/samples/kmeans/vhls_code.cl b/samples/kmeans/vhls_code.cl
deleted file mode 100644
index b651dd8bf..000000000
--- a/samples/kmeans/vhls_code.cl
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <ap_int.h>
-#include <ap_fixed.h>
-#include <math.h>
-
-void default_function(ap_int<32> placeholder6[320][32], ap_int<32> placeholder7[16][32], ap_int<32> compute9[320]) {
-  for (ap_int<32> x = 0; x < 320; ++x) {
-    compute9[x] = 0;
-  }
-  ap_int<32> main_loop;
-  for (ap_int<32> _ = 0; _ < 200; ++_) {
-    for (ap_int<32> N = 0; N < 320; ++N) {
-    #pragma HLS pipeline
-      ap_int<32> local6;
-      local6 = 100000;
-      for (ap_int<32> i = 0; i < 16; ++i) {
-        ap_int<32> local7;
-        local7 = 0;
-        for (ap_int<32> i1 = 0; i1 < 32; ++i1) {
-          local7 = ((ap_int<32>)(((ap_int<67>)local7) + ((ap_int<67>)(((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1]))) * ((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1])))))));
-        }
-        if (local7 < local6) {
-          local6 = local7;
-          compute9[N] = i;
-        }
-      }
-    }
-    ap_int<32> compute10[16];
-    for (ap_int<32> x1 = 0; x1 < 16; ++x1) {
-      compute10[x1] = 0;
-    }
-    ap_int<32> compute11[16][32];
-    for (ap_int<32> x2 = 0; x2 < 16; ++x2) {
-      for (ap_int<32> y = 0; y < 32; ++y) {
-        compute11[x2][y] = 0;
-      }
-    }
-    ap_int<32> calc_sum;
-    for (ap_int<32> n = 0; n < 320; ++n) {
-    #pragma HLS unroll
-      compute10[compute9[n]] = (compute10[compute9[n]] + 1);
-      for (ap_int<32> i2 = 0; i2 < 32; ++i2) {
-        compute11[compute9[n]][i2] = ((ap_int<32>)(((ap_int<33>)compute11[compute9[n]][i2]) + ((ap_int<33>)placeholder6[n][i2])));
-      }
-    }
-    ap_int<32> update_mean;
-    for (ap_int<32> k_d_fused = 0; k_d_fused < 512; ++k_d_fused) {
-    #pragma HLS unroll
-      placeholder7[(k_d_fused / 32)][(k_d_fused % 32)] = (compute11[(k_d_fused / 32)][(k_d_fused % 32)] / compute10[(k_d_fused / 32)]);
-    }
-  }
-}
-
diff --git a/samples/lenet/common/common.mk b/samples/lenet/common/common.mk
deleted file mode 100644
index 3409e4aa5..000000000
--- a/samples/lenet/common/common.mk
+++ /dev/null
@@ -1,55 +0,0 @@
-SHELL = /bin/bash
-VPATH = ./
-CC = xcpp
-CLCC = xocc
-ifeq ($(XDEVICE_REPO_PATH),)
-    DEVICE_REPO_OPT = 
-else
-DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}
-endif
-HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2
-HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread
-CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}
-ifeq (${KEEP_TEMP},1)
-    CLCC_OPT += -s
-endif
-ifeq (${KERNEL_DEBUG},1)
-    CLCC_OPT += -g
-endif
-CLCC_OPT += --kernel ${KERNEL_NAME}
-OBJECTS := $(HOST_SRCS:.cpp=.o)
-.PHONY: all
-all: run
-host: ${HOST_EXE_DIR}/${HOST_EXE}
-xbin_cpu_em:
-    make SDA_FLOW=cpu_emu xbin -f sdaccel.mk
-xbin_hw_em:
-    make SDA_FLOW=hw_emu xbin -f sdaccel.mk
-xbin_hw :
-    make SDA_FLOW=hw xbin -f sdaccel.mk
-xbin: ${XCLBIN}
-run_cpu_em: 
-    make SDA_FLOW=cpu_emu run_em -f sdaccel.mk
-run_hw_em: 
-    make SDA_FLOW=hw_emu run_em -f sdaccel.mk
-run_hw : 
-    make SDA_FLOW=hw run_hw_int -f sdaccel.mk
-run_em: xconfig host xbin
-    XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-run_hw_int : host xbin_hw
-    source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-estimate : 
-    ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}
-xconfig : emconfig.json
-emconfig.json :
-    emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .
-${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}
-    ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@
-${XCLBIN}:
-    ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}
-%.o: %.cpp
-    ${CC} ${HOST_CFLAGS} -c $< -o $@
-clean:
-    ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil
-cleanall: clean
-    ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou
diff --git a/samples/lenet/lenet_aocl.cl b/samples/lenet/lenet_aocl.cl
deleted file mode 100644
index 9b2a200f8..000000000
--- a/samples/lenet/lenet_aocl.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-#include "ihc_apint.h"
-__kernel void default_function(__global float* restrict input_image, __global float* restrict weight_conv1, __global float* restrict weight_conv2, __global float* restrict weight_fc1, __global float* restrict weight_fc2, __global float* restrict lenet) {
-  float conv2d;
-  for (int nn = 0; nn < 1; ++nn) {
-    for (int yy = 0; yy < -1; ++yy) {
-      for (int xx = 0; xx < -1; ++xx) {
-        float reducer0;
-        reducer0 = 0.000000e+00f;
-        for (int ra1 = 0; ra1 < 5; ++ra1) {
-          for (int ra2 = 0; ra2 < 5; ++ra2) {
-            reducer0 = ((input_image[(((xx + ra2) + ((yy + ra1) * 3)) + (nn * 9))] * weight_conv1[(ra2 + (ra1 * 5))]) + reducer0);
-          }
-        }
-        conv2d = reducer0;
-      }
-    }
-  }
-  float tanh1;
-  for (int args = 0; args < 1; ++args) {
-    for (int args1 = 0; args1 < -1; ++args1) {
-      for (int args2 = 0; args2 < -1; ++args2) {
-        tanh1 = ((float)tanh(((float)conv2d)));
-      }
-    }
-  }
-  float max_pool;
-  for (int i = 0; i < 1; ++i) {
-    for (int h = 0; h < -1; ++h) {
-      for (int w = 0; w < -1; ++w) {
-        float reducer1;
-        reducer1 = -1.000000e+00f;
-        for (int ra3 = 0; ra3 < 2; ++ra3) {
-          for (int ra4 = 0; ra4 < 2; ++ra4) {
-            reducer1 = max(tanh1, reducer1);
-          }
-        }
-        max_pool = reducer1;
-      }
-    }
-  }
-  float conv2d1[250];
-  for (int nn1 = 0; nn1 < 1; ++nn1) {
-    for (int ff = 0; ff < 10; ++ff) {
-      for (int yy1 = 0; yy1 < -5; ++yy1) {
-        for (int xx1 = 0; xx1 < -5; ++xx1) {
-          float reducer2;
-          reducer2 = 0.000000e+00f;
-          for (int ra6 = 0; ra6 < 5; ++ra6) {
-            for (int ra7 = 0; ra7 < 5; ++ra7) {
-              reducer2 = ((max_pool * weight_conv2[((ra7 + (ra6 * 5)) + (ff * 25))]) + reducer2);
-            }
-          }
-          conv2d1[(((xx1 - (yy1 * 5)) + (ff * 25)) + (nn1 * 250))] = reducer2;
-        }
-      }
-    }
-  }
-  float tanh2[250];
-  for (int args3 = 0; args3 < 1; ++args3) {
-    for (int args0 = 0; args0 < 10; ++args0) {
-      for (int args11 = 0; args11 < -5; ++args11) {
-        for (int args21 = 0; args21 < -5; ++args21) {
-          tanh2[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))] = ((float)tanh(((float)conv2d1[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))])));
-        }
-      }
-    }
-  }
-  float max_pool1[90];
-  for (int i1 = 0; i1 < 1; ++i1) {
-    for (int c = 0; c < 10; ++c) {
-      for (int h1 = 0; h1 < -3; ++h1) {
-        for (int w1 = 0; w1 < -3; ++w1) {
-          float reducer3;
-          reducer3 = -1.000000e+00f;
-          for (int ra8 = 0; ra8 < 2; ++ra8) {
-            for (int ra9 = 0; ra9 < 2; ++ra9) {
-              reducer3 = max(tanh2[(((((w1 * 2) - (((h1 * 2) + ra8) * 5)) + ra9) + (c * 25)) + (i1 * 250))], reducer3);
-            }
-          }
-          max_pool1[(((w1 - (h1 * 3)) + (c * 9)) + (i1 * 90))] = reducer3;
-        }
-      }
-    }
-  }
-  float compute0[90];
-  for (int i2 = 0; i2 < 1; ++i2) {
-    for (int j = 0; j < 90; ++j) {
-      compute0[(j + (i2 * 90))] = max_pool1[((((j % -3) - (((j / -3) % -3) * 3)) + ((((j / -3) / -3) % 10) * 9)) + (i2 * 90))];
-    }
-  }
-  float dense[25];
-  for (int i3 = 0; i3 < 1; ++i3) {
-    for (int j1 = 0; j1 < 25; ++j1) {
-      float reducer4;
-      reducer4 = 0.000000e+00f;
-      for (int ra10 = 0; ra10 < 90; ++ra10) {
-        reducer4 = ((compute0[(ra10 + (i3 * 90))] * weight_fc1[(ra10 + (j1 * 40))]) + reducer4);
-      }
-      dense[(j1 + (i3 * 25))] = reducer4;
-    }
-  }
-  float tanh3[25];
-  for (int args4 = 0; args4 < 1; ++args4) {
-    for (int args01 = 0; args01 < 25; ++args01) {
-      tanh3[(args01 + (args4 * 25))] = ((float)tanh(((float)dense[(args01 + (args4 * 25))])));
-    }
-  }
-  float dense1[10];
-  for (int i4 = 0; i4 < 1; ++i4) {
-    for (int j2 = 0; j2 < 10; ++j2) {
-      float reducer5;
-      reducer5 = 0.000000e+00f;
-      for (int ra11 = 0; ra11 < 25; ++ra11) {
-        reducer5 = ((tanh3[(ra11 + (i4 * 25))] * weight_fc2[(ra11 + (j2 * 25))]) + reducer5);
-      }
-      dense1[(j2 + (i4 * 10))] = reducer5;
-    }
-  }
-  float compute1;
-  int max1;
-  max1 = 0;
-  for (int ra12 = 0; ra12 < 10; ++ra12) {
-    max1 = ((int)max(dense1[ra12], ((float)max1)));
-  }
-  compute1 = ((float)max1);
-  float compute2;
-  int sum;
-  sum = 0;
-  for (int ra13 = 0; ra13 < 10; ++ra13) {
-    sum = ((int)(exp(((float)(dense1[ra13] - compute1))) + ((float)sum)));
-  }
-  compute2 = ((float)sum);
-  float update0;
-  for (int j3 = 0; j3 < 10; ++j3) {
-    lenet[j3] = ((float)(exp(((float)(dense1[j3] - compute1))) / ((float)compute2)));
-  }
-}
-
diff --git a/samples/lenet/lenet_main_withoutq.py b/samples/lenet/lenet_main_withoutq.py
deleted file mode 100644
index b16bdd6c3..000000000
--- a/samples/lenet/lenet_main_withoutq.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import heterocl as hcl
-import hlib
-import numpy as np
-
-hcl.init()
-
-def softmax(out, x):
-    assert len(x.shape) == 2, "only support 2-dim softmax"
-    m, n = x.shape
-    k = hcl.reduce_axis(0, n)
-    max_elem = hcl.compute((m,), lambda i: hcl.max(x[i, k], axis=k))
-    k = hcl.reduce_axis(0, n)
-    expsum = hcl.compute((m,),
-            lambda i: hcl.sum(hcl.exp(x[i, k] - max_elem[i]), axis=k))
-    return hcl.update(out,
-            lambda i, j: hcl.exp(x[i, j] - max_elem[i]) / expsum[i])
-
-def build_lenet(input_image, weight_conv1, weight_conv2,
-                weight_fc1, weight_fc2, lenet):
-    # first conv
-    conv1 = hlib.nn.conv2d_nchw(input_image, weight_conv1)
-    tanh1 = hlib.nn.tanh(conv1, "tanh1")
-    pool1 = hlib.nn.max_pool(tanh1, kernel=(2,2), stride=(2,2))
-    # second conv
-    conv2 = hlib.nn.conv2d_nchw(pool1, weight_conv2)
-    tanh2 = hlib.nn.tanh(conv2, "tanh2")
-    pool2 = hlib.nn.max_pool(tanh2, kernel=(2,2), stride=(2,2))
-    # first fc
-    flat = hlib.nn.flatten(pool2)
-    fc1 = hlib.nn.dense(flat, weight_fc1)
-    tanh3 = hlib.nn.tanh(fc1, "tanh3")
-    # second fc
-    fc2 =  hlib.nn.dense(tanh3, weight_fc2)
-    # loss
-    return softmax(lenet, fc2)
-
-
-import mxnet as mx
-# download pretrained lenet model
-mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-0010.params')
-mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-symbol.json')
-sym, arg_params, aux_params = mx.model.load_checkpoint('lenet', 10)
-# get weights
-weight_conv1_np = arg_params['convolution0_weight'].asnumpy()
-weight_conv2_np = arg_params['convolution1_weight'].asnumpy()
-weight_fc1_np = arg_params['fullyconnected0_weight'].asnumpy()
-weight_fc2_np = arg_params['fullyconnected1_weight'].asnumpy()
-
-
-# qtype1 = hcl.Fixed(16, 14)
-# qtype2 = hcl.Fixed(16, 14)
-
-# qtype1 = hcl.Fixed(16, 12)
-# qtype2 = hcl.Fixed(16, 12)
-
-
-
-correct_sum = 0
-batch_size = 1000
-mnist = mx.test_utils.get_mnist()
-
-
-def build_lenet_inf(batch_size=batch_size, target=None):
-    # set up input/output placeholders
-    input_image = hcl.placeholder((batch_size, 1, 28, 28), "input_image")
-    # weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1", qtype1)
-    # weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2", qtype1)
-    # weight_fc1 = hcl.placeholder((500, 800), "weight_fc1", qtype1)
-    # weight_fc2 = hcl.placeholder((10, 500), "weight_fc2", qtype1)
-    weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1")
-    weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2")
-    weight_fc1 = hcl.placeholder((500, 800), "weight_fc1")
-    weight_fc2 = hcl.placeholder((10, 500), "weight_fc2")
-    lenet = hcl.placeholder((batch_size, 10), "lenet")
-    # create a quantization scheme
-    # scheme = hcl.create_scheme(
-    #         [input_image, weight_conv1, weight_conv2,
-    #          weight_fc1, weight_fc2, lenet], build_lenet)
-    # # quantize the three activation layers
-    # scheme.quantize(
-    #         [build_lenet.tanh1, build_lenet.tanh2, build_lenet.tanh3], qtype2)
-    # s = hcl.create_schedule_from_scheme(scheme)
-    s = hcl.create_schedule([input_image, weight_conv1, weight_conv2, weight_fc1, weight_fc2, lenet], build_lenet)
-    return hcl.build(s, target=target)
-
-code1 = build_lenet_inf(batch_size, 'merlinc')
-# print (code1)
-with open('merlinc_code.cl', 'w') as f:
-	f.write(code1)
-
-code2 = build_lenet_inf(batch_size, 'sdaccel')
-
-with open('sdaccel_code.cl', 'w') as f:
-	f.write(code2)
-
-code3 = build_lenet_inf(batch_size, 'vhls')
-with open('vhls_code.cl', 'w') as f:
-        f.write(code3)
-
-f = build_lenet_inf(batch_size, 'sdaccel_sw_emu')
-
-# weight_conv1_hcl = hcl.asarray(weight_conv1_np, dtype=qtype1)
-# weight_conv2_hcl = hcl.asarray(weight_conv2_np, dtype=qtype1)
-# weight_fc1_hcl = hcl.asarray(weight_fc1_np, dtype=qtype1)
-# weight_fc2_hcl = hcl.asarray(weight_fc2_np, dtype=qtype1)
-
-weight_conv1_hcl = hcl.asarray(weight_conv1_np)
-weight_conv2_hcl = hcl.asarray(weight_conv2_np)
-weight_fc1_hcl = hcl.asarray(weight_fc1_np)
-weight_fc2_hcl = hcl.asarray(weight_fc2_np)
-
-
-for i in range(10000 // batch_size):
-    label = mnist['test_label'][i*batch_size:(i+1)*batch_size]
-    input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size]
-    input_image_hcl = hcl.asarray(input_image_np)
-    output_hcl = hcl.asarray(np.zeros((batch_size,10)))
-    f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl,
-            weight_fc1_hcl, weight_fc2_hcl, output_hcl)
-    print (output_hcl.asnumpy())
-    prediction = np.argmax(output_hcl.asnumpy(), axis=1)
-    correct_sum += np.sum(np.equal(prediction, label))
-
-print("Testing accuracy: {}".format(correct_sum / 10000.))
-
diff --git a/samples/lenet/lenet_sdaccel.py b/samples/lenet/lenet_sdaccel.py
deleted file mode 100644
index 917b2b625..000000000
--- a/samples/lenet/lenet_sdaccel.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import heterocl as hcl
-import numpy as np
-from lenet_main import *
-
-batch_size = 50
-
-# f = build_lenet_inf(batch_size, 'vhls_csim')
-f = build_lenet_inf(batch_size, 'sdaccel_sw_emu')
-
-mnist = mx.test_utils.get_mnist()
-correct_sum = 0
-
-for i in range(50 // batch_size):
-    label = mnist['test_label'][i*batch_size:(i+1)*batch_size]
-    input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size]
-    input_image_hcl = hcl.asarray(input_image_np)
-    output_hcl = hcl.asarray(np.zeros((batch_size,10)))
-    f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl, weight_fc1_hcl, weight_fc2_hcl, output_hcl)
-    prediction = np.argmax(output_hcl.asnumpy(), axis=1)
-    correct_sum += np.sum(np.equal(prediction, label))
-
-print(str(qtype1) + ", " + str(qtype2) + ": Accuracy over 10000 test images is: {}".format(correct_sum / 10000.))
-assert correct_sum == 9882
diff --git a/samples/lenet/merlinc_code.cl b/samples/lenet/merlinc_code.cl
deleted file mode 100644
index 1c5118707..000000000
--- a/samples/lenet/merlinc_code.cl
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#pragma ACCEL kernel
-void default_function(int* input_image, int* weight_conv1, int* weight_conv2, int* weight_fc1, int* weight_fc2, int* lenet) {
-  int conv2d[11520000];
-  for (int nn = 0; nn < 1000; ++nn) {
-    for (int ff = 0; ff < 20; ++ff) {
-      for (int yy = 0; yy < 24; ++yy) {
-        for (int xx = 0; xx < 24; ++xx) {
-          float reducer0;
-          reducer0 = 0.000000e+00f;
-          for (int ra1 = 0; ra1 < 5; ++ra1) {
-            for (int ra2 = 0; ra2 < 5; ++ra2) {
-              reducer0 = (((float)(((long)input_image[(((xx + ra2) + ((yy + ra1) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra2 + (ra1 * 5)) + (ff * 25))]))) + reducer0);
-            }
-          }
-          conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer0);
-        }
-      }
-    }
-  }
-  int tanh1[11520000];
-  for (int args = 0; args < 1000; ++args) {
-    for (int args0 = 0; args0 < 20; ++args0) {
-      for (int args1 = 0; args1 < 24; ++args1) {
-        for (int args2 = 0; args2 < 24; ++args2) {
-          tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))])));
-        }
-      }
-    }
-  }
-  int max_pool[2880000];
-  for (int i = 0; i < 1000; ++i) {
-    for (int c = 0; c < 20; ++c) {
-      for (int h = 0; h < 12; ++h) {
-        for (int w = 0; w < 12; ++w) {
-          float reducer1;
-          reducer1 = -1.000000e+00f;
-          for (int ra3 = 0; ra3 < 2; ++ra3) {
-            for (int ra4 = 0; ra4 < 2; ++ra4) {
-              reducer1 = max(((float)tanh1[(((((w * 2) + ra4) + (((h * 2) + ra3) * 24)) + (c * 576)) + (i * 11520))]), reducer1);
-            }
-          }
-          max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer1);
-        }
-      }
-    }
-  }
-  int conv2d1[3200000];
-  for (int nn1 = 0; nn1 < 1000; ++nn1) {
-    for (int ff1 = 0; ff1 < 50; ++ff1) {
-      for (int yy1 = 0; yy1 < 8; ++yy1) {
-        for (int xx1 = 0; xx1 < 8; ++xx1) {
-          float reducer2;
-          reducer2 = 0.000000e+00f;
-          for (int ra5 = 0; ra5 < 20; ++ra5) {
-            for (int ra6 = 0; ra6 < 5; ++ra6) {
-              for (int ra7 = 0; ra7 < 5; ++ra7) {
-                reducer2 = (((float)(((long)max_pool[((((xx1 + ra7) + ((yy1 + ra6) * 12)) + (ra5 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra7 + (ra6 * 5)) + (ra5 * 25)) + (ff1 * 500))]))) + reducer2);
-              }
-            }
-          }
-          conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer2);
-        }
-      }
-    }
-  }
-  int tanh2[3200000];
-  for (int args3 = 0; args3 < 1000; ++args3) {
-    for (int args01 = 0; args01 < 50; ++args01) {
-      for (int args11 = 0; args11 < 8; ++args11) {
-        for (int args21 = 0; args21 < 8; ++args21) {
-          tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))])));
-        }
-      }
-    }
-  }
-  int max_pool1[800000];
-  for (int i1 = 0; i1 < 1000; ++i1) {
-    for (int c1 = 0; c1 < 50; ++c1) {
-      for (int h1 = 0; h1 < 4; ++h1) {
-        for (int w1 = 0; w1 < 4; ++w1) {
-          float reducer3;
-          reducer3 = -1.000000e+00f;
-          for (int ra8 = 0; ra8 < 2; ++ra8) {
-            for (int ra9 = 0; ra9 < 2; ++ra9) {
-              reducer3 = max(((float)tanh2[(((((w1 * 2) + ra9) + (((h1 * 2) + ra8) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer3);
-            }
-          }
-          max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer3);
-        }
-      }
-    }
-  }
-  int compute0[800000];
-  for (int i2 = 0; i2 < 1000; ++i2) {
-    for (int j = 0; j < 800; ++j) {
-      compute0[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))];
-    }
-  }
-  int dense[500000];
-  for (int i3 = 0; i3 < 1000; ++i3) {
-    for (int j1 = 0; j1 < 500; ++j1) {
-      float reducer4;
-      reducer4 = 0.000000e+00f;
-      for (int ra10 = 0; ra10 < 800; ++ra10) {
-        reducer4 = (((float)(((long)compute0[(ra10 + (i3 * 800))]) * ((long)weight_fc1[(ra10 + (j1 * 800))]))) + reducer4);
-      }
-      dense[(j1 + (i3 * 500))] = ((int)reducer4);
-    }
-  }
-  int tanh3[500000];
-  for (int args4 = 0; args4 < 1000; ++args4) {
-    for (int args02 = 0; args02 < 500; ++args02) {
-      tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))])));
-    }
-  }
-  int dense1[10000];
-  for (int i4 = 0; i4 < 1000; ++i4) {
-    for (int j2 = 0; j2 < 10; ++j2) {
-      float reducer5;
-      reducer5 = 0.000000e+00f;
-      for (int ra11 = 0; ra11 < 500; ++ra11) {
-        reducer5 = (((float)(((long)tanh3[(ra11 + (i4 * 500))]) * ((long)weight_fc2[(ra11 + (j2 * 500))]))) + reducer5);
-      }
-      dense1[(j2 + (i4 * 10))] = ((int)reducer5);
-    }
-  }
-  int compute1[1000];
-  for (int i5 = 0; i5 < 1000; ++i5) {
-    int max;
-    max = 0;
-    for (int ra12 = 0; ra12 < 10; ++ra12) {
-      max = max(dense1[(ra12 + (i5 * 10))], max);
-    }
-    compute1[i5] = max;
-  }
-  int compute2[1000];
-  for (int i6 = 0; i6 < 1000; ++i6) {
-    int sum;
-    sum = 0;
-    for (int ra13 = 0; ra13 < 10; ++ra13) {
-      sum = ((int)(exp(((double)((long)(dense1[(ra13 + (i6 * 10))] - compute1[i6])))) + ((double)sum)));
-    }
-    compute2[i6] = sum;
-  }
-  int update0;
-  for (int i7 = 0; i7 < 1000; ++i7) {
-    for (int j3 = 0; j3 < 10; ++j3) {
-      lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute1[i7])))) / ((double)compute2[i7])));
-    }
-  }
-}
-
diff --git a/samples/lenet/sdaccel.mk b/samples/lenet/sdaccel.mk
deleted file mode 100644
index ce266d89e..000000000
--- a/samples/lenet/sdaccel.mk
+++ /dev/null
@@ -1,32 +0,0 @@
-ifndef XILINX_SDX
-$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)
-endif
-SDA_FLOW = cpu_emu
-HOST_SRCS = host.cpp
-HOST_EXE_DIR=.
-HOST_EXE = host
-HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL
-HOST_LFLAGS = 
-KERNEL_SRCS = default_function.cl
-KERNEL_NAME = default_function
-KERNEL_DEFS =
-KERNEL_INCS =
-XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0
-XDEVICE_REPO_PATH=
-KEEP_TEMP=1
-KERNEL_DEBUG=
-XCLBIN_NAME=bin_krnl
-HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\"
-BOARD_SETUP_FILE=setup.sh
-ifeq (${SDA_FLOW},cpu_emu)
-    CLCC_OPT += -t sw_emu
-    XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin
-else ifeq (${SDA_FLOW},hw_emu)
-    CLCC_OPT += -t hw_emu
-    XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin
-else ifeq (${SDA_FLOW},hw)
-    XCLBIN = ${XCLBIN_NAME}_hw.xclbin
-CLCC_OPT += -t hw
-endifHOST_ARGS = ${XCLBIN}
-COMMON_DIR = ./common
-include ${COMMON_DIR}/common.mk
diff --git a/samples/lenet/sdaccel_code.cl b/samples/lenet/sdaccel_code.cl
deleted file mode 100644
index 114880df0..000000000
--- a/samples/lenet/sdaccel_code.cl
+++ /dev/null
@@ -1,151 +0,0 @@
-__kernel void default_function(__global int* input_image, __global int* weight_conv1, __global int* weight_conv2, __global int* weight_fc1, __global int* weight_fc2, __global int* lenet) {
-  __local int conv2d[11520000];
-  for (int nn = 0; nn < 1000; ++nn) {
-    for (int ff = 0; ff < 20; ++ff) {
-      for (int yy = 0; yy < 24; ++yy) {
-        for (int xx = 0; xx < 24; ++xx) {
-          __local float reducer6;
-          reducer6 = 0.000000e+00f;
-          for (int ra15 = 0; ra15 < 5; ++ra15) {
-            for (int ra16 = 0; ra16 < 5; ++ra16) {
-              reducer6 = (((float)(((long)input_image[(((xx + ra16) + ((yy + ra15) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra16 + (ra15 * 5)) + (ff * 25))]))) + reducer6);
-            }
-          }
-          conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer6);
-        }
-      }
-    }
-  }
-  __local int tanh1[11520000];
-  for (int args = 0; args < 1000; ++args) {
-    for (int args0 = 0; args0 < 20; ++args0) {
-      for (int args1 = 0; args1 < 24; ++args1) {
-        for (int args2 = 0; args2 < 24; ++args2) {
-          tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))])));
-        }
-      }
-    }
-  }
-  __local int max_pool[2880000];
-  for (int i = 0; i < 1000; ++i) {
-    for (int c = 0; c < 20; ++c) {
-      for (int h = 0; h < 12; ++h) {
-        for (int w = 0; w < 12; ++w) {
-          __local float reducer7;
-          reducer7 = -1.000000e+00f;
-          for (int ra17 = 0; ra17 < 2; ++ra17) {
-            for (int ra18 = 0; ra18 < 2; ++ra18) {
-              reducer7 = max(((float)tanh1[(((((w * 2) + ra18) + (((h * 2) + ra17) * 24)) + (c * 576)) + (i * 11520))]), reducer7);
-            }
-          }
-          max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer7);
-        }
-      }
-    }
-  }
-  __local int conv2d1[3200000];
-  for (int nn1 = 0; nn1 < 1000; ++nn1) {
-    for (int ff1 = 0; ff1 < 50; ++ff1) {
-      for (int yy1 = 0; yy1 < 8; ++yy1) {
-        for (int xx1 = 0; xx1 < 8; ++xx1) {
-          __local float reducer8;
-          reducer8 = 0.000000e+00f;
-          for (int ra19 = 0; ra19 < 20; ++ra19) {
-            for (int ra20 = 0; ra20 < 5; ++ra20) {
-              for (int ra21 = 0; ra21 < 5; ++ra21) {
-                reducer8 = (((float)(((long)max_pool[((((xx1 + ra21) + ((yy1 + ra20) * 12)) + (ra19 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra21 + (ra20 * 5)) + (ra19 * 25)) + (ff1 * 500))]))) + reducer8);
-              }
-            }
-          }
-          conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer8);
-        }
-      }
-    }
-  }
-  __local int tanh2[3200000];
-  for (int args3 = 0; args3 < 1000; ++args3) {
-    for (int args01 = 0; args01 < 50; ++args01) {
-      for (int args11 = 0; args11 < 8; ++args11) {
-        for (int args21 = 0; args21 < 8; ++args21) {
-          tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))])));
-        }
-      }
-    }
-  }
-  __local int max_pool1[800000];
-  for (int i1 = 0; i1 < 1000; ++i1) {
-    for (int c1 = 0; c1 < 50; ++c1) {
-      for (int h1 = 0; h1 < 4; ++h1) {
-        for (int w1 = 0; w1 < 4; ++w1) {
-          __local float reducer9;
-          reducer9 = -1.000000e+00f;
-          for (int ra22 = 0; ra22 < 2; ++ra22) {
-            for (int ra23 = 0; ra23 < 2; ++ra23) {
-              reducer9 = max(((float)tanh2[(((((w1 * 2) + ra23) + (((h1 * 2) + ra22) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer9);
-            }
-          }
-          max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer9);
-        }
-      }
-    }
-  }
-  __local int compute3[800000];
-  for (int i2 = 0; i2 < 1000; ++i2) {
-    for (int j = 0; j < 800; ++j) {
-      compute3[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))];
-    }
-  }
-  __local int dense[500000];
-  for (int i3 = 0; i3 < 1000; ++i3) {
-    for (int j1 = 0; j1 < 500; ++j1) {
-      __local float reducer10;
-      reducer10 = 0.000000e+00f;
-      for (int ra24 = 0; ra24 < 800; ++ra24) {
-        reducer10 = (((float)(((long)compute3[(ra24 + (i3 * 800))]) * ((long)weight_fc1[(ra24 + (j1 * 800))]))) + reducer10);
-      }
-      dense[(j1 + (i3 * 500))] = ((int)reducer10);
-    }
-  }
-  __local int tanh3[500000];
-  for (int args4 = 0; args4 < 1000; ++args4) {
-    for (int args02 = 0; args02 < 500; ++args02) {
-      tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))])));
-    }
-  }
-  __local int dense1[10000];
-  for (int i4 = 0; i4 < 1000; ++i4) {
-    for (int j2 = 0; j2 < 10; ++j2) {
-      __local float reducer11;
-      reducer11 = 0.000000e+00f;
-      for (int ra25 = 0; ra25 < 500; ++ra25) {
-        reducer11 = (((float)(((long)tanh3[(ra25 + (i4 * 500))]) * ((long)weight_fc2[(ra25 + (j2 * 500))]))) + reducer11);
-      }
-      dense1[(j2 + (i4 * 10))] = ((int)reducer11);
-    }
-  }
-  __local int compute4[1000];
-  for (int i5 = 0; i5 < 1000; ++i5) {
-    __local int max;
-    max = 0;
-    for (int ra26 = 0; ra26 < 10; ++ra26) {
-      max = max(dense1[(ra26 + (i5 * 10))], max);
-    }
-    compute4[i5] = max;
-  }
-  __local int compute5[1000];
-  for (int i6 = 0; i6 < 1000; ++i6) {
-    __local int sum;
-    sum = 0;
-    for (int ra27 = 0; ra27 < 10; ++ra27) {
-      sum = ((int)(exp(((double)((long)(dense1[(ra27 + (i6 * 10))] - compute4[i6])))) + ((double)sum)));
-    }
-    compute5[i6] = sum;
-  }
-  __local int update1;
-  for (int i7 = 0; i7 < 1000; ++i7) {
-    for (int j3 = 0; j3 < 10; ++j3) {
-      lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute4[i7])))) / ((double)compute5[i7])));
-    }
-  }
-}
-
diff --git a/samples/lenet/vhls_code.cl b/samples/lenet/vhls_code.cl
deleted file mode 100644
index 3d85466b4..000000000
--- a/samples/lenet/vhls_code.cl
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <ap_int.h>
-#include <ap_fixed.h>
-#include <math.h>
-
-void default_function(ap_int<32> input_image[1000][1][28][28], ap_int<32> weight_conv1[20][1][5][5], ap_int<32> weight_conv2[50][20][5][5], ap_int<32> weight_fc1[500][800], ap_int<32> weight_fc2[10][500], ap_int<32> lenet[1000][10]) {
-  ap_int<32> conv2d[1000][20][24][24];
-  for (ap_int<32> nn = 0; nn < 1000; ++nn) {
-    for (ap_int<32> ff = 0; ff < 20; ++ff) {
-      for (ap_int<32> yy = 0; yy < 24; ++yy) {
-        for (ap_int<32> xx = 0; xx < 24; ++xx) {
-          float reducer12;
-          reducer12 = 0.000000e+00f;
-          for (ap_int<32> ra29 = 0; ra29 < 5; ++ra29) {
-            for (ap_int<32> ra30 = 0; ra30 < 5; ++ra30) {
-              reducer12 = (((float)(((ap_int<64>)input_image[nn][0][(yy + ra29)][(xx + ra30)]) * ((ap_int<64>)weight_conv1[ff][0][ra29][ra30]))) + reducer12);
-            }
-          }
-          conv2d[nn][ff][yy][xx] = ((ap_int<32>)reducer12);
-        }
-      }
-    }
-  }
-  ap_int<32> tanh1[1000][20][24][24];
-  for (ap_int<32> args = 0; args < 1000; ++args) {
-    for (ap_int<32> args0 = 0; args0 < 20; ++args0) {
-      for (ap_int<32> args1 = 0; args1 < 24; ++args1) {
-        for (ap_int<32> args2 = 0; args2 < 24; ++args2) {
-          tanh1[args][args0][args1][args2] = ((ap_int<32>)tanh(((double)conv2d[args][args0][args1][args2])));
-        }
-      }
-    }
-  }
-  ap_int<32> max_pool[1000][20][12][12];
-  for (ap_int<32> i = 0; i < 1000; ++i) {
-    for (ap_int<32> c = 0; c < 20; ++c) {
-      for (ap_int<32> h = 0; h < 12; ++h) {
-        for (ap_int<32> w = 0; w < 12; ++w) {
-          float reducer13;
-          reducer13 = -1.000000e+00f;
-          for (ap_int<32> ra31 = 0; ra31 < 2; ++ra31) {
-            for (ap_int<32> ra32 = 0; ra32 < 2; ++ra32) {
-              reducer13 = std::max(((float)tanh1[i][c][((h * 2) + ra31)][((w * 2) + ra32)]), reducer13);
-            }
-          }
-          max_pool[i][c][h][w] = ((ap_int<32>)reducer13);
-        }
-      }
-    }
-  }
-  ap_int<32> conv2d1[1000][50][8][8];
-  for (ap_int<32> nn1 = 0; nn1 < 1000; ++nn1) {
-    for (ap_int<32> ff1 = 0; ff1 < 50; ++ff1) {
-      for (ap_int<32> yy1 = 0; yy1 < 8; ++yy1) {
-        for (ap_int<32> xx1 = 0; xx1 < 8; ++xx1) {
-          float reducer14;
-          reducer14 = 0.000000e+00f;
-          for (ap_int<32> ra33 = 0; ra33 < 20; ++ra33) {
-            for (ap_int<32> ra34 = 0; ra34 < 5; ++ra34) {
-              for (ap_int<32> ra35 = 0; ra35 < 5; ++ra35) {
-                reducer14 = (((float)(((ap_int<64>)max_pool[nn1][ra33][(yy1 + ra34)][(xx1 + ra35)]) * ((ap_int<64>)weight_conv2[ff1][ra33][ra34][ra35]))) + reducer14);
-              }
-            }
-          }
-          conv2d1[nn1][ff1][yy1][xx1] = ((ap_int<32>)reducer14);
-        }
-      }
-    }
-  }
-  ap_int<32> tanh2[1000][50][8][8];
-  for (ap_int<32> args3 = 0; args3 < 1000; ++args3) {
-    for (ap_int<32> args01 = 0; args01 < 50; ++args01) {
-      for (ap_int<32> args11 = 0; args11 < 8; ++args11) {
-        for (ap_int<32> args21 = 0; args21 < 8; ++args21) {
-          tanh2[args3][args01][args11][args21] = ((ap_int<32>)tanh(((double)conv2d1[args3][args01][args11][args21])));
-        }
-      }
-    }
-  }
-  ap_int<32> max_pool1[1000][50][4][4];
-  for (ap_int<32> i1 = 0; i1 < 1000; ++i1) {
-    for (ap_int<32> c1 = 0; c1 < 50; ++c1) {
-      for (ap_int<32> h1 = 0; h1 < 4; ++h1) {
-        for (ap_int<32> w1 = 0; w1 < 4; ++w1) {
-          float reducer15;
-          reducer15 = -1.000000e+00f;
-          for (ap_int<32> ra36 = 0; ra36 < 2; ++ra36) {
-            for (ap_int<32> ra37 = 0; ra37 < 2; ++ra37) {
-              reducer15 = std::max(((float)tanh2[i1][c1][((h1 * 2) + ra36)][((w1 * 2) + ra37)]), reducer15);
-            }
-          }
-          max_pool1[i1][c1][h1][w1] = ((ap_int<32>)reducer15);
-        }
-      }
-    }
-  }
-  ap_int<32> compute6[1000][800];
-  for (ap_int<32> i2 = 0; i2 < 1000; ++i2) {
-    for (ap_int<32> j = 0; j < 800; ++j) {
-      compute6[i2][j] = max_pool1[i2][(j / 16)][((j / 4) % 4)][(j % 4)];
-    }
-  }
-  ap_int<32> dense[1000][500];
-  for (ap_int<32> i3 = 0; i3 < 1000; ++i3) {
-    for (ap_int<32> j1 = 0; j1 < 500; ++j1) {
-      float reducer16;
-      reducer16 = 0.000000e+00f;
-      for (ap_int<32> ra38 = 0; ra38 < 800; ++ra38) {
-        reducer16 = (((float)(((ap_int<64>)compute6[i3][ra38]) * ((ap_int<64>)weight_fc1[j1][ra38]))) + reducer16);
-      }
-      dense[i3][j1] = ((ap_int<32>)reducer16);
-    }
-  }
-  ap_int<32> tanh3[1000][500];
-  for (ap_int<32> args4 = 0; args4 < 1000; ++args4) {
-    for (ap_int<32> args02 = 0; args02 < 500; ++args02) {
-      tanh3[args4][args02] = ((ap_int<32>)tanh(((double)dense[args4][args02])));
-    }
-  }
-  ap_int<32> dense1[1000][10];
-  for (ap_int<32> i4 = 0; i4 < 1000; ++i4) {
-    for (ap_int<32> j2 = 0; j2 < 10; ++j2) {
-      float reducer17;
-      reducer17 = 0.000000e+00f;
-      for (ap_int<32> ra39 = 0; ra39 < 500; ++ra39) {
-        reducer17 = (((float)(((ap_int<64>)tanh3[i4][ra39]) * ((ap_int<64>)weight_fc2[j2][ra39]))) + reducer17);
-      }
-      dense1[i4][j2] = ((ap_int<32>)reducer17);
-    }
-  }
-  ap_int<32> compute7[1000];
-  for (ap_int<32> i5 = 0; i5 < 1000; ++i5) {
-    ap_int<32> max;
-    max = 0;
-    for (ap_int<32> ra40 = 0; ra40 < 10; ++ra40) {
-      max = std::max(dense1[i5][ra40], max);
-    }
-    compute7[i5] = max;
-  }
-  ap_int<32> compute8[1000];
-  for (ap_int<32> i6 = 0; i6 < 1000; ++i6) {
-    ap_int<32> sum;
-    sum = 0;
-    for (ap_int<32> ra41 = 0; ra41 < 10; ++ra41) {
-      sum = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i6][ra41] - compute7[i6])))) + ((double)sum)));
-    }
-    compute8[i6] = sum;
-  }
-  ap_int<32> update2;
-  for (ap_int<32> i7 = 0; i7 < 1000; ++i7) {
-    for (ap_int<32> j3 = 0; j3 < 10; ++j3) {
-      lenet[i7][j3] = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i7][j3] - compute7[i7])))) / ((double)compute8[i7])));
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/common/common.mk b/samples/smith_waterman/common/common.mk
deleted file mode 100644
index 3409e4aa5..000000000
--- a/samples/smith_waterman/common/common.mk
+++ /dev/null
@@ -1,55 +0,0 @@
-SHELL = /bin/bash
-VPATH = ./
-CC = xcpp
-CLCC = xocc
-ifeq ($(XDEVICE_REPO_PATH),)
-    DEVICE_REPO_OPT = 
-else
-DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}
-endif
-HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2
-HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread
-CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}
-ifeq (${KEEP_TEMP},1)
-    CLCC_OPT += -s
-endif
-ifeq (${KERNEL_DEBUG},1)
-    CLCC_OPT += -g
-endif
-CLCC_OPT += --kernel ${KERNEL_NAME}
-OBJECTS := $(HOST_SRCS:.cpp=.o)
-.PHONY: all
-all: run
-host: ${HOST_EXE_DIR}/${HOST_EXE}
-xbin_cpu_em:
-    make SDA_FLOW=cpu_emu xbin -f sdaccel.mk
-xbin_hw_em:
-    make SDA_FLOW=hw_emu xbin -f sdaccel.mk
-xbin_hw :
-    make SDA_FLOW=hw xbin -f sdaccel.mk
-xbin: ${XCLBIN}
-run_cpu_em: 
-    make SDA_FLOW=cpu_emu run_em -f sdaccel.mk
-run_hw_em: 
-    make SDA_FLOW=hw_emu run_em -f sdaccel.mk
-run_hw : 
-    make SDA_FLOW=hw run_hw_int -f sdaccel.mk
-run_em: xconfig host xbin
-    XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-run_hw_int : host xbin_hw
-    source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}
-estimate : 
-    ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}
-xconfig : emconfig.json
-emconfig.json :
-    emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .
-${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}
-    ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@
-${XCLBIN}:
-    ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}
-%.o: %.cpp
-    ${CC} ${HOST_CFLAGS} -c $< -o $@
-clean:
-    ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil
-cleanall: clean
-    ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou
diff --git a/samples/smith_waterman/lenet_aocl.cl b/samples/smith_waterman/lenet_aocl.cl
deleted file mode 100644
index bf8608082..000000000
--- a/samples/smith_waterman/lenet_aocl.cl
+++ /dev/null
@@ -1,143 +0,0 @@
-#include "ihc_apint.h"
-__kernel void default_function(__global uint3_t* restrict seqAs, __global uint3_t* restrict seqBs, __global uint3_t* restrict outAs, __global uint3_t* restrict outBs) {
-  int B;
-  #pragma ii 1
-  for (int t_outer = 0; t_outer < 32; ++t_outer) {
-    #pragma unroll
-    for (int t_inner = 0; t_inner < 32; ++t_inner) {
-      int maxtrix_max;
-      maxtrix_max = 0;
-      int i_max;
-      i_max = 0;
-      int j_max;
-      j_max = 0;
-      short matrix[16641];
-      for (int x = 0; x < 129; ++x) {
-        for (int y = 0; y < 129; ++y) {
-          matrix[(y + (x * 129))] = (short)0;
-        }
-      }
-      short action[16641];
-      for (int x1 = 0; x1 < 129; ++x1) {
-        for (int y1 = 0; y1 < 129; ++y1) {
-          action[(y1 + (x1 * 129))] = (short)3;
-        }
-      }
-      int mutate3;
-      for (int i = 0; i < 129; ++i) {
-        for (int j = 0; j < 129; ++j) {
-          int trace_back[4];
-          for (int x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 129)) + -130)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4))));
-            trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4);
-            trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4);
-            trace_back[3] = 0;
-            int max;
-            max = trace_back[0];
-            int act;
-            act = 0;
-            for (int i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[(j + (i * 129))] = ((short)max);
-            action[(j + (i * 129))] = ((short)act);
-            if (maxtrix_max < ((int)matrix[(j + (i * 129))])) {
-              maxtrix_max = ((int)matrix[(j + (i * 129))]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      int T;
-      int curr_i;
-      curr_i = i_max;
-      int curr_j;
-      curr_j = j_max;
-      int next_i;
-      next_i = 0;
-      int next_j;
-      next_j = 0;
-      int act1;
-      act1 = ((int)action[(curr_j + (curr_i * 129))]);
-      int next_i1;
-      next_i1 = 0;
-      int next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      int tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        int a;
-        a = 0;
-        int b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)a);
-        outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        int act2;
-        act2 = ((int)action[(curr_j + (curr_i * 129))]);
-        int next_i2;
-        next_i2 = 0;
-        int next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/main.cpp b/samples/smith_waterman/main.cpp
deleted file mode 100644
index 851a98bf7..000000000
--- a/samples/smith_waterman/main.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-#define CL_HPP_CL_1_2_DEFAULT_BUILD
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
-#include <CL/cl2.hpp>
-#include <fstream>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <cstring>
-#include <iostream>
-#include <iomanip>
-#include <cmath>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#pragma once
-
-
-
-
-int main(void) { 
-#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)
-  #define STR_VALUE(arg) #arg
-  #define GET_STRING(name) STR_VALUE(name)
-  #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)
-#endif
-    char* xclbinFilename = argv[1];
-
-    std::vector<unsigned int> source_0(1024 * 128);
-    std::vector<unsigned int> source_1(1024 * 128);
-    std::vector<unsigned int> source_2(1024 * 256);
-    std::vector<unsigned int> source_3(1024 * 256);
-
-    size_t vector_size_bytes_0 = sizeof(unsigned int) * 1024 * 128;
-    size_t vector_size_bytes_1 = sizeof(unsigned int) * 1024 * 128;
-    size_t vector_size_bytes_2 = sizeof(unsigned int) * 1024 * 256;
-    size_t vector_size_bytes_3 = sizeof(unsigned int) * 1024 * 256;
-
-    unsigned int* arg_0 = (unsigned int*)shmat(1769476, nullptr, 0);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 128; i1++) {
-        source_0[i1 + i0*128] = arg_0[i1 + i0*128];
-      }
-    }
-    unsigned int* arg_1 = (unsigned int*)shmat(3538944, nullptr, 0);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 128; i1++) {
-        source_1[i1 + i0*128] = arg_1[i1 + i0*128];
-      }
-    }
-    unsigned int* arg_2 = (unsigned int*)shmat(3538945, nullptr, 0);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 256; i1++) {
-        source_2[i1 + i0*256] = arg_2[i1 + i0*256];
-      }
-    }
-    unsigned int* arg_3 = (unsigned int*)shmat(2162690, nullptr, 0);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 256; i1++) {
-        source_3[i1 + i0*256] = arg_3[i1 + i0*256];
-      }
-    }
-    std::vector<cl::Platform> platforms;
-    cl::Platform::get(&platforms);
-    cl::Platform platform = platforms[0];
-
-    std::vector<cl::Device> devices;
-    platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
-    cl::Device device = devices[0];
-
-    cl::Context context(device);
-    cl::CommandQueue q(context, device);
-
-    std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
-    bin_file.seekg (0, bin_file.end);
-    unsigned nb = bin_file.tellg();
-    bin_file.seekg (0, bin_file.beg);
-    char *buf = new char [nb];
-    bin_file.read(buf, nb);
-
-    cl::Program::Binaries bins;
-    bins.push_back({buf,nb});
-    devices.resize(1);
-    cl::Program program(context, devices, bins);
-
-    int err1;
-    cl::Kernel kernel(program, "default_function", &err1);
-    auto default_function = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&>(kernel);
-
-    cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0);
-    cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1);
-    cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2);
-    cl::Buffer buffer_3(context, CL_MEM_READ_WRITE, vector_size_bytes_3);
-
-    q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data());
-    q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data());
-    q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data());
-    q.enqueueWriteBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data());
-
-    default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2, buffer_3);
-    q.finish();
-
-    q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data());
-    q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data());
-    q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data());
-    q.enqueueReadBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data());
-
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 128; i1++) {
-        arg_0[i1 + i0*128] = source_0[i1 + i0*128];
-      }
-    }
-    shmdt(arg_0);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 128; i1++) {
-        arg_1[i1 + i0*128] = source_1[i1 + i0*128];
-      }
-    }
-    shmdt(arg_1);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 256; i1++) {
-        arg_2[i1 + i0*256] = source_2[i1 + i0*256];
-      }
-    }
-    shmdt(arg_2);
-    for (size_t i0 = 0; i0 < 1024; i0++) {
-      for (size_t i1 = 0; i1 < 256; i1++) {
-        arg_3[i1 + i0*256] = source_3[i1 + i0*256];
-      }
-    }
-    shmdt(arg_3);
-}
diff --git a/samples/smith_waterman/merlinc_code.cl b/samples/smith_waterman/merlinc_code.cl
deleted file mode 100644
index c3a347f35..000000000
--- a/samples/smith_waterman/merlinc_code.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#pragma ACCEL kernel
-void default_function(unsigned char* seqAs, unsigned char* seqBs, unsigned char* outAs, unsigned char* outBs) {
-  int B;
-#pragma ACCEL pipeline
-  for (int t_outer = 0; t_outer < 32; ++t_outer) {
-#pragma ACCEL parallel
-    for (int t_inner = 0; t_inner < 32; ++t_inner) {
-      int maxtrix_max;
-      maxtrix_max = 0;
-      int i_max;
-      i_max = 0;
-      int j_max;
-      j_max = 0;
-      short matrix[16641];
-      for (int x = 0; x < 129; ++x) {
-        for (int y = 0; y < 129; ++y) {
-          matrix[(y + (x * 129))] = (short)0;
-        }
-      }
-      short action[16641];
-      for (int x1 = 0; x1 < 129; ++x1) {
-        for (int y1 = 0; y1 < 129; ++y1) {
-          action[(y1 + (x1 * 129))] = (short)3;
-        }
-      }
-      int mutate3;
-      for (int i = 0; i < 129; ++i) {
-        for (int j = 0; j < 129; ++j) {
-          int trace_back[4];
-          for (int x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4))));
-            trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4);
-            trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4);
-            trace_back[3] = 0;
-            int max;
-            max = trace_back[0];
-            int act;
-            act = 0;
-            for (int i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[(j + (i * 129))] = ((short)max);
-            action[(j + (i * 129))] = ((short)act);
-            if (maxtrix_max < ((int)matrix[(j + (i * 129))])) {
-              maxtrix_max = ((int)matrix[(j + (i * 129))]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      int T;
-      int curr_i;
-      curr_i = i_max;
-      int curr_j;
-      curr_j = j_max;
-      int next_i;
-      next_i = 0;
-      int next_j;
-      next_j = 0;
-      int act1;
-      act1 = ((int)action[(curr_j + (curr_i * 129))]);
-      int next_i1;
-      next_i1 = 0;
-      int next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      int tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        int a;
-        a = 0;
-        int b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a);
-        outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        int act2;
-        act2 = ((int)action[(curr_j + (curr_i * 129))]);
-        int next_i2;
-        next_i2 = 0;
-        int next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/sdaccel.mk b/samples/smith_waterman/sdaccel.mk
deleted file mode 100644
index ce266d89e..000000000
--- a/samples/smith_waterman/sdaccel.mk
+++ /dev/null
@@ -1,32 +0,0 @@
-ifndef XILINX_SDX
-$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)
-endif
-SDA_FLOW = cpu_emu
-HOST_SRCS = host.cpp
-HOST_EXE_DIR=.
-HOST_EXE = host
-HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL
-HOST_LFLAGS = 
-KERNEL_SRCS = default_function.cl
-KERNEL_NAME = default_function
-KERNEL_DEFS =
-KERNEL_INCS =
-XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0
-XDEVICE_REPO_PATH=
-KEEP_TEMP=1
-KERNEL_DEBUG=
-XCLBIN_NAME=bin_krnl
-HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\"
-BOARD_SETUP_FILE=setup.sh
-ifeq (${SDA_FLOW},cpu_emu)
-    CLCC_OPT += -t sw_emu
-    XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin
-else ifeq (${SDA_FLOW},hw_emu)
-    CLCC_OPT += -t hw_emu
-    XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin
-else ifeq (${SDA_FLOW},hw)
-    XCLBIN = ${XCLBIN_NAME}_hw.xclbin
-CLCC_OPT += -t hw
-endifHOST_ARGS = ${XCLBIN}
-COMMON_DIR = ./common
-include ${COMMON_DIR}/common.mk
diff --git a/samples/smith_waterman/sdaccel_code.cl b/samples/smith_waterman/sdaccel_code.cl
deleted file mode 100644
index a0f5fdb01..000000000
--- a/samples/smith_waterman/sdaccel_code.cl
+++ /dev/null
@@ -1,142 +0,0 @@
-__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) {
-  __local int B;
-  __attribute__((xcl_pipeline_loop(1)))
-  for (int t_outer = 0; t_outer < 2; ++t_outer) {
-    
-    for (int t_inner = 0; t_inner < 32; ++t_inner) {
-      __local int maxtrix_max;
-      maxtrix_max = 0;
-      __local int i_max;
-      i_max = 0;
-      __local int j_max;
-      j_max = 0;
-      __local short matrix[841];
-      for (int x = 0; x < 29; ++x) {
-        for (int y = 0; y < 29; ++y) {
-          matrix[(y + (x * 29))] = (short)0;
-        }
-      }
-      __local short action[841];
-      for (int x1 = 0; x1 < 29; ++x1) {
-        for (int y1 = 0; y1 < 29; ++y1) {
-          action[(y1 + (x1 * 29))] = (short)3;
-        }
-      }
-      __local int mutate1;
-      for (int i = 0; i < 29; ++i) {
-        for (int j = 0; j < 29; ++j) {
-          __local int trace_back[4];
-          for (int x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((int)(((long)matrix[((j + (i * 29)) + -30)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 1 : -4))));
-            trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4);
-            trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4);
-            trace_back[3] = 0;
-            __local int max;
-            max = trace_back[0];
-            __local int act;
-            act = 0;
-            for (int i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[(j + (i * 29))] = ((short)max);
-            action[(j + (i * 29))] = ((short)act);
-            if (maxtrix_max < ((int)matrix[(j + (i * 29))])) {
-              maxtrix_max = ((int)matrix[(j + (i * 29))]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      __local int T;
-      __local int curr_i;
-      curr_i = i_max;
-      __local int curr_j;
-      curr_j = j_max;
-      __local int next_i;
-      next_i = 0;
-      __local int next_j;
-      next_j = 0;
-      __local int act1;
-      act1 = ((int)action[(curr_j + (curr_i * 29))]);
-      __local int next_i1;
-      next_i1 = 0;
-      __local int next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      __local int tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        __local int a;
-        a = 0;
-        __local int b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]);
-        }
-        outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)a);
-        outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        __local int act2;
-        act2 = ((int)action[(curr_j + (curr_i * 29))]);
-        __local int next_i2;
-        next_i2 = 0;
-        __local int next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/sdaccel_code_nounroll.cl b/samples/smith_waterman/sdaccel_code_nounroll.cl
deleted file mode 100644
index d5e145c05..000000000
--- a/samples/smith_waterman/sdaccel_code_nounroll.cl
+++ /dev/null
@@ -1,142 +0,0 @@
-__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) {
-  __local int B;
-  __attribute__((xcl_pipeline_loop(1)))
-  for (int t_outer = 0; t_outer < 32; ++t_outer) {
-    __attribute__((opencl_unroll_hint(2)))
-    for (int t_inner = 0; t_inner < 32; ++t_inner) {
-      __local int maxtrix_max;
-      maxtrix_max = 0;
-      __local int i_max;
-      i_max = 0;
-      __local int j_max;
-      j_max = 0;
-      __local short matrix[16641];
-      for (int x = 0; x < 129; ++x) {
-        for (int y = 0; y < 129; ++y) {
-          matrix[(y + (x * 129))] = (short)0;
-        }
-      }
-      __local short action[16641];
-      for (int x1 = 0; x1 < 129; ++x1) {
-        for (int y1 = 0; y1 < 129; ++y1) {
-          action[(y1 + (x1 * 129))] = (short)3;
-        }
-      }
-      __local int mutate1;
-      for (int i = 0; i < 129; ++i) {
-        for (int j = 0; j < 129; ++j) {
-          __local int trace_back[4];
-          for (int x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4))));
-            trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4);
-            trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4);
-            trace_back[3] = 0;
-            __local int max;
-            max = trace_back[0];
-            __local int act;
-            act = 0;
-            for (int i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[(j + (i * 129))] = ((short)max);
-            action[(j + (i * 129))] = ((short)act);
-            if (maxtrix_max < ((int)matrix[(j + (i * 129))])) {
-              maxtrix_max = ((int)matrix[(j + (i * 129))]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      __local int T;
-      __local int curr_i;
-      curr_i = i_max;
-      __local int curr_j;
-      curr_j = j_max;
-      __local int next_i;
-      next_i = 0;
-      __local int next_j;
-      next_j = 0;
-      __local int act1;
-      act1 = ((int)action[(curr_j + (curr_i * 129))]);
-      __local int next_i1;
-      next_i1 = 0;
-      __local int next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      __local int tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        __local int a;
-        a = 0;
-        __local int b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]);
-        }
-        outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a);
-        outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        __local int act2;
-        act2 = ((int)action[(curr_j + (curr_i * 129))]);
-        __local int next_i2;
-        next_i2 = 0;
-        __local int next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/smith_aocl.cl b/samples/smith_waterman/smith_aocl.cl
deleted file mode 100644
index 80a4ba601..000000000
--- a/samples/smith_waterman/smith_aocl.cl
+++ /dev/null
@@ -1,143 +0,0 @@
-#include "ihc_apint.h"
-__kernel void default_function(__global uint* restrict seqAs, __global uint* restrict seqBs, __global uint* restrict outAs, __global uint* restrict outBs) {
-  int B;
-  #pragma ii 1
-  for (int t_outer = 0; t_outer < 2; ++t_outer) {
-    #pragma unroll
-    for (int t_inner = 0; t_inner < 32; ++t_inner) {
-      int maxtrix_max;
-      maxtrix_max = 0;
-      int i_max;
-      i_max = 0;
-      int j_max;
-      j_max = 0;
-      short matrix[841];
-      for (int x = 0; x < 29; ++x) {
-        for (int y = 0; y < 29; ++y) {
-          matrix[(y + (x * 29))] = (short)0;
-        }
-      }
-      short action[841];
-      for (int x1 = 0; x1 < 29; ++x1) {
-        for (int y1 = 0; y1 < 29; ++y1) {
-          action[(y1 + (x1 * 29))] = (short)3;
-        }
-      }
-      int mutate3;
-      for (int i = 0; i < 29; ++i) {
-        for (int j = 0; j < 29; ++j) {
-          int trace_back[4];
-          for (int x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 29)) + -30)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 1 : -4))));
-            trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4);
-            trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4);
-            trace_back[3] = 0;
-            int max;
-            max = trace_back[0];
-            int act;
-            act = 0;
-            for (int i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[(j + (i * 29))] = ((short)max);
-            action[(j + (i * 29))] = ((short)act);
-            if (maxtrix_max < ((int)matrix[(j + (i * 29))])) {
-              maxtrix_max = ((int)matrix[(j + (i * 29))]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      int T;
-      int curr_i;
-      curr_i = i_max;
-      int curr_j;
-      curr_j = j_max;
-      int next_i;
-      next_i = 0;
-      int next_j;
-      next_j = 0;
-      int act1;
-      act1 = ((int)action[(curr_j + (curr_i * 29))]);
-      int next_i1;
-      next_i1 = 0;
-      int next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      int tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        int a;
-        a = 0;
-        int b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]);
-        }
-        outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)a);
-        outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        int act2;
-        act2 = ((int)action[(curr_j + (curr_i * 29))]);
-        int next_i2;
-        next_i2 = 0;
-        int next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/smith_vhls.cl b/samples/smith_waterman/smith_vhls.cl
deleted file mode 100644
index 4fd36c8aa..000000000
--- a/samples/smith_waterman/smith_vhls.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <ap_int.h>
-#include <ap_fixed.h>
-#include <math.h>
-
-void default_function(ap_uint<3> seqAs[64][28], ap_uint<3> seqBs[64][28], ap_uint<3> outAs[64][56], ap_uint<3> outBs[64][56]) {
-  ap_int<32> B;
-  for (ap_int<32> t_outer = 0; t_outer < 2; ++t_outer) {
-  #pragma HLS pipeline
-    for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) {
-    #pragma HLS unroll
-      ap_int<32> maxtrix_max;
-      maxtrix_max = 0;
-      ap_int<32> i_max;
-      i_max = 0;
-      ap_int<32> j_max;
-      j_max = 0;
-      ap_int<16> matrix[29][29];
-      for (ap_int<32> x = 0; x < 29; ++x) {
-        for (ap_int<32> y = 0; y < 29; ++y) {
-          matrix[x][y] = (ap_int<16>)0;
-        }
-      }
-      ap_int<16> action[29][29];
-      for (ap_int<32> x1 = 0; x1 < 29; ++x1) {
-        for (ap_int<32> y1 = 0; y1 < 29; ++y1) {
-          action[x1][y1] = (ap_int<16>)3;
-        }
-      }
-      ap_int<32> mutate5;
-      for (ap_int<32> i = 0; i < 29; ++i) {
-        for (ap_int<32> j = 0; j < 29; ++j) {
-          ap_int<32> trace_back[4];
-          for (ap_int<32> x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4))));
-            trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4);
-            trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4);
-            trace_back[3] = 0;
-            ap_int<32> max;
-            max = trace_back[0];
-            ap_int<32> act;
-            act = 0;
-            for (ap_int<32> i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[i][j] = ((ap_int<16>)max);
-            action[i][j] = ((ap_int<16>)act);
-            if (maxtrix_max < ((ap_int<32>)matrix[i][j])) {
-              maxtrix_max = ((ap_int<32>)matrix[i][j]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      ap_int<32> T;
-      ap_int<32> curr_i;
-      curr_i = i_max;
-      ap_int<32> curr_j;
-      curr_j = j_max;
-      ap_int<32> next_i;
-      next_i = 0;
-      ap_int<32> next_j;
-      next_j = 0;
-      ap_int<32> act1;
-      act1 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]);
-      ap_int<32> next_i1;
-      next_i1 = 0;
-      ap_int<32> next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      ap_int<32> tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        ap_int<32> a;
-        a = 0;
-        ap_int<32> b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_i + -1) % 28)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_j + -1) % 28)]);
-        }
-        outAs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)a);
-        outBs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        ap_int<32> act2;
-        act2 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]);
-        ap_int<32> next_i2;
-        next_i2 = 0;
-        ap_int<32> next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/smith_waterman/smith_waterman_sdaccel.py b/samples/smith_waterman/smith_waterman_sdaccel.py
deleted file mode 100644
index 354cac757..000000000
--- a/samples/smith_waterman/smith_waterman_sdaccel.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import heterocl as hcl
-import numpy as np
-from smith_waterman_main import *
-
-# f = top("vhls_csim")
-f = top("sdaccel_sw_emu")
-
-# add a very simple test
-_seqA_np = np.ones((num, lenA))
-for i in range(0, 4):
-    _seqA_np[0][i] = 2
-_seqB_np = np.ones((num, lenB))
-_seqA = hcl.asarray(_seqA_np, dtype)
-_seqB = hcl.asarray(_seqB_np, dtype)
-_consensusA = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype)
-_consensusB = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype)
-f(_seqA, _seqB, _consensusA, _consensusB)
-_consensusA_np = _consensusA.asnumpy()
-_consensusB_np = _consensusB.asnumpy()
-for i in range(0, 256):
-    if i < 124:
-        assert _consensusA_np[0][i] == 1
-    else:
-        assert _consensusA_np[0][i] == 0
diff --git a/samples/smith_waterman/vhls_code.cl b/samples/smith_waterman/vhls_code.cl
deleted file mode 100644
index 8066bc2c2..000000000
--- a/samples/smith_waterman/vhls_code.cl
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <ap_int.h>
-#include <ap_fixed.h>
-#include <math.h>
-
-void default_function(ap_uint<3> seqAs[1024][128], ap_uint<3> seqBs[1024][128], ap_uint<3> outAs[1024][256], ap_uint<3> outBs[1024][256]) {
-  ap_int<32> B;
-  for (ap_int<32> t_outer = 0; t_outer < 32; ++t_outer) {
-  #pragma HLS pipeline
-    for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) {
-    #pragma HLS unroll
-      ap_int<32> maxtrix_max;
-      maxtrix_max = 0;
-      ap_int<32> i_max;
-      i_max = 0;
-      ap_int<32> j_max;
-      j_max = 0;
-      ap_int<16> matrix[129][129];
-      for (ap_int<32> x = 0; x < 129; ++x) {
-        for (ap_int<32> y = 0; y < 129; ++y) {
-          matrix[x][y] = (ap_int<16>)0;
-        }
-      }
-      ap_int<16> action[129][129];
-      for (ap_int<32> x1 = 0; x1 < 129; ++x1) {
-        for (ap_int<32> y1 = 0; y1 < 129; ++y1) {
-          action[x1][y1] = (ap_int<16>)3;
-        }
-      }
-      ap_int<32> mutate3;
-      for (ap_int<32> i = 0; i < 129; ++i) {
-        for (ap_int<32> j = 0; j < 129; ++j) {
-          ap_int<32> trace_back[4];
-          for (ap_int<32> x2 = 0; x2 < 4; ++x2) {
-            trace_back[x2] = 0;
-          }
-          if ((i != 0) && (j != 0)) {
-            trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4))));
-            trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4);
-            trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4);
-            trace_back[3] = 0;
-            ap_int<32> max;
-            max = trace_back[0];
-            ap_int<32> act;
-            act = 0;
-            for (ap_int<32> i1 = 0; i1 < 4; ++i1) {
-              if (max < trace_back[i1]) {
-                max = trace_back[i1];
-                act = i1;
-              }
-            }
-            matrix[i][j] = ((ap_int<16>)max);
-            action[i][j] = ((ap_int<16>)act);
-            if (maxtrix_max < ((ap_int<32>)matrix[i][j])) {
-              maxtrix_max = ((ap_int<32>)matrix[i][j]);
-              i_max = i;
-              j_max = j;
-            }
-          }
-        }
-      }
-      ap_int<32> T;
-      ap_int<32> curr_i;
-      curr_i = i_max;
-      ap_int<32> curr_j;
-      curr_j = j_max;
-      ap_int<32> next_i;
-      next_i = 0;
-      ap_int<32> next_j;
-      next_j = 0;
-      ap_int<32> act1;
-      act1 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]);
-      ap_int<32> next_i1;
-      next_i1 = 0;
-      ap_int<32> next_j1;
-      next_j1 = 0;
-      if (act1 == 0) {
-        next_i1 = (curr_i + -1);
-        next_j1 = (curr_j + -1);
-      } else {
-        if (act1 == 1) {
-          next_i1 = (curr_i + -1);
-          next_j1 = curr_j;
-        } else {
-          if (act1 == 2) {
-            next_i1 = curr_i;
-            next_j1 = (curr_j + -1);
-          } else {
-            next_i1 = curr_i;
-            next_j1 = curr_j;
-          }
-        }
-      }
-      next_i = next_i1;
-      next_j = next_j1;
-      ap_int<32> tick;
-      tick = 0;
-      while (((curr_i != next_i) || (curr_j != next_j))) {
-        ap_int<32> a;
-        a = 0;
-        ap_int<32> b;
-        b = 0;
-        if (next_i == curr_i) {
-          a = 0;
-        } else {
-          a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_i + -1) % 128)]);
-        }
-        if (next_j == curr_j) {
-          b = 0;
-        } else {
-          b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_j + -1) % 128)]);
-        }
-        outAs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)a);
-        outBs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)b);
-        curr_i = next_i;
-        curr_j = next_j;
-        ap_int<32> act2;
-        act2 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]);
-        ap_int<32> next_i2;
-        next_i2 = 0;
-        ap_int<32> next_j2;
-        next_j2 = 0;
-        if (act2 == 0) {
-          next_i2 = (curr_i + -1);
-          next_j2 = (curr_j + -1);
-        } else {
-          if (act2 == 1) {
-            next_i2 = (curr_i + -1);
-            next_j2 = curr_j;
-          } else {
-            if (act2 == 2) {
-              next_i2 = curr_i;
-              next_j2 = (curr_j + -1);
-            } else {
-              next_i2 = curr_i;
-              next_j2 = curr_j;
-            }
-          }
-        }
-        next_i = next_i2;
-        next_j = next_j2;
-        tick = (tick + 1);
-      }
-    }
-  }
-}
-
diff --git a/samples/sobel/sobel.py b/samples/sobel/sobel.py
deleted file mode 100644
index a4299d8ae..000000000
--- a/samples/sobel/sobel.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import heterocl as hcl
-import hlib
-import numpy as np
-from PIL import Image
-from urllib.request import urlopen
-
-batch_size = 1
-hcl.init(hcl.UInt(32))
-dtype = hcl.UInt(32)
-image_size = ()
-kernel_size = 3
-
-# setup target using vivado 
-tool = hcl.tool.vivado("csim")
-target = hcl.platform.zc706
-
-def sobel():
-    image = hcl.placeholder((batch_size, 1, 256, 256), "input_image")
-    k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1")
-    k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2")
-
-    def kernel(input_image, kernel_1, kernel_2):
-
-        def absolute(image, *args):
-            with hcl.if_(image[args] > 0):
-                hcl.return_(image[args])
-            with hcl.else_():
-                hcl.return_(-1 * image[args])
-
-        def dev(gx, gy, org):
-            assert gx.shape == gy.shape, "mismatch"
-            rx = hcl.reduce_axis(0, 255, "rx")
-            ry = hcl.reduce_axis(0, 255, "ry")
-            mat_sum = hcl.compute(gx.shape, lambda nn, ff, xx, yy:
-                          gx[nn, ff, xx, yy] + gy[nn, ff, xx, yy], name="add")
-            return hcl.compute(mat_sum.shape, lambda nn, ff, xx, yy:
-                          mat_sum[nn, ff, xx, yy] * 255.0 / hcl.max(mat_sum[nn, ff, rx, ry], axis=[rx, ry]),
-                          name = "derv")
-
-        # make the conv op a kernel on fpga. 
-        # return tensor required (cannot do def_())
-        output_shape = (1,1,254,254)
-
-        # make compute wrapped in hcl def
-        module1 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv1")(hlib.nn.conv2d_nchw_imp)
-        module2 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp)
-        conv1 = hcl.compute(output_shape, lambda *args: 0)  
-        conv2 = hcl.compute(output_shape, lambda *args: 0)  
-        module1(input_image, kernel_1, conv1)
-        module2(input_image, kernel_2, conv2)
-
-        abs1 = hcl.compute(conv1.shape, 
-                           lambda *args: absolute(conv1, *args))
-        abs2 = hcl.compute(conv2.shape, 
-                           lambda *args: absolute(conv2, *args))
-
-        # derivative module for normalization 
-        return dev(abs1, abs2, input_image)
-
-    s = hcl.create_schedule([image, k1, k2], kernel)
-
-    # data moved to local  
-    i0, k10 = s.to([image, k1], target.fpga)
-    s.to([i0, k10], s[kernel.conv1])
-    s.to(kernel.derv, target.cpu)
-
-    # create stream channel between modules 
-    print(type(target.fpga), hcl.lower(s))
-    return hcl.build(s, target)
-
-# Load sample data
-img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif'))
-kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
-kernel_y = np.flip(kernel_x.T.T, axis=0)
-img = np.array(img)
-
-img = img[np.newaxis, ...]
-img = img[np.newaxis, ...]
-kernel_x = kernel_x[np.newaxis, ...]
-kernel_x = kernel_x[np.newaxis, ...]
-kernel_y = kernel_y[np.newaxis, ...]
-kernel_y = kernel_y[np.newaxis, ...]
-
-hcl_input  = hcl.asarray(img, dtype)    
-kernel_x   = hcl.asarray(kernel_x, dtype)
-kernel_y   = hcl.asarray(kernel_y, dtype)
-hcl_output = hcl.asarray(np.zeros((1,1,254,254)), dtype)    
-
-f = sobel()
-f(hcl_input, kernel_x, kernel_y, hcl_output)
-
diff --git a/samples/stream/example.cl b/samples/stream/example.cl
deleted file mode 100644
index fa3cfbd81..000000000
--- a/samples/stream/example.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "ihc_apint.h"
-#pragma OPENCL EXTENSION cl_intel_channels : enable
-channel int ret_add_c;
-channel int ret_mul_c;
-__kernel void ret_add(__global int* restrict ret_add_a, __global int* restrict ret_add_b) {
-    for (int i = 0; i < 10; ++i) {
-      for (int i1 = 0; i1 < 20; ++i1) {
-        write_channel_intel(ret_add_c, ((int)(((int33_t)ret_add_a[(i1 + (i * 20))]) + ((int33_t)ret_add_b[(i1 + (i * 20))]))));
-      }
-    }
-}
-
-__kernel void ret_mul(__global int* restrict ret_mul_d, __global int* restrict ret_mul_e) {
-    for (int i = 0; i < 10; ++i) {
-      for (int i1 = 0; i1 < 20; ++i1) {
-        ret_mul_e[(i1 + (i * 20))] = ((int)(((long)read_channel_intel(ret_mul_c)) * ((long)ret_mul_d[(i1 + (i * 20))])));
-      }
-    }
-}
-
-__kernel void default_function(__global int* restrict a, __global int* restrict b, __global int* restrict c, __global int* restrict d, __global int* restrict e) {
-  int ret_add;
-  int ret_mul;
-  for (int x = 0; x < 10; ++x) {
-    for (int y = 0; y < 20; ++y) {
-      c[(y + (x * 20))] = 0;
-    }
-  }
-  int ret_add0;
-  ret_add(a, b);
-  int ret_mul0;
-  ret_mul(d, e);
-}
-
diff --git a/samples/stream/mod.py b/samples/stream/mod.py
deleted file mode 100644
index 8c12ad722..000000000
--- a/samples/stream/mod.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import heterocl as hcl
-
-hcl.init()
-initiation_interval = 4
-a = hcl.placeholder((10, 20))
-b = hcl.placeholder((10, 20))
-
-@hcl.def_([a.shape, b.shape, (), ()])
-def ret_add(a, b, x, y):
-    hcl.return_(a[x, y] + b[x, y])
-
-@hcl.def_([a.shape, b.shape, (), ()])
-def ret_mul(a, b, x, y):
-    hcl.return_(a[x, y] * b[x, y])
-
-c = hcl.compute(a.shape, lambda i, j: ret_add(a, b, i, j))
-d = hcl.compute(b.shape, lambda i, j: ret_mul(a, b, i, j))
-s = hcl.create_schedule([a, b, c, d])
-
-# compute customization
-s[c].pipeline(c.axis[0], initiation_interval)
-s.partition(b, dim=2, factor=2)
-
-# stream into modules / device
-# s[c].stream_to(ret_mul)
-# s[d].stream_to(hcl.FPGA)
-
-print(hcl.lower(s))
-code = hcl.build(s, target="vhls")
-print(code)
-
-
diff --git a/samples/stream/stream.py b/samples/stream/stream.py
deleted file mode 100644
index 5c2396a57..000000000
--- a/samples/stream/stream.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import heterocl as hcl
-
-hcl.init()
-target = hcl.platform.zc706
-initiation_interval = 4
-
-a = hcl.placeholder((10, 20), name="a")
-b = hcl.placeholder((10, 20), name="b")
-c = hcl.placeholder((10, 20), name="c") 
-d = hcl.placeholder((10, 20), name="d")
-e = hcl.placeholder((10, 20), name="e")
-
-def add_mul(a, b, c, d, e):
-    @hcl.def_([a.shape, b.shape, c.shape])
-    def ret_add(a, b, c):
-        with hcl.for_(0, a.shape[0]) as i:
-            with hcl.for_(0, a.shape[1]) as j:
-                c[i, j] = a[i, j] + b[i, j]
-
-    @hcl.def_([c.shape, d.shape, e.shape])
-    def ret_mul(c, d, e):
-        # hcl.update(c, lambda x, y: a[x, y] * b[x, y], 'c_mul')
-        with hcl.for_(0, c.shape[0]) as i:
-            with hcl.for_(0, c.shape[1]) as j:
-                e[i, j] = c[i, j] * d[i, j]
-
-    ret_add(a, b, c)
-    ret_mul(c, d, e)
-
-# compute customization
-s = hcl.create_schedule([a, b, c, d, e], add_mul)
-# op1 = add_mul.ret_add.c
-# op2 = add_mul.ret_mul.c
-# s[op1].pipeline(op1.axis[0], initiation_interval)
-
-# stream into modules / device
-a0, b0 = s.to([a, b], target.xcel)
-d0 = s.to(d, target.xcel)
-#s.partition(b0, dim=2, factor=2)
-s.to([a0, b0], s[add_mul.ret_add])
-s.to(d0, s[add_mul.ret_mul])
-
-# within device move producer to consumer
-s.to(c, s[add_mul.ret_mul],
-        s[add_mul.ret_add], depth=10)
-
-# return tensor for inter-device move
-# e0 = s.stream_to(e, hcl.CPU('riscv'))
-
-# print(add_mul.ret_mul._buf, c._buf)
-print(hcl.lower(s))
-code = hcl.build(s, target)
-print(code)
-# 
-# with open("example.cl", "w") as f:
-#   f.write(code)
-#   f.close()
- 
diff --git a/tests/test_codegen_aocl.py b/tests/test_codegen_aocl.py
deleted file mode 100644
index a72d364f2..000000000
--- a/tests/test_codegen_aocl.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import heterocl as hcl
-
-def test_ap_int():
-	hcl.init();
-	A = hcl.placeholder((1, 32), dtype=hcl.Int(3))
-	B = hcl.placeholder((1, 32), dtype=hcl.UInt(3))
-	C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j], dtype=hcl.Int(8))
-	s = hcl.create_schedule([A, B, C])
-	code = hcl.build(s, target='aocl')
-	print (code)
-	assert "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable" in code
-	assert "ap_int<3> intd_t" in code
-	assert "ap_uint<3> uintd_t" in code
-	assert "ap_int<8> intd_t" in code 
-
-def test_pragma():
-	hcl.init()
-	A = hcl.placeholder((10, 32), "A")
-	B = hcl.placeholder((10, 32))
-	C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j])
-
-	# unroll
-	s1 = hcl.create_schedule([A, B, C])
-	s1[C].unroll(C.axis[1], factor=4)
-	code1 = hcl.build(s1, target='aocl')
-	print (code1)
-	assert "#pragma unroll 4" in code1
-	
-	# pipeline
-	s2 = hcl.create_schedule([A, B, C])
-	s2[C].pipeline(C.axis[0], initiation_interval=2)
-	code2 = hcl.build(s2, target='aocl')
-	print (code2)
-	assert "#pragma ii 2" in code2
-
-def test_reorder():
-	hcl.init()
-	A = hcl.placeholder((10, 100), "A")
-
-	def two_stage(A):
-		B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B")
-		C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, "C")
-		return C
-
-	s = hcl.create_schedule([A], two_stage)
-	s_B = two_stage.B
-	code = hcl.build(s, target='aocl')
-	print (code)
-	s[s_B].reorder(s_B.axis[1], s_B.axis[0])
-	code2 = hcl.build(s, target='aocl')
-	print (code2)
-
-def test_split_fuse():
-	hcl.init()
-	A = hcl.placeholder((10, 100), "A")
-
-	def two_stage(A):
-		B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B")
-		C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, 'C')
-		return C
-
-	s = hcl.create_schedule([A], two_stage)
-	s_B = two_stage.B
-	x_out, x_in = s[s_B].split(s_B.axis[0], 5)
-	code = hcl.build(s, target='aocl')
-	print (code)
-	s2 = hcl.create_schedule([A], two_stage)
-	s2_B = two_stage.B
-	x_y = s[s_B].fuse(s2_B.axis[0], s2_B.axis[1])
-	code2 = hcl.build(s2, target='aocl')
-	print (code2)
-
-def test_binary_conv():
-    hcl.init()
-    A = hcl.placeholder((1, 32, 14, 14), dtype=hcl.UInt(1), name="A")
-    B = hcl.placeholder((64, 32, 3, 3), dtype=hcl.UInt(1), name="B")
-    rc = hcl.reduce_axis(0, 32)
-    ry = hcl.reduce_axis(0, 3)
-    rx = hcl.reduce_axis(0, 3)
-    C = hcl.compute((1, 64, 12, 12),
-        lambda nn, ff, yy, xx: hcl.sum(
-            A[nn, rc, yy + ry, xx + rx] * B[ff, rc, ry, rx], axis=[rc, ry, rx]),
-        dtype=hcl.UInt(8), name="C")
-    s = hcl.create_schedule([A, B, C])
-    s[C].split(C.axis[1], factor=5)
-    code = hcl.build(s, target='aocl')
-    print (code)
-    assert "for (ap_int<32> intd_t ff_outer = 0; ff_outer < 13; ++ff_outer)" in code
-    assert "for (ap_int<32> intd_t ff_inner = 0; ff_inner < 5; ++ff_inner)" in code
-    assert "if (ff_inner < (64 - (ff_outer * 5)))" in code
-
-if __name__ == '__main__':
-    test_ap_int()
-    test_pragma()
-    test_reorder()
-    test_split_fuse()
-    test_binary_conv()
-
-
diff --git a/tests/test_codegen_ihls.py b/tests/test_codegen_ihls.py
index 1b53f18ca..fc5a7e53b 100644
--- a/tests/test_codegen_ihls.py
+++ b/tests/test_codegen_ihls.py
@@ -65,4 +65,3 @@ def kernel(A):
     s = hcl.create_schedule([A], kernel)
     code = hcl.build(s, target="ihls")
     assert "A[0].slc<4>(1)" in code
-
diff --git a/tests/test_codegen_sdaccel.py b/tests/test_codegen_sdaccel.py
deleted file mode 100644
index 43d94f238..000000000
--- a/tests/test_codegen_sdaccel.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import heterocl as hcl
-
-
-
-
-
-def test_pragma():
-	hcl.init(hcl.Float())
-	A = hcl.placeholder((10, 32), "A")
-	B = hcl.placeholder((10, 32))
-	C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j])
-
-	# unroll
-	s1 = hcl.create_schedule([A, B, C])
-	s1[C].unroll(C.axis[1], factor=6)
-	code1 = hcl.build(s1, target='sdaccel')
-	print (code1)
-	assert "__attribute__((opencl_unroll_hint(6)))" in code1
-
-	# pipeline
-	s2 = hcl.create_schedule([A, B, C])
-	s2[C].pipeline(C.axis[0], initiation_interval=2)
-	code2 = hcl.build(s2, target='sdaccel')
-	print (code2)
-	assert "__attribute__((xcl_pipeline_loop(2)))" in code2
-
-	# partition
-	s3 = hcl.create_schedule([A, B, C])
-	s3.partition(A, hcl.Partition.Block, dim=2, factor=2)
-	code3 = hcl.build(s3, target='sdaccel')
-	print (code3)
-	assert "__attribute__((xcl_array_partition(block,2,2)))" in code3
-	
-
-if __name__ == "__main__":
-	test_pragma()
\ No newline at end of file
diff --git a/tests/test_codegen_soda.py b/tests/test_codegen_soda.py
index 492ee6146..56fb8df77 100644
--- a/tests/test_codegen_soda.py
+++ b/tests/test_codegen_soda.py
@@ -52,7 +52,6 @@ def test_blur(self):
   img_t(0, 0) = uint16((int32((uint18((uint17(img_i(-1, 0)) + uint17(img_i(0, 0)))) + uint18(img_i(1, 0)))) / 3))
 output uint16:
   img_o(0, 0) = uint16((int32((uint18((uint17(img_t(0, -1)) + uint17(img_t(0, 0)))) + uint18(img_t(0, 1)))) / 3))
-
 ''')
 
     def test_gaussian(self):
@@ -77,7 +76,6 @@ def test_gaussian(self):
   reduce_ssa3 = float32(((float64(img_i(-1, 0)) * 3699.65) + float64(reduce_ssa2)))
   reduce_ssa4 = float32(((float64(img_i(0, 0)) * 4620.30) + float64(reduce_ssa3)))
   img_o(0, 0) = reduce_ssa4
-
 '''
                 )
 
diff --git a/tests/test_codegen_vhls.py b/tests/test_codegen_vhls.py
index a6385975b..dadae5068 100644
--- a/tests/test_codegen_vhls.py
+++ b/tests/test_codegen_vhls.py
@@ -85,7 +85,7 @@ def test_index_split():
     s = hcl.create_schedule([A, B])
     s[B].split(B.axis[0], 5)
     code = hcl.build(s, target="vhls")
-    assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code
+    assert "B[(y_inner + (y_outer * 5))][x]" in code
 
 def test_index_split_reshape():
     hcl.init()
@@ -95,7 +95,7 @@ def test_index_split_reshape():
     s[B].split(B.axis[0], 5)
     s.reshape(B, (2, 5, 10))
     code = hcl.build(s, target="vhls")
-    assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code
+    assert "B[y_outer][y_inner][x]" in code
 
 def test_index_fuse():
     hcl.init()
@@ -104,7 +104,7 @@ def test_index_fuse():
     s = hcl.create_schedule([A, B])
     s[B].fuse(B.axis[0], B.axis[1])
     code = hcl.build(s, target="vhls")
-    assert "B[y_x_fused]" in code
+    assert "B[(y_x_fused / 10)][(y_x_fused % 10)]" in code
 
 def test_binary_conv():
     hcl.init()
diff --git a/tvm/HalideIR/src/ir/Expr.h b/tvm/HalideIR/src/ir/Expr.h
index 4b70d51fc..b78a466ed 100644
--- a/tvm/HalideIR/src/ir/Expr.h
+++ b/tvm/HalideIR/src/ir/Expr.h
@@ -91,9 +91,6 @@ enum class IRNodeType : int {
     /** for memory customization **/
     Reuse,
     Partition,
-    /** for data stream **/
-    StreamExpr,
-    StreamStmt,
     /** for stencil analysis **/
     Stencil
 };
@@ -305,20 +302,6 @@ enum class PartitionType : int {
     Cyclic = 2
 };
 
-/** An enum describing the stream type */
-enum class StreamType : int {
-    Channel = 0,
-    Pipe = 1,
-    FIFO = 2
-};
-
-/** An enum class for device type */
-enum class DeviceType : int {
-    CPU = 0,
-    FPGA = 1,
-    GPU = 2
-};
-
 /** A reference-counted handle to a statement node. */
 struct Stmt : public IRHandle {
     Stmt() : IRHandle() {}
diff --git a/tvm/HalideIR/src/ir/IR.cpp b/tvm/HalideIR/src/ir/IR.cpp
index a604b6fd2..a9718b40e 100644
--- a/tvm/HalideIR/src/ir/IR.cpp
+++ b/tvm/HalideIR/src/ir/IR.cpp
@@ -692,27 +692,17 @@ Expr Quantize::make(Expr body, Expr bitwidth) {
   return Expr(node);
 }
 
-Stmt KernelDef::make(Array<VarExpr> args, Array<Array<Expr>> api_args, 
-                     Array<Expr> api_types, Stmt body, Expr ret_void, 
-                     Type ret_type, std::string name, Array<Expr> channels) {
-  internal_assert(api_args.size() == api_types.size()) << "KernelDef of unmatched args\n";
+Stmt KernelDef::make(Array<VarExpr> args, Stmt body, Expr ret_void, Type ret_type, std::string name) {
   for (size_t i = 0; i < args.size(); i++) {
     internal_assert(args[i].defined()) << "KernelDef of undefined arg\n";
-    internal_assert(api_types[i].defined()) << "KernelDef of undefined type\n";
-    for (size_t j = 0; j < api_args[i].size(); j++) {
-      internal_assert(api_args[i][j].defined()) << "KernelDef of undefined shape\n";
-    }
   }
   internal_assert(body.defined()) << "KernelDef of undefined body\n";
   internal_assert(ret_void.defined()) << "KernelDef of undefined return type\n";  
   std::shared_ptr<KernelDef> node = std::make_shared<KernelDef>();
   node->args = std::move(args);
-  node->api_args = std::move(api_args);
-  node->api_types = std::move(api_types);
   node->body = std::move(body);
   node->ret_void = std::move(ret_void);
   node->ret_type = ret_type;
-  node->channels = std::move(channels);
   node->name = name;
   return Stmt(node);
 }
@@ -782,62 +772,6 @@ Stmt Partition::make(VarExpr buffer_var, int dim, int factor, PartitionType part
   return Stmt(node);
 }
 
-Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth) {
-  internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n";
-
-  std::shared_ptr<StreamExpr> node = std::make_shared<StreamExpr>();
-  node->type = type;
-  node->buffer_var = std::move(buffer_var);
-  node->depth = depth;
-  node->stream_type = stream_type;
-  return Expr(node);
-}
-
-Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth,
-                      Array<Expr> annotate_keys, Array<Expr> annotate_values) {
-  internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n";
-  internal_assert(annotate_keys.size() == annotate_values.size()) <<
-      "Length of annotate keys and annotate values not equal";
-
-  std::shared_ptr<StreamExpr> node = std::make_shared<StreamExpr>();
-  node->type = type;
-  node->buffer_var = std::move(buffer_var);
-  node->depth = depth;
-  node->stream_type = stream_type;
-  node->annotate_keys = std::move(annotate_keys);
-  node->annotate_values = std::move(annotate_values);
-  return Expr(node);
-}
-
-Stmt StreamStmt::make(VarExpr buffer_var, Expr value, StreamType stream_type, int depth) {
-  internal_assert(value.defined()) << "The stream-in value not defined\n";
-  internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n";
-
-  std::shared_ptr<StreamStmt> node = std::make_shared<StreamStmt>();
-  node->buffer_var = std::move(buffer_var);
-  node->value = std::move(value);
-  node->depth = depth;
-  node->stream_type = stream_type;
-  return Stmt(node);
-}
-
-Stmt StreamStmt::make(VarExpr buffer_var, Expr value, StreamType stream_type, int depth,
-                      Array<Expr> annotate_keys, Array<Expr> annotate_values) {
-  internal_assert(value.defined()) << "The stream-in value not defined\n";
-  internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n";
-  internal_assert(annotate_keys.size() == annotate_values.size()) <<
-      "Length of annotate keys and annotate values not equal";
-
-  std::shared_ptr<StreamStmt> node = std::make_shared<StreamStmt>();
-  node->buffer_var = std::move(buffer_var);
-  node->value = std::move(value);
-  node->depth = depth;
-  node->stream_type = stream_type;
-  node->annotate_keys = std::move(annotate_keys);
-  node->annotate_values = std::move(annotate_values);
-  return Stmt(node);
-}
-
 Stmt Stencil::make(Array<VarExpr> inputs, Array<VarExpr> outputs, Stmt body,
                    int burst_width, int unroll_factor, int num_iteration) {
   internal_assert(body.defined()) << "Stencil of undefined body\n";
@@ -950,8 +884,6 @@ template<> void StmtNode<While>::accept(IRVisitor *v, const Stmt &s) const { v->
 template<> void StmtNode<Reuse>::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Reuse *)this, s); }
 template<> void StmtNode<Partition>::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Partition *)this, s); }
 template<> void StmtNode<Stencil>::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Stencil *)this, s); }
-template<> void StmtNode<StreamStmt>::accept(IRVisitor *v, const Stmt &s) const { v->visit((const StreamStmt *)this, s); }
-template<> void ExprNode<StreamExpr>::accept(IRVisitor *v, const Expr &e) const { v->visit((const StreamExpr *)this, e); }
 
 Call::ConstString Call::debug_to_file = "debug_to_file";
 Call::ConstString Call::reinterpret = "reinterpret";
diff --git a/tvm/HalideIR/src/ir/IR.h b/tvm/HalideIR/src/ir/IR.h
index e8a8835bf..fae48da29 100644
--- a/tvm/HalideIR/src/ir/IR.h
+++ b/tvm/HalideIR/src/ir/IR.h
@@ -1049,29 +1049,19 @@ struct Quantize : public ExprNode<Quantize> {
 /** The imperative function definition */
 struct KernelDef : public StmtNode<KernelDef> {
   Array<VarExpr> args;
-  Array<Array<Expr>> api_args;
-  Array<Expr> api_types;
   Stmt body;
   Expr ret_void;
   Type ret_type;
   std::string name;
-  // args to stream data 
-  Array<Expr> channels;
 
-  EXPORT static Stmt make(Array<VarExpr> args, Array<Array<Expr>> api_args, 
-                          Array<Expr> api_types, Stmt body, Expr ret_void, 
-                          Type ret_type, std::string name, 
-                          Array<Expr> channels);
+  EXPORT static Stmt make(Array<VarExpr> args, Stmt body, Expr ret_void, Type ret_type, std::string name);
 
   void VisitAttrs(IR::AttrVisitor* v) final {
     v -> Visit("args", &args);
-    v -> Visit("api_args", &api_args);
-    v -> Visit("api_types", &api_types);
     v -> Visit("body", &body);
     v -> Visit("ret_void", &ret_void);
     v -> Visit("ret_type", &ret_type);
     v -> Visit("name", &name);
-    v -> Visit("channels", &channels);
   }
   static const IRNodeType _type_info = IRNodeType::KernelDef;
   static constexpr const char* _type_key = "KernelDef";
@@ -1180,70 +1170,6 @@ struct Partition : public StmtNode<Partition> {
   static constexpr const char* _type_key = "Partition";
 };
 
-struct StreamStmt : public StmtNode<StreamStmt> {
-  VarExpr buffer_var;
-  Expr value; 
-  int depth;
-  StreamType stream_type;
-  Array<Expr> annotate_keys;
-  Array<Expr> annotate_values;
-
-  EXPORT static Stmt make(VarExpr buffer_var, 
-                          Expr value,
-                          StreamType stream_type,
-                          int depth);
-
-  EXPORT static Stmt make(VarExpr buffer_var, 
-                          Expr value,
-                          StreamType stream_type,
-                          int depth,
-                          Array<Expr> annotate_keys,
-                          Array<Expr> annotate_values);
-
-  void VisitAttrs(IR::AttrVisitor* v) final {
-    v -> Visit("buffer_var", &buffer_var);
-    v -> Visit("value", &value);
-    v -> Visit("depth", &depth);
-    v -> Visit("stream_type", &stream_type);
-    v -> Visit("annotate_keys", &annotate_keys);
-    v -> Visit("annotate_values", &annotate_values);
-  }
-
-  static const IRNodeType _type_info = IRNodeType::StreamStmt;
-  static constexpr const char* _type_key = "StreamStmt";
-};
-
-struct StreamExpr : public ExprNode<StreamExpr> {
-  VarExpr buffer_var; // var loaded 
-  int depth;
-  StreamType stream_type;
-  Array<Expr> annotate_keys;
-  Array<Expr> annotate_values;
-
-  EXPORT static Expr make(Type type,
-                          VarExpr buffer_var, 
-                          StreamType stream_type,
-                          int depth);
-
-  EXPORT static Expr make(Type type,
-                          VarExpr buffer_var, 
-                          StreamType stream_type,
-                          int depth,
-                          Array<Expr> annotate_keys,
-                          Array<Expr> annotate_values);
-
-  void VisitAttrs(IR::AttrVisitor* v) final {
-    v -> Visit("dtype", &type);
-    v -> Visit("buffer_var", &buffer_var);
-    v -> Visit("depth", &depth);
-    v -> Visit("stream_type", &stream_type);
-    v -> Visit("annotate_keys", &annotate_keys);
-    v -> Visit("annotate_values", &annotate_values);
-  }
-  static const IRNodeType _type_info = IRNodeType::StreamExpr;
-  static constexpr const char* _type_key = "StreamExpr";
-};
-
 struct Stencil : public StmtNode<Stencil> {
   Array<VarExpr> inputs;
   Array<VarExpr> outputs;
diff --git a/tvm/HalideIR/src/ir/IREquality.cpp b/tvm/HalideIR/src/ir/IREquality.cpp
index 46590056e..9e5798fbb 100644
--- a/tvm/HalideIR/src/ir/IREquality.cpp
+++ b/tvm/HalideIR/src/ir/IREquality.cpp
@@ -80,7 +80,6 @@ class IRComparer : public IRVisitor {
     void visit(const Call *, const Expr &);
     void visit(const Let *, const Expr &);
     void visit(const Shuffle *, const Expr &);
-    void visit(const StreamExpr *, const Expr &);
     void visit(const LetStmt *, const Stmt &);
     void visit(const AttrStmt *, const Stmt &);
     void visit(const AssertStmt *, const Stmt &);
@@ -489,11 +488,6 @@ void IRComparer::visit(const Shuffle *op, const Expr &expr) {
     compare_expr_vector(e->indices, op->indices);
 }
 
-void IRComparer::visit(const StreamExpr *op, const Expr &expr) {
-    const StreamExpr *node = expr_.as<StreamExpr>();
-    compare_node_refs(op->buffer_var, node->buffer_var);
-}
-
 } // namespace
 
 
diff --git a/tvm/HalideIR/src/ir/IRMutator.cpp b/tvm/HalideIR/src/ir/IRMutator.cpp
index fbd3e82b5..13b346e93 100644
--- a/tvm/HalideIR/src/ir/IRMutator.cpp
+++ b/tvm/HalideIR/src/ir/IRMutator.cpp
@@ -480,8 +480,7 @@ void IRMutator::visit(const KernelDef *op, const Stmt &s) {
     stmt = s;
   }
   else {
-    stmt = KernelDef::make(op->args, op->api_args, op->api_types,
-                           body, ret_void, op->ret_type, op->name, op->channels);
+    stmt = KernelDef::make(op->args, body, ret_void, op->ret_type, op->name);
   }
 }
 
@@ -525,20 +524,6 @@ void IRMutator::visit(const KernelStmt *op, const Stmt &s) {
   }
 }
 
-void IRMutator::visit(const StreamStmt *op, const Stmt &s) {
-  Expr value = mutate(op->value);
-  if (value.same_as(op->value)) {
-    stmt = s;
-  } else {
-    stmt = StreamStmt::make(op->buffer_var, value,
-                            op->stream_type, op->depth);
-  }
-}
-
-void IRMutator::visit(const StreamExpr *op, const Expr &e) {
-  expr = e;
-}
-
 void IRMutator::visit(const Return *op, const Stmt &s) {
   Expr value = mutate(op->value);
   if (value.same_as(op->value)) {
diff --git a/tvm/HalideIR/src/ir/IRMutator.h b/tvm/HalideIR/src/ir/IRMutator.h
index 4088ae5ea..1fea5fec6 100644
--- a/tvm/HalideIR/src/ir/IRMutator.h
+++ b/tvm/HalideIR/src/ir/IRMutator.h
@@ -99,8 +99,6 @@ class IRMutator : public IRVisitor {
     EXPORT virtual void visit(const Reuse *, const Stmt &);
     EXPORT virtual void visit(const Partition *, const Stmt &);
     EXPORT virtual void visit(const Stencil *, const Stmt &);
-    EXPORT virtual void visit(const StreamExpr *, const Expr &);
-    EXPORT virtual void visit(const StreamStmt *, const Stmt &);
 };
 
 
diff --git a/tvm/HalideIR/src/ir/IRPrinter.cpp b/tvm/HalideIR/src/ir/IRPrinter.cpp
index b6f3e6082..6a3a5d651 100644
--- a/tvm/HalideIR/src/ir/IRPrinter.cpp
+++ b/tvm/HalideIR/src/ir/IRPrinter.cpp
@@ -336,19 +336,6 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
     }
 });
 
-TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
-.set_dispatch<StreamStmt>([](const StreamStmt *op, IRPrinter* p) {
-    p->do_indent();
-    p->stream << op->buffer_var << ".write(";
-    p->print(op->value);
-    p->stream << ")\n";
-});
-
-TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
-.set_dispatch<StreamExpr>([](const StreamExpr *op, IRPrinter* p) {
-    p->stream << op->buffer_var << ".read()";
-});
-
 TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 .set_dispatch<Ramp>([](const Ramp *op, IRPrinter* p) {
     p->stream << "ramp(";
@@ -736,16 +723,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
     p->do_indent();
     p->stream << "def " << op->name << "(";
     for (size_t i = 0; i < op->args.size(); i++) {
-        p->stream << op->args[i].type() << "("; // handle type
         p->print(op->args[i]);
-        if (op->api_args[i].size() > 1) {
-          p->stream << "[";
-          for (size_t j = 0; j < op->api_args[i].size(); j++) {
-            p->print(op->api_args[i][j]);
-            if (j < op->api_args[i].size() - 1) p->stream << "*";
-          }
-          p->stream << "])";
-        }
         if (i < op->args.size() - 1) {
             p->stream << ", ";
         }
diff --git a/tvm/HalideIR/src/ir/IRVisitor.cpp b/tvm/HalideIR/src/ir/IRVisitor.cpp
index 30e1fe86b..02880fdb4 100644
--- a/tvm/HalideIR/src/ir/IRVisitor.cpp
+++ b/tvm/HalideIR/src/ir/IRVisitor.cpp
@@ -137,9 +137,6 @@ void IRVisitor::visit(const Let *op, const Expr &) {
     op->body.accept(this);
 }
 
-void IRVisitor::visit(const StreamExpr *op, const Expr &) {
-}
-
 void IRVisitor::visit(const LetStmt *op, const Stmt &) {
     op->value.accept(this);
     op->body.accept(this);
@@ -172,10 +169,6 @@ void IRVisitor::visit(const Store *op, const Stmt &) {
     op->predicate.accept(this);
 }
 
-void IRVisitor::visit(const StreamStmt *op, const Stmt &) {
-    op->value.accept(this);
-}
-
 void IRVisitor::visit(const Provide *op, const Stmt &) {
     op->value.accept(this);
     for (size_t i = 0; i < op->args.size(); i++) {
@@ -273,10 +266,6 @@ void IRVisitor::visit(const Quantize *op, const Expr &) {
 void IRVisitor::visit(const KernelDef *op, const Stmt &) {
   for (size_t i = 0; i < op->args.size(); i++) {
     op->args[i].accept(this);
-    op->api_types[i].accept(this);
-    for (size_t j = 0; j < op->api_args[i].size(); j++) {
-      op->api_args[i][j].accept(this);
-    }
   }
   op->ret_void.accept(this);
 }
@@ -585,10 +574,6 @@ void IRGraphVisitor::visit(const Quantize *op, const Expr &) {
 void IRGraphVisitor::visit(const KernelDef *op, const Stmt &) {
   for (size_t i = 0; i < op->args.size(); i++) {
     include(op->args[i]);
-    include(op->api_types[i]);
-    for (size_t j = 0; j < op->api_args[i].size(); j++) {
-      include(op->api_args[i][j]);
-    }
   }
   include(op->ret_void);
 }
@@ -622,12 +607,6 @@ void IRGraphVisitor::visit(const Reuse *op, const Stmt &) {
 
 void IRGraphVisitor::visit(const Partition *op, const Stmt &) {}
 
-void IRGraphVisitor::visit(const StreamExpr *op, const Expr &) {}
-
-void IRGraphVisitor::visit(const StreamStmt *op, const Stmt &) {
-  include(op->value);
-}
-
 void IRGraphVisitor::visit(const Stencil *op, const Stmt &) {
   include(op->body);
 }
diff --git a/tvm/HalideIR/src/ir/IRVisitor.h b/tvm/HalideIR/src/ir/IRVisitor.h
index a4faa4aba..931f1c5c9 100644
--- a/tvm/HalideIR/src/ir/IRVisitor.h
+++ b/tvm/HalideIR/src/ir/IRVisitor.h
@@ -79,8 +79,6 @@ class IRVisitor {
     EXPORT virtual void visit(const Reuse *, const Stmt &);
     EXPORT virtual void visit(const Partition *, const Stmt &);
     EXPORT virtual void visit(const Stencil *, const Stmt &);
-    EXPORT virtual void visit(const StreamStmt *, const Stmt &);
-    EXPORT virtual void visit(const StreamExpr *, const Expr &);
 };
 
 /** A base class for algorithms that walk recursively over the IR
@@ -161,8 +159,6 @@ class IRGraphVisitor : public IRVisitor {
     EXPORT virtual void visit(const Reuse *, const Stmt &);
     EXPORT virtual void visit(const Partition *, const Stmt &);
     EXPORT virtual void visit(const Stencil *, const Stmt &);
-    EXPORT virtual void visit(const StreamExpr *, const Expr &);
-    EXPORT virtual void visit(const StreamStmt *, const Stmt &);
     // @}
 };
 
diff --git a/tvm/Makefile b/tvm/Makefile
index 1b2030645..1a78cbe7c 100644
--- a/tvm/Makefile
+++ b/tvm/Makefile
@@ -126,13 +126,6 @@ else
 	CFLAGS += -DTVM_OPENCL_RUNTIME=0
 endif
 
-ifeq ($(USE_SDACCEL_HLS), 1)
-	CFLAGS += -DHCL_SDACCEL_RUNTIME=1
-else
-	CFLAGS += -DHCL_SDACCEL_RUNTIME=0
-endif
-
-
 ifeq ($(USE_VIVADO_HLS), 1)
 	CFLAGS += -DHCL_VHLS_RUNTIME=1
 else
diff --git a/tvm/include/tvm/codegen.h b/tvm/include/tvm/codegen.h
index 4d6be0230..3877db941 100644
--- a/tvm/include/tvm/codegen.h
+++ b/tvm/include/tvm/codegen.h
@@ -42,7 +42,6 @@ runtime::Module Build(const Array<LoweredFunc>& funcs,
  * \return cstr The C string representation of the file.
  */
 std::string PackImportsToC(const runtime::Module& m, bool system_lib);
-
 }  // namespace codegen
 }  // namespace TVM
 
diff --git a/tvm/include/tvm/ir.h b/tvm/include/tvm/ir.h
index 8a26e551c..e66db3fb4 100644
--- a/tvm/include/tvm/ir.h
+++ b/tvm/include/tvm/ir.h
@@ -21,8 +21,6 @@ using Halide::Internal::StmtNode;
 using Halide::Internal::IRNodeType;
 using Halide::Internal::ForType;
 using Halide::Internal::PartitionType;
-using Halide::Internal::StreamType;
-using Halide::Internal::DeviceType;
 using Halide::DeviceAPI;
 
 // Node container for CommReducer
@@ -234,8 +232,6 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
 constexpr const char* opengl_stage_scope = "opengl_stage_scope";
 
 constexpr const char* attach_scope = "attach_scope";
-
-constexpr const char* device_scope = "device_scope";
 }  // namespace attr
 
 /*! \brief namespace of TVM Intrinsic functions */
@@ -505,8 +501,6 @@ using Halide::Internal::Quantize;
 using Halide::Internal::KernelDef;
 using Halide::Internal::KernelExpr;
 using Halide::Internal::KernelStmt;
-using Halide::Internal::StreamExpr;
-using Halide::Internal::StreamStmt;
 using Halide::Internal::Return;
 using Halide::Internal::Break;
 using Halide::Internal::While;
diff --git a/tvm/include/tvm/ir_functor_ext.h b/tvm/include/tvm/ir_functor_ext.h
index 39ce6d2b8..c4f18ba7e 100644
--- a/tvm/include/tvm/ir_functor_ext.h
+++ b/tvm/include/tvm/ir_functor_ext.h
@@ -148,7 +148,6 @@ class ExprFunctor<R(const Expr& n, Args...)> {
   virtual R VisitExpr_(const SetSlice* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const Quantize* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const KernelExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT;
-  virtual R VisitExpr_(const StreamExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Node* op, Args ...) {
   LOG(FATAL) << "Do not have a default for " << op->type_key();
     return R();
@@ -194,7 +193,6 @@ class ExprFunctor<R(const Expr& n, Args...)> {
     IR_EXPR_FUNCTOR_DISPATCH(SetSlice);
     IR_EXPR_FUNCTOR_DISPATCH(Quantize);
     IR_EXPR_FUNCTOR_DISPATCH(KernelExpr);
-    IR_EXPR_FUNCTOR_DISPATCH(StreamExpr);
     return vtable;
   }
 };
@@ -246,7 +244,6 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
   virtual R VisitStmt_(const Evaluate* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const KernelDef* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const KernelStmt* op, Args... args) STMT_FUNCTOR_DEFAULT;
-  virtual R VisitStmt_(const StreamStmt* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const Return* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const Break* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const While* op, Args... args) STMT_FUNCTOR_DEFAULT;
@@ -278,7 +275,6 @@ class StmtFunctor<R(const Stmt& n, Args... args)> {
     IR_STMT_FUNCTOR_DISPATCH(Evaluate);
     IR_STMT_FUNCTOR_DISPATCH(KernelDef);
     IR_STMT_FUNCTOR_DISPATCH(KernelStmt);
-    IR_STMT_FUNCTOR_DISPATCH(StreamStmt);
     IR_STMT_FUNCTOR_DISPATCH(Return);
     IR_STMT_FUNCTOR_DISPATCH(Break);
     IR_STMT_FUNCTOR_DISPATCH(While);
diff --git a/tvm/include/tvm/ir_mutator.h b/tvm/include/tvm/ir_mutator.h
index 200534644..964684ec1 100644
--- a/tvm/include/tvm/ir_mutator.h
+++ b/tvm/include/tvm/ir_mutator.h
@@ -77,7 +77,6 @@ class TVM_DLL IRMutator {
   virtual Stmt Mutate_(const Reuse* op, const Stmt& s);
   virtual Stmt Mutate_(const Partition* op, const Stmt& s);
   virtual Stmt Mutate_(const Stencil* op, const Stmt& s);
-  virtual Stmt Mutate_(const StreamStmt* op, const Stmt& s);
 
   virtual Expr Mutate_(const Variable* op, const Expr& e);
   virtual Expr Mutate_(const Load* op, const Expr& e);
@@ -115,7 +114,6 @@ class TVM_DLL IRMutator {
   virtual Expr Mutate_(const SetSlice* op, const Expr& e);
   virtual Expr Mutate_(const Quantize* op, const Expr& e);
   virtual Expr Mutate_(const KernelExpr* op, const Expr& e);
-  virtual Expr Mutate_(const StreamExpr* op, const Expr& e);
 };
 
 /*!
diff --git a/tvm/include/tvm/ir_pass.h b/tvm/include/tvm/ir_pass.h
index dfba91d32..88c29f32c 100644
--- a/tvm/include/tvm/ir_pass.h
+++ b/tvm/include/tvm/ir_pass.h
@@ -214,14 +214,6 @@ Stmt StorageFlatten(Stmt stmt,
  */
 Stmt RemoveNoOp(Stmt stmt);
 
-/*!
- * \brief Infer device scope.
- * \param stmt The stmt to be trasnformed
- * \param bus_bandwidth The bandwisth of the stream bus
- * \return Transformed stmt.
- */
-Stmt InferStream(Stmt stmt, int bus_bandwidth);
-
 /*!
  * \brief Split statement into pipeine stages.
  * \param stmt The stmt to be splitted
diff --git a/tvm/include/tvm/ir_visitor.h b/tvm/include/tvm/ir_visitor.h
index 21ef77c32..6fe616aab 100644
--- a/tvm/include/tvm/ir_visitor.h
+++ b/tvm/include/tvm/ir_visitor.h
@@ -131,8 +131,6 @@ class TVM_DLL IRVisitor {
   virtual void Visit_(const KernelDef* op);
   virtual void Visit_(const KernelExpr* op);
   virtual void Visit_(const KernelStmt* op);
-  virtual void Visit_(const StreamExpr* op);
-  virtual void Visit_(const StreamStmt* op);
   virtual void Visit_(const Return* op);
   virtual void Visit_(const Break* op);
   virtual void Visit_(const While* op);
diff --git a/tvm/include/tvm/schedule.h b/tvm/include/tvm/schedule.h
index faacc7d96..9dc1956c8 100644
--- a/tvm/include/tvm/schedule.h
+++ b/tvm/include/tvm/schedule.h
@@ -351,31 +351,11 @@ class Schedule : public NodeRef {
                         const IterVar& axis,
                         int factor_axis = 0);
 
-  EXPORT Tensor reuse_at(const Tensor& target,
-      Stage parent,
+  EXPORT Tensor reuse_at(const Tensor& target, 
+      Stage parent, 
       IterVar axis,
       std::string name);
 
-  EXPORT void to_stage(const Tensor& target,
-                       Stage dest,
-                       int arg_pos,
-                       ir::StreamType stream_type,
-                       int channel_depth, 
-                       std::string name);
-
-  EXPORT Tensor move_to(const Tensor& target,
-                        ir::DeviceType device_type,
-                        ir::StreamType stream_type,
-                        int channel_depth, 
-                        std::string new_name);
-
-  EXPORT void stream_to(const Tensor& target,
-                        Stage dest,
-                        Stage source,
-                        ir::StreamType stream_type,
-                        int channel_depth, 
-                        std::string new_name);
-
   EXPORT Tensor partition(const Tensor& target, int dim, int factor,
                           ir::PartitionType partition_type);
 
@@ -401,8 +381,6 @@ class Schedule : public NodeRef {
   inline ScheduleNode* operator->();
   // declare container type
   using ContainerType = ScheduleNode;
-  // insertion point for host & xcel separation
-  static int split_bound;
 };
 
 /*!
diff --git a/tvm/src/api/api_ir.cc b/tvm/src/api/api_ir.cc
index 8edb1a0e8..825f7580d 100644
--- a/tvm/src/api/api_ir.cc
+++ b/tvm/src/api/api_ir.cc
@@ -176,20 +176,6 @@ TVM_REGISTER_API("make.Select")
       *ret = Node::make(args[0], args[1], args[2], args[3], args[4], args[5]);  \
     })                                                                          \
 
-#define REGISTER_MAKE7(Node)                                                    \
-  TVM_REGISTER_API("make."#Node)                                                \
-  .set_body([](TVMArgs args,  TVMRetValue *ret) {                               \
-      *ret = Node::make(args[0], args[1], args[2], args[3],                     \
-                        args[4], args[5], args[6]);                             \
-    })                                                                          \
-
-#define REGISTER_MAKE8(Node)                                                    \
-  TVM_REGISTER_API("make."#Node)                                                \
-  .set_body([](TVMArgs args,  TVMRetValue *ret) {                               \
-      *ret = Node::make(args[0], args[1], args[2], args[3],                     \
-                        args[4], args[5], args[6], args[7]);                    \
-    })                                                                          \
-
 #define REGISTER_MAKE_BINARY_OP(Node)                        \
   TVM_REGISTER_API("make."#Node)                             \
   .set_body([](TVMArgs args,  TVMRetValue *ret) {            \
@@ -236,7 +222,7 @@ REGISTER_MAKE3(GetSlice);
 REGISTER_MAKE3(SetBit);
 REGISTER_MAKE4(SetSlice);
 REGISTER_MAKE2(Quantize);
-REGISTER_MAKE8(KernelDef);
+REGISTER_MAKE5(KernelDef);
 REGISTER_MAKE3(KernelExpr);
 REGISTER_MAKE2(KernelStmt);
 REGISTER_MAKE1(Return);
diff --git a/tvm/src/api/api_lang.cc b/tvm/src/api/api_lang.cc
index 543e816aa..f07d590a5 100644
--- a/tvm/src/api/api_lang.cc
+++ b/tvm/src/api/api_lang.cc
@@ -461,31 +461,6 @@ TVM_REGISTER_API("_SchedulePartition")
           static_cast<ir::PartitionType>(args[4].operator int()));
   });
 
-TVM_REGISTER_API("_ScheduleMoveToStage")
-  .set_body([](TVMArgs args, TVMRetValue *ret) {
-    args[0].operator Schedule()
-      .to_stage(args[1], args[2], args[3], 
-         static_cast<ir::StreamType>(args[4].operator int()),
-           args[5], args[6]);
-  });
-
-TVM_REGISTER_API("_ScheduleMove")
-  .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = args[0].operator Schedule()
-        .move_to(args[1], 
-          static_cast<ir::DeviceType>(args[2].operator int()),  
-            static_cast<ir::StreamType>(args[3].operator int()),
-              args[4], args[5]);
-  });
-
-TVM_REGISTER_API("_ScheduleStream")
-  .set_body([](TVMArgs args, TVMRetValue *ret) {
-    args[0].operator Schedule()
-      .stream_to(args[1], args[2], args[3], 
-         static_cast<ir::StreamType>(args[4].operator int()),
-           args[5], args[6]);
-  });
-
 TVM_REGISTER_API("_ScheduleReshape")
   .set_body([](TVMArgs args, TVMRetValue *ret) {
     args[0].operator Schedule().reshape(args[1], args[2]);
diff --git a/tvm/src/api/api_pass.cc b/tvm/src/api/api_pass.cc
index 1728b0c23..348b8816e 100644
--- a/tvm/src/api/api_pass.cc
+++ b/tvm/src/api/api_pass.cc
@@ -122,7 +122,6 @@ REGISTER_PASS1(InjectPrefetch);
 REGISTER_PASS2(InjectDoubleBuffer);
 REGISTER_PASS2(LoopPartition);
 REGISTER_PASS1(RemoveNoOp);
-REGISTER_PASS2(InferStream);
 REGISTER_PASS2(SplitPipeline);
 REGISTER_PASS2(LiftAttrScope);
 REGISTER_PASS1(NarrowChannelAccess);
diff --git a/tvm/src/codegen/build_common.cc b/tvm/src/codegen/build_common.cc
deleted file mode 100644
index 8bdbf7e98..000000000
--- a/tvm/src/codegen/build_common.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file build_common.cc
- * \brief Build unified simulation module
- */
-#include <tvm/base.h>
-#include <tvm/ir_visitor.h>
-#include <tvm/runtime/config.h>
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/build_module.h>
-#include "./build_common.h"
-#include "./build_util.h"
-
-#include <fstream>
-#include <unistd.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <iostream>
-
-#include "merlinc/codeanalys_merlinc.h"
-#include "hlsc/codegen_vhls.h"
-#include "opencl/codegen_aocl.h"
-#include "ppac/codegen_rv64_ppac.h"
-
-namespace TVM {
-namespace runtime {
-
-class SimModuleNode final : public ModuleNode {
- public:
-  SimModuleNode(LoweredFunc func, 
-                std::string host_code,
-                argInfo arg_info,
-                std::string dev_code, std::string platform, 
-                std::unordered_map<std::string, std::string> options)
-    : func_(func), 
-      host_(host_code), 
-      arg_info_(arg_info),
-      dev_(dev_code), 
-      platform_(platform), 
-      options_(options) { 
-  }
-
-  const char* type_key() const {
-    return "unified_sim";
-  }
-
-  // unified simulation function
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
-    return PackedFunc([this](TVMArgs args, TVMRetValue* rv){
-        
-        if (args.size() != (int)func_->args.size())
-          LOG(FATAL) << "The function should take in " << func_->args.size() 
-                     << " inputs but get " << args.size();
-        std::vector<int> shmids;
-        std::vector<size_t> arg_sizes;
-        std::vector<TVMType> arg_types;
-
-        CollectArgInfo(args, func_, arg_sizes, arg_types);
-        GenSharedMem(args, shmids, arg_sizes);
-
-        LOG(CLEAN) << "Generating harness files ...";
-        system("rm -rf __tmp__; mkdir __tmp__");
-        std::string path; 
-        if (const auto* f = Registry::Get("get_util_path")) 
-          path = (*f)(platform_).operator std::string();
-        system(("cp -r " + path + "/* __tmp__/").c_str());
-        LOG(CLEAN) << "Running SW simulation on " + platform_;
-
-        if (platform_ == "sdaccel") {
-          GenWrapperCode(args, shmids, arg_types, arg_info_, func_);
-          GenHostCode(args, shmids, arg_types, func_, 
-                      platform_, host_, arg_info_);
-          GenKernelCode(dev_);
-
-          LOG(CLEAN) << "Running SW simulation ...";
-          system("cd __tmp__; source ./run_sw.sh");
-
-        } else if (platform_ == "rocket") {
-          // generate host and run proxy kernel test 
-          GenHostCode(args, shmids, arg_types, func_, 
-                      platform_, host_, arg_info_);
-          std::string compile = "cd __tmp__;";
-          compile += std::string("autoconf; mkdir build; cd build;") +
-                     std::string("../configure --with-riscvtools=") + 
-                     options_["RISCV"] + std::string(";make -j8");
-          system(compile.c_str());
-
-        } else if (platform_ == "vivado_hls") {
-          GenHostCode(args, shmids, arg_types, func_, 
-                      platform_, host_, arg_info_);
-          GenKernelCode(dev_);
-          system("cd __tmp__; make csim");
-        } else {
-          LOG(FATAL) << "unrecognized platform " << platform_;  
-        } 
-
-        // clean & extract resource information
-        FreeSharedMem(args, shmids, arg_sizes);
-        if (const auto* f = Registry::Get("tvm_callback_syn_postproc")) {
-          std::string code;
-          code = (*f)("test").operator std::string();
-          LOG(CLEAN) << "extract res info";
-        }
-      });
-  }
-
- private:
-  LoweredFunc func_;
-  std::string host_;
-  argInfo arg_info_;
-  std::string dev_;
-  std::string platform_;
-  std::unordered_map<std::string, std::string> options_;
-};
-
-using var2nameType = std::unordered_map<const Variable*, 
-    std::tuple<std::string, Type, std::vector<int>>>; 
-
-Module CreateSimModule(
-    LoweredFunc func,
-    std::string host_code,
-    std::string dev_code,
-    argInfo arg_types,
-    std::string platform, 
-    std::unordered_map<std::string, std::string> options) {
-  std::shared_ptr<SimModuleNode> n =
-    std::make_shared<SimModuleNode>(func, host_code, 
-                                    arg_types, dev_code,
-                                    platform, options);
-  return Module(n);
-}
-} // namespace runtime
-
-namespace codegen {
-using var2nameType = std::unordered_map<const Variable*, 
-    std::tuple<std::string, Type, std::vector<int>>>; 
-
-using argInfo = 
-    std::vector<std::tuple<std::string, bool, Type, std::vector<int>>>;
-
-// unified simulation function for diff platforms 
-template<class CGHost, class CGXcel>
-runtime::Module BuildSimModule(Array<LoweredFunc> funcs,
-                               Array<Expr> attrs,
-                               Array<Expr> values) {
-  CodeAnalysMerlinC ca;
-  CGHost cg_host;
-  CGXcel cg_dev;
-  
-  for (LoweredFunc f : funcs) {
-    ca.AddFunction(f);
-    str2tupleMap<std::string, Type> map_arg_type;
-    map_arg_type = ca.Finish();
-    cg_host.AddFunction(f, map_arg_type);
-    cg_dev.AddFunction(f, map_arg_type);
-  }
-  // vector {vars} 
-  auto& arg_vars = cg_dev.arg_vars;
-  // map {var : is_streamed(bool) }
-  auto& stream_table = cg_dev.stream_table;
-  // map {var : (vid, Type, shape)}
-  auto& arg_top_vars = cg_dev.arg_top_vars;
-
-  argInfo arg_info;
-  for (size_t i = 0 ; i < arg_vars.size(); i++) {
-    auto v = arg_vars[i];
-    auto nameType = arg_top_vars[v];
-    bool is_stream;
-    if (stream_table[v])
-      is_stream = true;
-    else is_stream = false;
-    auto item = std::make_tuple(
-        /*var name*/std::get<0>(nameType),
-        /*whether is streamed*/is_stream, 
-        /*data type*/std::get<1>(nameType), 
-        /*shape*/std::get<2>(nameType));
-    arg_info.push_back(item);
-  }
-  // tool option mapping and platform 
-  std::string platform = values[0].as<StringImm>()->value;
-  std::unordered_map<std::string, std::string> options;
-  for (size_t k = 1; k < attrs.size(); k++) {
-    auto key = attrs[k].as<StringImm>()->value;
-    auto val = values[k].as<StringImm>()->value;
-    options[key] = val;
-  }
-  return runtime::CreateSimModule(funcs[0], 
-                                  cg_host.GetHost(),
-                                  cg_dev.GetDevice(),
-                                  arg_info, platform, options);
-}
-
-TVM_REGISTER_API("codegen.build_sim")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    // dispatch to corr codegen
-    auto& sptr = args[2].node_sptr();
-    CHECK(sptr->is_type<ArrayNode>());
-    auto* n = static_cast<const ArrayNode*>(sptr.get());
-    auto data = n->data[static_cast<size_t>(0)];
-
-    // create module node for simulation 
-    std::string type = Expr(data).as<StringImm>()->value;
-    if (type == "rocket") {
-      *rv = BuildSimModule<CodeGenRV64PPAC, CodeGenRV64PPAC>
-                (args[0], args[1], args[2]);
-    } else if (type == "sdaccel") {
-      *rv = BuildSimModule<CodeGenAOCL, CodeGenVivadoHLS>
-                (args[0], args[1], args[2]);
-    } else if (type == "vivado_hls") {
-      *rv = BuildSimModule<CodeGenVivadoHLS, CodeGenVivadoHLS>
-                (args[0], args[1], args[2]);
-    } else {
-    }
-  });
-
-}  // namespace codegen
-}  // namespace TVM
diff --git a/tvm/src/codegen/build_common.h b/tvm/src/codegen/build_common.h
index f9f42d219..ee8cbc509 100644
--- a/tvm/src/codegen/build_common.h
+++ b/tvm/src/codegen/build_common.h
@@ -29,7 +29,6 @@ ExtractFuncInfo(const Array<LoweredFunc>& funcs) {
   }
   return fmap;
 }
-
 }  // namespace codegen
 }  // namespace TVM
 #endif  // TVM_CODEGEN_BUILD_COMMON_H_
diff --git a/tvm/src/codegen/build_opencl.cc b/tvm/src/codegen/build_opencl.cc
new file mode 100644
index 000000000..5054085cd
--- /dev/null
+++ b/tvm/src/codegen/build_opencl.cc
@@ -0,0 +1,44 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ *  Build opencl modules from source.
+ * \file build_opencl.cc
+ */
+#include <tvm/base.h>
+#include <tvm/runtime/config.h>
+#include "./codegen_opencl.h"
+#include "./build_common.h"
+
+#if TVM_OPENCL_RUNTIME
+#include "../runtime/opencl/opencl_module.h"
+#endif   // TVM_OPENCL_RUNTIME
+
+namespace TVM {
+namespace codegen {
+
+runtime::Module BuildOpenCL(Array<LoweredFunc> funcs) {
+  using TVM::runtime::Registry;
+  bool output_ssa = false;
+  CodeGenOpenCL cg;
+  cg.Init(output_ssa);
+  for (LoweredFunc f : funcs) {
+    cg.AddFunction(f);
+  }
+  std::string code = cg.Finish();
+
+  if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) {
+    code = (*f)(code).operator std::string();
+  }
+#if TVM_OPENCL_RUNTIME
+  return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(funcs));
+#else
+  LOG(WARNING) << "OpenCL runtime not enabled, return a source module...";
+  return DeviceSourceModuleCreate(code, "cl", ExtractFuncInfo(funcs), "opencl");
+#endif   // TVM_OPENCL_RUNTIME
+}
+
+TVM_REGISTER_API("codegen.build_opencl")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildOpenCL(args[0]);
+  });
+}  // namespace codegen
+}  // namespace TVM
diff --git a/tvm/src/codegen/build_util.cc b/tvm/src/codegen/build_util.cc
deleted file mode 100644
index e0a5f8b2d..000000000
--- a/tvm/src/codegen/build_util.cc
+++ /dev/null
@@ -1,812 +0,0 @@
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file build_common.cc
- * \brief Build unified simulation module
- */
-#include <tvm/base.h>
-#include <tvm/ir_visitor.h>
-#include <tvm/runtime/config.h>
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/build_module.h>
-#include "./build_common.h"
-#include "./build_util.h"
-
-#include <fstream>
-#include <unistd.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <iostream>
-
-#include "merlinc/codeanalys_merlinc.h"
-#include "hlsc/codegen_vhls.h"
-#include "opencl/codegen_aocl.h"
-#include "ppac/codegen_rv64_ppac.h"
-
-namespace TVM {
-namespace runtime {
-
-std::string getpath(void) {
-   char buff[256];
-   getcwd(buff, 256);
-   std::string cwd(buff);
-   return cwd;
-}
-
-void PrintIndent(std::ofstream& stream, int indent) {
-  for (int i = 0; i < indent; i++)
-    stream << ' ';
-}
-
-inline size_t GetTypeSize(TVMType t) {
-  size_t byte = (t.bits + 7) / 8;
-  if (byte > 2){
-    if (byte <= 4) byte = 4;
-    else if (byte <= 8) byte = 8;
-    else byte = 16;
-  }
-  return byte;
-}
-
-inline size_t GetDataSize(TVMArray* arr) {
-  size_t size = 1;
-  for (tvm_index_t i = 0; i < arr->ndim; ++i) {
-    size *= arr->shape[i];
-  }
-  size_t byte = (arr->dtype.bits + 7) / 8;
-  if (byte > 2){
-    if (byte <= 4) byte = 4;
-    else if (byte <= 8) byte = 8;
-    else byte = 16;
-  }
-  size *= (byte * 8 * arr->dtype.lanes + 7) / 8;
-  return size;
-}
-
-inline TVMType Type2TVMType(Type t) {
-  TVMType tt;
-  if (t.is_int())        tt.code = kDLInt;
-  else if (t.is_uint())  tt.code = kDLUInt;
-  else if (t.is_float()) tt.code = kDLFloat;
-  else                   LOG(FATAL) << "Unacceptable type: " << t;
-  tt.bits = static_cast<uint8_t>(t.bits());
-  tt.fracs = static_cast<uint8_t>(t.fracs());
-  return tt;
-}
-
-inline std::string PrintHalideType(Type t) {
-  std::string str = "";
-  if (t.is_uint() || t.is_int() || t.is_fixed() || t.is_ufixed()) {
-    if (t.is_uint())        str += "ap_uint<" + std::to_string(t.bits()) + ">";
-    else if (t.is_int())    str += "ap_int<" + std::to_string(t.bits()) + ">";
-    else if (t.is_ufixed()) str += "ap_ufixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">";
-    else                    str += "ap_fixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">";
-  } else {
-    LOG(FATAL) << "Cannot convert type " << t << " to C type";
-  }
-  return str;
-}
-
-inline std::string Type2Str(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLInt) {
-    if (t.fracs > 0) str += "ap_fixed<";
-    else             str += "ap_int<";
-    str += std::to_string(static_cast<int>(t.bits));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits - t.fracs)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLUInt) {
-    if (t.fracs > 0) str += "ap_ufixed<";
-    else             str += "ap_uint<";
-    str += std::to_string(static_cast<int>(t.bits));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits - t.fracs)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLFloat) {
-    str += "float";
-  } else {
-    LOG(FATAL) << "Unknown type";
-  }
-  return str;
-}
-
-inline std::string Type2ExtStr(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLInt) {
-    if (t.fracs > 0) str += "ap_fixed<";
-    else             str += "ap_int<";
-    str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLUInt) {
-    if (t.fracs > 0) str += "ap_ufixed<";
-    else             str += "ap_uint<";
-    str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLFloat) {
-    str += "float";
-  } else {
-    LOG(FATAL) << "Unknown type";
-  }
-  return str;
-}
-
-inline std::string Type2WrapStr(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLInt) {
-    if (t.fracs > 0) {
-      str += "ap_fixed<";
-      str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    } else {
-      str += "ap_int<";
-      if      (t.bits <= 8)  str += std::to_string(static_cast<int>(t.bits));
-      else if (t.bits <= 16) str += "16";
-      else if (t.bits <= 32) str += "32";
-      else                   str += "64";
-    }     
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLUInt) {
-    if (t.fracs > 0) {
-      str += "ap_ufixed<";
-      str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    } else {
-      str += "ap_uint<";
-      if      (t.bits <= 8)  str += std::to_string(static_cast<int>(t.bits));
-      else if (t.bits <= 16) str += "16";
-      else if (t.bits <= 32) str += "32";
-      else                   str += "64"; 
-    }
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLFloat) {
-    str += "float";
-  } else {
-    LOG(FATAL) << "Unknown type";
-  }
-  return str;
-}
-
-inline std::string Type2Byte(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLFloat) {
-    str += "float";
-  } else if (t.code == kDLInt || t.code == kDLUInt) {
-    if (t.code == kDLUInt) str += "u";
-    str += "int";
-    if      (t.bits <= 8)  str += "8";
-    else if (t.bits <= 16) str += "16";
-    else if (t.bits <= 32) str += "32";
-    else                   str += "64";
-    str += "_t";
-  }
-  return str;
-}
-
-void CollectArgInfo(TVMArgs& args, 
-                    LoweredFunc func,
-                    std::vector<size_t>& arg_sizes,
-                    std::vector<TVMType>& arg_types) {
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      arg_sizes.push_back(GetDataSize(arr));
-      arg_types.push_back(arr->dtype);
-    } else {
-      const Variable* var = func->api_args[i].as<Variable>();
-      TVMType t = Type2TVMType(var->type);
-      arg_sizes.push_back(GetTypeSize(t));
-      arg_types.push_back(t);
-    }
-  }
-}
-
-void GenSharedMem(TVMArgs& args,
-                  std::vector<int>& shmids,
-                  std::vector<size_t>& arg_sizes) {
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      // generate shared memory key and id
-      // TODO: maybe get the current path??
-      key_t key = ftok("/", i+1);
-      int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT);
-      shmids.push_back(shmid);
-      // copy mem from TVM args to the shared memory
-      void* mem = shmat(shmid, nullptr, 0);
-      memcpy(mem, arr->data, arg_sizes[i]);
-    } else {
-      shmids.push_back(0);
-    }
-  }
-}
-
-void FreeSharedMem(TVMArgs& args, 
-                   const std::vector<int>& shmids,
-                   std::vector<size_t>& arg_sizes) {
-  for (size_t i = 0; i < shmids.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      int shmid = shmids[i];
-      void* mem = shmat(shmid, nullptr, 0);
-      memcpy(arr->data, mem, arg_sizes[i]);
-      shmdt(mem);
-      shmctl(shmid, IPC_RMID, nullptr);
-    }
-  }
-}
-
-// copy values from the shared mem to local mem
-void PrintCopy(TVMArray* arr, 
-               argInfo& arg_info,
-               std::ofstream& stream, 
-               int indent, size_t nth_arr) {
-  for (int i = 0; i < arr->ndim; i++) {
-    PrintIndent(stream, indent);
-    stream << "for (size_t i" << i << " = 0; ";
-    stream << "i" << i << " < " << arr->shape[i] << "; ";
-    stream << "i" << i << "++) {\n";
-    indent += 2;
-    if (i == arr->ndim - 1) {
-      PrintIndent(stream, indent);
-      stream << std::get<0>(arg_info[nth_arr]);
-      stream << "[i" << arr->ndim-1;
-      int mul2 = 1;
-      for (int j = arr->ndim-2; j >= 0; j--) {
-        mul2 *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul2;
-      }
-      stream << "]";
-
-      stream << " = (";
-      // stream << Type2ExtStr(arr->dtype);
-      stream << Type2Byte(arr->dtype);
-
-      stream << ")(arg_" << nth_arr;
-      stream << "[i" << arr->ndim-1;
-      int mul = 1;
-      for (int j = arr->ndim-2; j >= 0; j--) {
-        mul *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul;
-      }
-      stream << "])";
-      if (arr->dtype.fracs > 0)
-        stream << " >> " << static_cast<int>(arr->dtype.fracs);
-      stream << ";\n";
-    }
-  }
-  for (int i = 0; i < arr->ndim; i++) {
-    indent -= 2;
-    PrintIndent(stream, indent);
-    stream << "}\n";
-  }
-}
-
-// copy values from local mem back to shared mem
-void PrintCopyBack(TVMArray* arr, 
-                   argInfo& arg_info,
-                   std::ofstream& stream, 
-                   int indent, size_t nth_arr) {
-  for (int i = 0; i < arr->ndim; i++) {
-    PrintIndent(stream, indent);
-    stream << "for (size_t i" << i << " = 0; ";
-    stream << "i" << i << " < " << arr->shape[i] << "; ";
-    stream << "i" << i << "++) {\n";
-    indent += 2;
-    if (i == arr->ndim-1) {
-      PrintIndent(stream, indent);
-      stream << "arg_" << nth_arr;
-      stream << "[i" << arr->ndim-1;
-      int mul = 1;
-      for (int j = arr->ndim-2; j >= 0; j--) {
-        mul *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul;
-      }
-      stream << "] = (";
-      stream << Type2Byte(arr->dtype);
-      stream << ")(" << std::get<0>(arg_info[nth_arr]);
-      stream << "[i" << arr->ndim - 1;
-      int mul2 = 1;
-      for (int j = arr->ndim-2; j >= 0; j--) {
-        mul2 *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul2;
-      }
-
-      stream << "])";
-      if (arr->dtype.fracs > 0)
-        stream << " << " << static_cast<int>(arr->dtype.fracs);
-      stream << ";\n";
-    }
-  }
-  for (int i = 0; i < arr->ndim; i++) {
-    indent -= 2;
-    PrintIndent(stream, indent);
-    stream << "}\n";
-  }
-}
-
-void GenKernelCode(std::string test_file) {
-  std::ofstream stream;
-  stream.open("__tmp__/kernel.cpp");
-  stream << test_file;
-  stream.close();
-}
-
-// interface pragma to specify mem and ctrl interface in sdx
-void GenWrapperCode(TVMArgs& args,
-                 const std::vector<int>& shmids,
-                 const std::vector<TVMType>& arg_types,
-                 argInfo& arg_stream_types,
-                 LoweredFunc func) {
-  std::ofstream stream;
-  int indent = 0;
-  std::string path(getenv("PWD"));
-  stream.open("__tmp__/interface.cpp");
-  stream << "#include <stdio.h>\n";
-  stream << "#include \"" + path + "/__tmp__/kernel.cpp\"\n";
-  stream << "\n\n";
-  stream << "extern \"C\" \n";
-  stream << "{\n";
-  indent += 2;
-  PrintIndent(stream, indent);
-
-  // wrapper func interface
-  stream << "void App( ";
-  size_t ex_arg_count = 0;
-  ex_arg_count = arg_stream_types.size() - arg_types.size();
-  for (size_t i = 0; i < arg_types.size(); i++) {
-    if (i != 0) stream << ", ";
-    stream << Type2WrapStr(arg_types[i]);
-    stream << "*";
-    stream << " source_wrapper_" << i;
-  }
-  for (size_t k = 0; k < ex_arg_count; k++) {
-    if (k != ex_arg_count) stream << ", ";
-    stream << PrintHalideType(std::get<2>(arg_stream_types[k + arg_types.size()])); 
-    stream << "*";
-    stream << " source_wrapper_" << k + arg_types.size();
-  }  
-  stream << " ) {\n";
-
-  // memeory and control pragma 
-  for (size_t i = 0; i < arg_stream_types.size(); i++) {
-    std::string interface;
-    if (std::get<1>(arg_stream_types[i])) interface = " m_axi ";
-    else interface = " m_axi ";
-    PrintIndent(stream, indent);
-    stream << "#pragma HLS INTERFACE" + interface + "port=";
-    stream << "source_wrapper_" << i;
-    stream << " offset=slave bundle=gmem\n";
-  }
-  for (size_t i = 0; i < arg_stream_types.size(); i++) {
-    std::string interface;
-    if (std::get<1>(arg_stream_types[i])) interface = " s_axilite ";
-    else interface = " s_axilite ";
-    PrintIndent(stream, indent);
-    stream << "#pragma HLS INTERFACE" + interface + "port=";
-    stream << "source_wrapper_" << i;
-    stream << " bundle=control\n";
-  }
-  PrintIndent(stream, indent);
-  stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n";
-  stream << "\n";
-
-  // intermediate vars init alloc 
-  for (size_t i = 0; i < arg_stream_types.size(); i++) {
-    PrintIndent(stream, indent);
-    stream << PrintHalideType(std::get<2>(arg_stream_types[i]));
-    stream << " source_wrapper_temp_" << i;
-    auto shape = std::get<3>(arg_stream_types[i]);
-    for (size_t j = 0; j < shape.size(); j++) 
-      stream << "[" << shape[j] << "]";
-    if (shape.size() == 0) stream << "[1]";
-    stream << ";\n";
-  }
-
-  // vars init for values
-  for (size_t i = 0; i < arg_stream_types.size(); i++) {
-    auto shape = std::get<3>(arg_stream_types[i]);
-    for (size_t j = 0; j < shape.size(); j++) {
-      PrintIndent(stream, indent);
-      stream << "for (int i" << j << " = 0; ";
-      stream << "i" << j << " < " << shape[j] << "; ";
-      stream << "i" << j << "++) {\n";
-      indent += 2;
-      if (j == shape.size() - 1) {
-        PrintIndent(stream, indent);
-        stream << "source_wrapper_temp_" << i;
-        for (size_t k = 0; k < shape.size(); k++) {
-          stream << "[i" << k << "]";
-        }
-        stream << " = ";
-        stream << "source_wrapper_" << i;
-        stream << "[i" << shape.size() - 1;
-        int mul = 1;
-        for (size_t k = shape.size() - 1; k > 0; k--) {
-          mul *= shape[k];
-          stream << "+ i" << k - 1 << "*" << mul;
-        }
-        stream << "];\n";
-      }
-    }
-    for (size_t j = 0; j < shape.size(); j++) {
-      indent -= 2;
-      PrintIndent(stream, indent);
-      stream << "}\n";
-    }
-    if (shape.size() == 0) {
-      PrintIndent(stream, indent);
-      stream << "source_wrapper_temp_" << i;
-      stream << "[0] = source_wrapper_" << i << "[0];\n";
-    }
-  }
-
-  // print top func
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "top( ";
-  for (size_t i = 0;i < arg_stream_types.size(); i++) {
-    if (i != arg_stream_types.size() - 1){
-      stream << "source_wrapper_temp_" << i;
-      stream << ", ";
-    } else {
-      stream << "source_wrapper_temp_" << i;
-      stream << ");\n";
-    }
-
-  }
-  stream << "\n";
-
-  // read back return val
-  for (int k = arg_stream_types.size() - 1; 
-       k > args.size() - 2; k--) {
-    auto shape = std::get<3>(arg_stream_types[k]);
-    for (size_t i = 0; i < shape.size(); i++) {
-      PrintIndent(stream, indent);
-      stream << "for (int i" << i << " = 0; ";
-      stream << "i" << i << " < " << shape[i] <<  "; ";
-      stream << "i" << i << "++) {\n";
-      indent += 2;
-    
-      if (i == shape.size() - 1) {
-        PrintIndent(stream, indent);
-        stream << "source_wrapper_" << k;
-        stream << "[i" << shape.size() - 1;
-        int mul = 1;
-        for (size_t j = shape.size() - 1; j > 0; j--) {
-          mul *= shape[j];
-          stream << " + i" << j - 1 << "*" << mul;
-        }
-        stream << " ] = ";
-    
-        stream << "source_wrapper_temp_" << k;
-        for (size_t j = 0; j < shape.size(); j++) {
-          stream << "[i" << j << "]";
-        }
-        stream <<";\n";
-      }
-    }
-    for (size_t i = 0;i < shape.size(); i++) {
-        indent -= 2;
-        PrintIndent(stream, indent);
-        stream << "}\n";
-    }
-  }
-  stream << "}\n";
-  indent -= 2;
-  stream << "}\n";
-  stream.close();
-}
-
-// generate opencl wrapper for sdaccel sim
-void GenHostHeaders(std::ofstream& stream,
-                    std::string platform) {
-  stream << "#include <sys/ipc.h>\n";
-  stream << "#include <sys/shm.h>\n\n";
-  stream << "// standard C/C++ headers\n";
-  stream << "#include <cstdio>\n";
-  stream << "#include <cstdlib>\n";
-  stream << "#include <getopt.h>\n";
-  stream << "#include <string>\n";
-  stream << "#include <time.h>\n";
-  stream << "#include <sys/time.h>\n\n";
-  
-  if (platform == "sdaccel") {
-    stream << "// opencl harness headers\n";
-    stream << "#include \"CLWorld.h\"\n";
-    stream << "#include \"CLKernel.h\"\n";
-    stream << "#include \"CLMemObj.h\"\n";
-    stream << "#include \"utils.h\"\n";
-    stream << "// harness namespace\n";
-    stream << "using namespace rosetta;\n";
-  } else if (platform == "vivado_hls") {
-    stream << "// vivado hls headers\n";
-    stream << "#include <ap_int.h>\n";
-    stream << "#include <ap_fixed.h>\n";
-    stream << "#include <hls_stream.h>\n";
-    stream << "#include \"kernel.cpp\"\n\n";
-  }
-}
-
-// initialization before executing kernel 
-void KernelInit(std::ofstream& stream,
-                std::string platform,
-                TVMArgs& args, 
-                const std::vector<TVMType>& arg_types,
-                argInfo& arg_stream_types) {
-  int indent = 2;
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// parse command line arguments for opencl version\n";
-  PrintIndent(stream, indent);
-  stream << "std::string kernelFile(\"\");\n";
-  PrintIndent(stream, indent);
-  stream << "parse_sdaccel_command_line_args(argc, argv, kernelFile);\n";
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// create OpenCL world\n";
-  PrintIndent(stream, indent);
-  stream << "CLWorld world = CLWorld(TARGET_DEVICE, CL_DEVICE_TYPE_ACCELERATOR);\n";
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// add the bitstream file\n";
-  PrintIndent(stream, indent);
-  stream << "dworld.addProgram(kernelFile);\n";
-  stream << "\n\n";
-  PrintIndent(stream, indent);
-  stream << "// create kernels\n";
-  PrintIndent(stream, indent);
-  stream << "CLKernel App(world.getContext(), world.getProgram(), \"App\", world.getDevice());\n";
-  stream << "\n\n";
-
-  PrintIndent(stream, indent);
-  stream << "// create mem objects\n";
-  for (int i = 0;i < args.size(); i++) {
-    PrintIndent(stream, indent);
-    stream << "CLMemObj source_" << i;
-    stream << "((void*)arg_top_" << i;
-    stream << ", sizeof(" << Type2Byte(arg_types[i]) << "), ";
-
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      for (int j = 0;j < arr->ndim;j++) {
-        if (j==0) {
-          stream << arr->shape[j] << " ";
-        } else {
-          stream << "* " << arr->shape[j];
-        }
-      }
-    } else {
-      stream << "1";
-    }
-    stream << ", ";
-    stream << "CL_MEM_READ_WRITE);\n";
-  }
-  // additional streamed data
-  for (size_t k = args.size(); k < arg_stream_types.size(); k++) {
-    auto type = std::get<2>(arg_stream_types[k]);
-    auto shape = std::get<3>(arg_stream_types[k]);
-    PrintIndent(stream, indent);
-    stream << "CLMemObj source_" << k;
-    stream << "((void*)knn_mat";
-    stream << ", sizeof(" << Type2Byte(Type2TVMType(type)) << "), ";
-    if (shape.size() > 0) {
-      for (size_t j = 0; j < shape.size(); j++) {
-        if (j == 0) {
-          stream << shape[j] << " ";
-        } else {
-          stream << "* " << shape[j];
-        }
-      }
-    } else {
-      stream << "1";
-    }
-    stream << ", ";
-    stream << "CL_MEM_READ_WRITE);\n";
-  }
-
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// add them to the world\n";
-  for (size_t i = 0;i < arg_stream_types.size();i++) {
-    PrintIndent(stream, indent);
-    stream << "world.addMemObj(source_" << i;
-    stream << ");\n";
-  }
-
-  stream << "\n\n";
-  PrintIndent(stream, indent);
-  stream << " // set work size\n";
-  PrintIndent(stream, indent);
-  int size = arg_stream_types.size();
-  std::string arr = "[" + std::to_string(size) + "] = {";
-  for (int i = 0; i < size; i++) {
-    if (i != size -1) arr += "1, ";
-    else arr += "1};\n";
-  }
-  stream << "int global_size" + arr;
-  PrintIndent(stream, indent);
-  stream << "int local_size" + arr;
-  PrintIndent(stream, indent);
-  stream << "App.set_global(global_size);\n";
-  PrintIndent(stream, indent);
-  stream << "App.set_local(local_size);\n";
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// add them to the world\n";
-  PrintIndent(stream, indent);
-  stream << "world.addKernel(App);\n";
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// set kernel arguments\n";
-  for (size_t i = 0; i < arg_stream_types.size(); i++) {
-    PrintIndent(stream, indent);
-    stream << "world.setMemKernelArg(0, "<< i << ", " << i;
-    stream << ");\n";
-  }
-
-  stream << "\n";
-  PrintIndent(stream, indent);
-  stream << "// run\n";
-  PrintIndent(stream, indent);
-  stream << "world.runKernels();\n\n";
-  PrintIndent(stream, indent);
-  stream << "// read the data back\n";
-  for (size_t i = args.size() - 1; i < arg_stream_types.size(); i++) {
-    PrintIndent(stream, indent);
-    stream << "world.readMemObj(" << i << ");\n";
-  }
-}
-
-// generate host code according to platform type
-void GenHostCode(TVMArgs& args,
-                 const std::vector<int>& shmids,
-                 const std::vector<TVMType>& arg_types,
-                 LoweredFunc lowered_func,
-                 std::string platform,
-                 std::string host_code,
-                 argInfo& arg_info) {
-  int indent = 0;
-  std::ofstream stream;
-  stream.open("__tmp__/host.cpp");
-  GenHostHeaders(stream, platform);
-
-  stream << "int main(int argc, char ** argv) {\n";
-  indent += 2;
-
-  int cnt = 0; // label the constant value
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      // read from the shared memory
-      PrintIndent(stream, indent);
-      stream << Type2Byte(arg_types[i]) << "* "; 
-      stream << "arg_" << i << " = ";
-      stream << "(" << Type2Byte(arg_types[i]) << "*)";
-      stream << "shmat(" << shmids[i] << ", nullptr, 0);\n";
-      PrintIndent(stream, indent);
-
-      stream << Type2Byte(arg_types[i]) << " ";
-      stream << std::get<0>(arg_info[i]);
-      TVMArray* arr = args[i];
-
-      stream << "[";
-      for (int j = 0; j < arr->ndim; j++) {
-        if (j == arr->ndim - 1) {
-          stream << arr->shape[j];
-        } else {
-          stream << arr->shape[j];
-          stream << " * ";
-        }
-      }
-      stream << "];\n";
-      PrintCopy(arr, arg_info, stream, indent, i);
-
-    } else {
-      // directly assign the value to the variable
-      PrintIndent(stream, indent);
-      stream << Type2Byte(arg_types[i]) << " ";
-      stream << "arg_" << i << " = ";
-      stream << "(" << Type2Byte(arg_types[i]) << ")";
-      if (args[i].type_code() == kDLInt || 
-          args[i].type_code() == kDLUInt) {
-        stream << int64_t(args[i]);
-      }
-      stream << ";\n";
-      PrintIndent(stream, indent);
-      stream << Type2Byte(arg_types[i]) << " ";
-      stream << "arg_top_" << i;
-      stream << "[1] = { ";
-
-      stream << "arg_" << i << " }";
-      if (arg_types[i].fracs > 0)
-        stream << " >> " << static_cast<int>(arg_types[i].fracs);
-      stream << ";\n";
-      cnt += 1;
-    }
-    stream << "\n";
-  }
-
-  // allocate mem for stream vars
-  for (size_t k = args.size(); k < arg_info.size(); k++) {
-    auto type = std::get<2>(arg_info[k]);
-    auto shape = std::get<3>(arg_info[k]);
-    PrintIndent(stream, indent);
-    stream << Type2Byte(Type2TVMType(type)) << " " << "name[";
-    if (shape.size() > 0) {
-      for (size_t i = 0; i < shape.size(); i++) {
-        if (i != shape.size() - 1)
-          stream << shape[i] << " * ";
-        else stream << shape[i];
-      }
-    } else {
-      stream << "1";
-    }
-    stream << "];\n";
-  }
-
-  // generate host side (before kernel)
-  PrintIndent(stream, indent);
-  stream << "printf(\"Finished setting up shared memory\\n\");\n";
-  PrintIndent(stream, indent);
-  stream << "// compute bofore kernel function\n";
-  size_t pos = host_code.find("top(");
-  std::string pre_kernel  = host_code.substr(0, pos -1);
-  std::string post_kernel = host_code.substr(host_code.find('\n', pos) + 1);
-  pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of("\n"));
-  pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of(" "));
-  PrintIndent(stream, indent);
-  
-  if (platform == "sdaccel") {
-    // create variable wrapper
-    stream << pre_kernel << "\n";
-    KernelInit(stream, platform, args,
-               arg_types, arg_info);
-  } else if (platform == "vivado_hls") {
-    // init hls stream channels 
-    for (size_t k = 0; k < arg_info.size(); k++) {
-      auto info = arg_info[k]; 
-      if (std::get<1>(info)) {
-        PrintIndent(stream, indent);
-        stream << "hls::stream<" 
-               << PrintHalideType(std::get<2>(info)) 
-               << "> " << "fd_" << std::get<0>(info) << ";\n";
-      }  
-    }
-    PrintIndent(stream, indent);
-    stream << pre_kernel << "\n";
-    PrintIndent(stream, indent);
-    // create kernel call from host 
-    stream << "top(";
-    for (size_t i = 0; i < arg_info.size(); i++) {
-      auto info = arg_info[i];
-      auto name = std::get<0>(info);
-      if (i != 0) stream << ", ";
-      stream << "fd_" << name;
-    }
-    stream << ");\n";
-  }
-
-  // generate host (post-kernel)
-  PrintIndent(stream, indent);
-  stream << "// compute after kernel function\n";
-  stream << post_kernel;
-
-  // copy to shared mem
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      PrintCopyBack(arr, arg_info, stream, indent, i);
-      PrintIndent(stream, indent);
-      stream << "shmdt(";
-      stream << "arg_" << i << ");\n";
-    }
-  }
-
-  stream << "\n\n";
-  PrintIndent(stream, indent);
-  stream << "}\n";
-  stream.close();
-
-}
-}  // namespace runtime
-}  // namespace TVM
diff --git a/tvm/src/codegen/build_util.h b/tvm/src/codegen/build_util.h
deleted file mode 100644
index ca95364c1..000000000
--- a/tvm/src/codegen/build_util.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*!
- *  Copyright (c) 2019 by Contributors
- *  Common build utilities
- * \file build_util.h
- */
-#ifndef TVM_CODEGEN_BUILD_HELPER_H_
-#define TVM_CODEGEN_BUILD_HELPER_H_
-
-#include <tvm/codegen.h>
-#include <unordered_map>
-#include <string>
-#include "../runtime/meta_data.h"
-
-namespace TVM {
-namespace runtime {
-
-using argInfo = 
-    std::vector<std::tuple<std::string, bool, Type, std::vector<int>>>;
-
-// get current work directory
-std::string getpath(void);
-void PrintIndent(std::ofstream& stream, int indent);
-inline size_t GetTypeSize(TVMType t);
-inline size_t GetDataSize(TVMArray* arr);
-inline TVMType Type2TVMType(Type t);
-inline std::string PrintHalideType(Type t);
-inline std::string Type2Str(TVMType t);
-inline std::string Type2ExtStr(TVMType t);
-inline std::string Type2WrapStr(TVMType t);
-inline std::string Type2Byte(TVMType t);
-
-void CollectArgInfo(TVMArgs& args, 
-                    LoweredFunc func,
-                    std::vector<size_t>& arg_sizes,
-                    std::vector<TVMType>& arg_types);
-
-void GenSharedMem(TVMArgs& args,
-                  std::vector<int>& shmids,
-                  std::vector<size_t>& arg_sizes);
-
-void FreeSharedMem(TVMArgs& args, 
-                   const std::vector<int>& shmids,
-                   std::vector<size_t>& arg_sizes);
-
-void PrintCopy(TVMArray* arr, 
-               std::ofstream& stream, 
-               int indent, size_t nth_arr);
-
-void PrintCopyBack(TVMArray* arr, 
-                   std::ofstream& stream, 
-                   int indent, size_t nth_arr);
-
-void GenKernelCode(std::string test_file);
-
-void GenWrapperCode(TVMArgs& args,
-                 const std::vector<int>& shmids,
-                 const std::vector<TVMType>& arg_types,
-                 argInfo& arg_info,
-                 LoweredFunc func);
-
-void GenHostCode(TVMArgs& args,
-                 const std::vector<int>& shmids,
-                 const std::vector<TVMType>& arg_types,
-                 LoweredFunc func,
-                 std::string platform,
-                 std::string host_code,
-                 argInfo& arg_info);
-} // namespace runtime
-} // namespace TVM
-#endif  // TVM_CODEGEN_BUILD_HELPER_H_
diff --git a/tvm/src/codegen/codegen_c.cc b/tvm/src/codegen/codegen_c.cc
index 006edf933..7373711f4 100644
--- a/tvm/src/codegen/codegen_c.cc
+++ b/tvm/src/codegen/codegen_c.cc
@@ -2,12 +2,9 @@
  *  Copyright (c) 2017 by Contributors
  * \file codegen_c.cc
  */
-#include <tvm/build_module.h>
-#include <tvm/ir_pass.h>
 #include <iomanip>
 #include <cctype>
 #include "./codegen_c.h"
-#include "./merlinc/codeanalys_merlinc.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace TVM {
@@ -15,123 +12,6 @@ namespace codegen {
 
 using namespace ir;
 
-Type String2Type(std::string& s) {
-  if (s.front() == '\"' && s.back() == '\"') {
-    s.erase(0, 1);
-    s.pop_back();
-  }
-  std::istringstream is(s);
-  halideir_type_code_t code = Type::Int;
-  if (s.substr(0, 3) == "int") {
-    code = Type::Int; s = s.substr(3);
-  } else if (s.substr(0, 4) == "uint") {
-    code = Type::UInt; s = s.substr(4);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s == "handle") {
-    return Handle();
-  } else {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  int bits = 32, lanes = 1;
-  if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  return Type(code, bits, lanes);
-}
-
-// generate row major index
-std::string getIndex(std::vector<int> shape) {
-  std::string str;
-  int mul = 1;
-  for (size_t i = shape.size(); i > 0; i--) {
-    mul = mul * shape[i-1];
-    str += "i" + std::to_string(i-1) +
-           "*" + std::to_string(mul);
-    if (i != 1) str += "+ ";
-  }
-  return str;
-}
-
-// collect type info for vars
-void TypeCollector::Visit_(const Allocate *op) {
-  auto v = op->buffer_var.get();
-  if (top_args_.count(v)) {
-    std::vector<int> shape;
-    for (size_t i = 0; i < op->extents.size(); i++) 
-      shape.push_back(op->extents[i].as<IntImm>()->value);
-    top_args_[v] = std::make_tuple(std::get<0>(top_args_[v]), op->type, shape);
-  }
-  IRVisitor::Visit_(op);
-}
-
-void StreamCollector::Visit_(const Allocate *op) {
-  this->HandleDef(op->buffer_var.get());
-  IRVisitor::Visit_(op);
-}
-    
-void StreamCollector::Visit_(const Load *op) {
-  this->HandleUse(op->buffer_var);
-  IRVisitor::Visit_(op);
-}
-
-// update placeholder status
-void StreamCollector::Visit_(const Store* op) {
-  if (auto val = op->value.as<StreamExpr>()) {
-    this->HandleDef(op->buffer_var.get());
-  }
-  this->HandleUse(op->buffer_var);
-  IRVisitor::Visit_(op);
-}
-
-void StreamCollector::Visit_(const StreamStmt* op) {
-  this->HandleDef(op->buffer_var.get());
-  IRVisitor::Visit_(op);
-}
-
-void StreamCollector::Visit_(const AttrStmt* op) {
-  if (op->attr_key == attr::device_scope) { 
-    if (op->value.as<StringImm>()->value != scope_)
-      switch_on = true;
-    else switch_on = false;
-  }
-  IRVisitor::Visit_(op);
-}
-
-// additional data saved into stream table 
-void StreamCollector::HandleDef(const Variable* v) {
-  if (!switch_on) { // def on host scope 
-    CHECK(!host_def_count_.count(v))
-        << "variable " << v->name_hint
-        << " has already been defined, the Stmt is not SSA";
-    CHECK(!host_use_count_.count(v))
-        << "variable " << v->name_hint
-        << " has been used before definition!";
-    host_use_count_[v] = 0;
-    host_def_count_[v] = 1;
-  }
-}
-
-void StreamCollector::HandleUse(const Expr& v) {
-  CHECK(v.as<Variable>());
-  Var var(v.node_);
-  auto it = host_use_count_.find(var.get());
-  if (!switch_on) { // def on host scope 
-    if (it != host_use_count_.end()) {
-      if (it->second >= 0) {
-        ++it->second;
-      }
-    } else {
-      if (!stream_table_.count(var.get())) {
-        host_undefined_.push_back(var);
-        host_use_count_[var.get()] = -1;
-      }
-    }
-  }
-}
-
 void CodeGenC::Init(bool output_ssa) {
   print_ssa_form_ = output_ssa;
 }
@@ -139,50 +19,44 @@ void CodeGenC::Init(bool output_ssa) {
 void CodeGenC::InitFuncState(LoweredFunc f) {
   alloc_storage_scope_.clear();
   handle_data_type_.clear();
-  var_shape_map_.clear();
-  range_.clear();
   CodeGenSourceBase::ClearFuncState();
 }
-
-void CodeGenC::AddFunction(LoweredFunc f,
-        str2tupleMap<std::string, Type> map_arg_type) {
+void CodeGenC::AddFunction(LoweredFunc f) {
   // clear previous generated state.
   this->InitFuncState(f);
-  map_arg_type_ = map_arg_type;
+  // skip the first underscore, so SSA variable starts from _1
+  GetUniqueName("_");
   // add to alloc buffer type.
   for (const auto & kv : f->handle_data_type) {
     RegisterHandleType(kv.first.get(), kv.second.type());
   }
 
-  // generate function signature 
   this->stream << "void " << f->name << "(";
   for (size_t i = 0; i < f->args.size(); ++i) {
     Var v = f->args[i];
     std::string vid = AllocVarID(v.get());
     if (i != 0) stream << ", ";
-    // check type in the arg map
-    if (map_arg_type.find(vid) == map_arg_type.end()) {
-      LOG(WARNING) << vid << " type not found\n";
-      PrintType(v.type(), this->stream);
-      this->stream << ' ' << vid;
-    } else {
-      auto arg = map_arg_type[vid];
-      PrintType(std::get<1>(arg), this->stream);
-      this->stream << "* " << std::get<0>(arg);
-      const BufferNode* buf = f->api_args[i].as<BufferNode>();
-      if (v.type().is_handle() && buf) {
-        std::vector<int> shape;
-        for (size_t i = 0; i < buf->shape.size(); i++) 
-          shape.push_back(buf->shape[i].as<IntImm>()->value);
-        arg_shapes.push_back(shape);
-        var_shape_map_[buf->data.get()] = buf->shape;
-        auto it = alloc_storage_scope_.find(v.get());
-        if (it != alloc_storage_scope_.end())
-          PrintStorageScope(it->second, stream);
+    if (v.type().is_handle()) {
+      auto it = alloc_storage_scope_.find(v.get());
+      if (it != alloc_storage_scope_.end())
+        PrintStorageScope(it->second, stream);
+      stream << ' ';
+
+      if (handle_data_type_.count(v.get())) {
+        PrintType(handle_data_type_.at(v.get()), stream);
+      } else {
+        stream << "void";
+      }
+      stream << "*";
+
+      if (f->is_restricted && restrict_keyword_.length() != 0) {
+        stream << ' ' << restrict_keyword_;
       }
+    } else {
+      PrintType(v.type(), stream);
     }
+    stream << ' ' << vid;
   }
-
   stream << ") {\n";
   int func_scope = this->BeginScope();
   this->PrintStmt(f->body);
@@ -191,49 +65,8 @@ void CodeGenC::AddFunction(LoweredFunc f,
   this->stream << "}\n\n";
 }
 
-std::string CodeGenC::GetHost() {
-  if (!fpga_scope_)
-    host_stream << stream.str(); 
-  std::string postproc = host_stream.str();
-  postproc.erase(postproc.rfind("}") - 1, 
-                 postproc.length() - 1);
-  postproc.erase(0, postproc.find("{") + 1);
-  return postproc + "\n\n";
-}
-
-std::string CodeGenC::GetDevice() {
-  std::ostringstream device;
-  device << "void top(" << arg_stream.str() << "){\n";
-
-  // process device code
-  PreProcess(device); 
-  // remove the kernel name alloc
-  auto text = device_stream.str();
-  for (auto const& m : stream_arg_pos) {
-    std::string alloc = m.first + ";";
-    size_t nFPos = text.find(alloc);
-    size_t secondNL = text.find('\n', nFPos);
-    size_t firstNL = text.rfind('\n', nFPos);
-    text.erase(firstNL, secondNL - firstNL);
-  }
-  device << text;
-  PostProcess(device);
-
-  if (fpga_scope_) device << stream.str();
-  return decl_stream.str() + module_stream.str() + 
-         device.str() + "}\n\n";
-}
-
 std::string CodeGenC::Finish() {
-  std::ostringstream device;
-  device << "void top(" << arg_stream.str() 
-         << "){\n" << device_stream.str();
-  if (fpga_scope_) device << stream.str();
-  else host_stream << stream.str(); 
-  device << "}\n";
-  return decl_stream.str() + "\n{device}\n" + 
-         module_stream.str() + device.str() + "\n{device}\n" + 
-         "\n{host}\n" + host_stream.str() + "\n{host}\n";
+  return decl_stream.str() + stream.str();
 }
 
 void CodeGenC::PrintExpr(const Expr& n, std::ostream& os) {  // NOLINT(*)
@@ -453,7 +286,7 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { /
 
 void CodeGenC::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   CHECK_EQ(t.lanes(), 1)
-     << "do not yet support vector types";
+      << "do not yet support vector types";
   if (t.is_handle()) {
     os << "void*"; return;
   }
@@ -481,6 +314,7 @@ void CodeGenC::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
   LOG(FATAL) << "Cannot convert type " << t << " to C type";
 }
 
+
 inline void PrintConst(const IntImm* op, std::ostream& os, CodeGenC* p) { // NOLINT(*)
   if (op->type == Int(32)) {
     std::ostringstream temp;
@@ -785,7 +619,7 @@ void CodeGenC::VisitStmt_(const Store* op) {
   Type t = op->value.type();
   if (t.lanes() == 1) {
     std::string value = this->PrintExpr(op->value);
-    std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index);
+    std::string ref  = this->GetBufferRef(t, op->buffer_var.get(), op->index);
     this->PrintIndent();
     stream << ref << " = " << value << ";\n";
   } else {
@@ -880,92 +714,49 @@ void CodeGenC::VisitExpr_(const GetSlice *op, std::ostream& os) { // NOLINT(*)
 }
 
 void CodeGenC::VisitExpr_(const SetBit *op, std::ostream& os) { // NOLINT(*)
-  LOG(FATAL) << "SetBit is not implemented yet in C";
+  LOG(FATAL) << "SetBit is not implemented yet";
 }
 
 void CodeGenC::VisitExpr_(const SetSlice *op, std::ostream& os) { // NOLINT(*)
-  LOG(FATAL) << "SetSlice is not implemented yet in C";
+  LOG(FATAL) << "SetSlice is not implemented yet";
 }
 
 void CodeGenC::VisitExpr_(const Quantize *op, std::ostream& os) { // NOLINT(*)
-  LOG(FATAL) << "Quantize is not yet support in C";
-}
-
-void CodeGenC::VisitExpr_(const StreamExpr *op, std::ostream& os) { // NOLINT(*)
-  auto v = op->buffer_var.get();
-  auto it = var_idmap_.find(v);
-  CHECK(it != var_idmap_.end())
-    << "variable " << v->name_hint << " not decalred";
+ LOG(FATAL) << "Quantize is not yet support";
 }
 
 void CodeGenC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*)
-  os << op->name << "(";
-  for (size_t i = 0; i < op->args.size(); ++i) {
-    PrintExpr(op->args[i], os);
-    if (i != op->args.size() - 1) os << ", ";
-  }
-  os << ")";
-}
-
-void CodeGenC::VisitStmt_(const StreamStmt *op) { // NOLINT(*)
-    CHECK(!var_idmap_.count(op->buffer_var.get())); 
-    std::string vid = AllocVarID(op->buffer_var.get());
-    vid = GetVarID(op->value.as<Load>()->buffer_var.get()); 
-    PrintIndent();
-    auto load_op = op->value.as<Load>(); 
-    auto v = load_op->buffer_var.as<Variable>();
-    // placeholder args using recv name 
-    if (stream_table.count(v)) {
-      auto tuple = arg_top_vars[v];
-      arg_top_vars[v] = std::make_tuple(vid, std::get<1>(tuple),
-                                        std::get<2>(tuple));
-      stream_table[v] = true;
-    } // else: streamed externop defined in analysis
-    // PrintExpr(op->value, stream);
-    // stream << vid << ".write()\n";
+  LOG(FATAL) << "KernelExpr is not yet support";
 }
 
 void CodeGenC::VisitStmt_(const LetStmt* op) {
   std::string value = PrintExpr(op->value);
-  // Skip the argument retrieving assign statement
-  std::string vid = AllocVarID(op->var.get());
   if (print_ssa_form_) {
     CHECK(!var_idmap_.count(op->var.get()));
     var_idmap_[op->var.get()] = value;
   } else {
     PrintIndent();
-    if (op->var.type() != Handle() &&
-        value.find("TVMArray") == std::string::npos &&
-        value.find("arg") != 0) {
-      PrintIndent();
+    if (op->var.type() == Handle() &&
+        handle_data_type_.count(op->var.get())) {
+      PrintType(handle_data_type_.at(op->var.get()), stream);
+      stream << "* "
+             << AllocVarID(op->var.get())
+             << " = (";
+      PrintType(handle_data_type_.at(op->var.get()), stream);
+      stream << "*)"  << value << ";\n";
+    } else {
       PrintType(op->var.type(), this->stream);
       this->stream << ' '
-                   << vid
+                   << AllocVarID(op->var.get())
                    << " = " << value << ";\n";
-    // modify var idmap for passed in args
-    } else if (value.find("data") != std::string::npos ||
-               value.substr(0, 3) == "arg") {
-      auto v = op->var.get();
-      arg_vars.push_back(v);
-      stream_table[v] = false; 
-      std::string api_name = "arg" + std::to_string(arg_count);
-      auto arg = map_arg_type_[api_name];
-      // PrintType(std::get<1>(arg), arg_stream);
-      CHECK(arg_count < arg_shapes.size());
-      auto shape = arg_shapes[arg_count];
-      arg_top_vars[v] = std::make_tuple(vid, std::get<1>(arg), shape);
-      arg_count += 1;
     }
-    PrintStmt(op->body);
   }
+  PrintStmt(op->body);
 }
 
 void CodeGenC::VisitStmt_(const Allocate* op) {
   CHECK(!is_zero(op->condition));
-  std::string vid; 
-  if (!var_idmap_.count(op->buffer_var.get())) 
-    vid = AllocVarID(op->buffer_var.get());
-  else vid = GetVarID(op->buffer_var.get());
+  std::string vid = AllocVarID(op->buffer_var.get());
   if (op->new_expr.defined()) {
     // Prefer global static allocation for the program
     CHECK_EQ(op->free_function, "nop");
@@ -1008,64 +799,6 @@ void CodeGenC::VisitStmt_(const AttrStmt* op) {
     const Variable* v = op->node.as<Variable>();
     CHECK(v);
     volatile_buf_.insert(v);
-  } else if (op->attr_key == ir::attr::device_scope) {
-    // print top( ... in host and enter fpga scope 
-    if (op->value.as<StringImm>()->value == "fpga" && !fpga_scope_) {
-      fpga_scope_ = true;
-      PrintIndent();
-       
-      // track the stream usage
-      StreamCollector collector(stream_table, "cpu");
-      collector.Visit(op->body);
-
-      // update data type and name 
-      for (auto k : collector.host_undefined_) {
-        auto v = k.get();
-        arg_vars.push_back(v);
-        stream_table[v] = true;
-        auto tuple = arg_top_vars[v];
-        arg_top_vars[v] = std::make_tuple(v->name_hint,
-                                          std::get<1>(tuple),
-                                          std::get<2>(tuple)); 
-      }
-      TypeCollector visitor(arg_top_vars);
-      visitor.Visit(op->body);
-  
-      // generte function calls 
-      stream << "top(";
-      int index = 0;
-      for (size_t i = 0; i < arg_vars.size(); i++) {
-        auto v = arg_vars[i];
-        std::string arg_name;
-        if (stream_table[v]) 
-          arg_name = std::get<0>(arg_top_vars[v]);
-        else arg_name = GetVarID(v); 
-        if (index !=0) stream << ", ";
-        stream << arg_name;
-        // print kernel func signature
-        if (index != 0) arg_stream << ", ";
-        PrintType(std::get<1>(arg_top_vars[v]), arg_stream);
-        auto shape = std::get<2>(arg_top_vars[v]);
-        arg_stream << " " << arg_name;
-        for (size_t k = 0; k < shape.size(); k++)
-          arg_stream << "[" << shape[k] << "]";
-        index++;
-      }
-      stream << ");\n";
-  
-      // switch context to device scope
-      host_stream << this->stream.str();
-      this->stream.str("");
-      this->stream.clear();
-  
-    // swtich from device to host
-    } else if (op->value.as<StringImm>()->value == "cpu" && 
-               fpga_scope_) {
-      fpga_scope_ = false;
-      device_stream << this->stream.str();
-      this->stream.str("");
-      this->stream.clear();
-    }
   }
   this->PrintStmt(op->body);
 }
@@ -1156,75 +889,17 @@ void CodeGenC::VisitStmt_(const ProducerConsumer *op) {
   PrintStmt(op->body);
 }
 
-void CodeGenC::VisitStmt_(const KernelDef* op) {
-  LoweredFunc f;
-  // save func states
-  SaveFuncState(f);
-  InitFuncState(f);
-  std::ostringstream save;
-  save << this->stream.str();
-  this->stream.str("");
-  this->stream.clear();
-
-  // skip the first underscore
-  GetUniqueName("_");
-  // add to alloc buffer : type.
-  for (const auto & k : op->args) {
-    RegisterHandleType(k.get(), k.get()->type);
-  }
-  // print function signature
-  PrintType(op->ret_type, stream);
-  stream << " " << op->name << "(";
-  for (size_t k = 0; k < op->channels.size(); k+=2) {
-    int pos = op->channels[k].as<IntImm>()->value;  
-    stream_arg_pos[op->name].insert(pos);
-  }
-  for (size_t i = 0; i < op->args.size(); ++i) {
-    VarExpr v = op->args[i];
-    var_shape_map_[v.get()] = op->api_args[i];
-    std::string vid = AllocVarID(v.get());
-    if (i != 0) stream << ", ";
-    std::string str = PrintExpr(op->api_types[i]);
-    Type type = String2Type(str);
-    PrintType(type, stream);
-    this->stream << " " << vid << "[";
-    if (v.type().is_handle()) {
-      for (size_t j = 0; j < op->api_args[i].size(); j++) {
-        if (j != 0) stream << "* ";
-        auto dim = op->api_args[i][j].as<IntImm>()->value;
-        this->stream << dim;
-      }
-      this->stream << ']';
-    }
-  }  
-  stream << ") {\n";
-  int func_scope = BeginScope();
-  range_ = CollectIterRange(op->body);
-  PrintStmt(op->body);
-  EndScope(func_scope);
-  stream << "}\n\n";
-
-  // restore default stream
-  module_stream << this->stream.str();
-  this->stream.str(""); 
-  this->stream.clear();
-  this->stream << save.str();
-  RestoreFuncState(f);
+void CodeGenC::VisitStmt_(const KernelDef *op) {
+  LOG(FATAL) << "KernelDef is not yet support";
 }
 
 void CodeGenC::VisitStmt_(const KernelStmt *op) {
-  PrintIndent();
-  stream << op->name << "(";
-  for (size_t i = 0; i < op->args.size(); i++) {
-    PrintExpr(op->args[i], stream);
-    if (i < op->args.size() -1) stream << ", ";
-  }
-  stream << ");\n";
+  LOG(FATAL) << "KernelStmt is not yet support";
 }
 
 void CodeGenC::VisitStmt_(const Return *op) {
   this->stream << "return ";
-  PrintExpr(op->value, stream);
+  PrintExpr(op->value, stream);  // 1-arg overload returns a string, prints nothing
   this->stream << ";\n";
 }
 
@@ -1247,28 +922,5 @@ void CodeGenC::VisitStmt_(const While *op) {
 void CodeGenC::VisitStmt_(const Partition* op) {
 }
 
-void CodeGenC::SaveFuncState(LoweredFunc f) {
-  // clear save info copy
-  alloc_storage_scope_save.clear();
-  handle_data_type_save.clear();
-  var_shape_map_save.clear();
-  range_save.clear();
-  // backup func info and clear
-  alloc_storage_scope_save = alloc_storage_scope_;
-  handle_data_type_save = handle_data_type_;
-  var_shape_map_save = var_shape_map_;
-  range_save = range_;
-  CodeGenSourceBase::SaveFuncState();
-}
-
-void CodeGenC::RestoreFuncState(LoweredFunc f) {
-  this->InitFuncState(f);
-  alloc_storage_scope_ = alloc_storage_scope_save;
-  handle_data_type_ = handle_data_type_save;
-  var_shape_map_ = var_shape_map_save;
-  range_ = range_save;
-  CodeGenSourceBase::RestoreFuncState();
-}
-
 }  // namespace codegen
 }  // namespace TVM
diff --git a/tvm/src/codegen/codegen_c.h b/tvm/src/codegen/codegen_c.h
index d7292b38f..f579ca579 100644
--- a/tvm/src/codegen/codegen_c.h
+++ b/tvm/src/codegen/codegen_c.h
@@ -8,7 +8,6 @@
 
 #include <tvm/ir.h>
 #include <tvm/ir_functor_ext.h>
-#include <tvm/ir_visitor.h>
 #include <tvm/codegen.h>
 #include <tvm/lowered_func.h>
 #include <string>
@@ -16,64 +15,11 @@
 #include <unordered_map>
 #include <unordered_set>
 #include "./codegen_source_base.h"
-#include "./merlinc/codeanalys_merlinc.h"
-#include "../runtime/thread_storage_scope.h"
 
 namespace TVM {
 namespace codegen {
 
 using namespace ir;
-template<class T, class V>
-using str2tupleMap = std::unordered_map<std::string, std::tuple<T, V>>;
-using var2nameType = std::unordered_map<const Variable*, 
-    std::tuple<std::string, Type, std::vector<int>>>; 
-
-Type String2Type(std::string& s);
-std::string getIndex(std::vector<int> shape);
-
-/*!
- * \brief A data type collector  
- *
- *  CodeGenC TypeCollector gathers information  
- *  of different types of each variable
- *
- */
-class TypeCollector final : public IRVisitor {
- public:
-  var2nameType& top_args_;
-  TypeCollector(var2nameType& top_args) : top_args_(top_args) {};
-  void Visit_(const Allocate *op);
-};
-
-/*!
- * \brief An undefined variable collector
- *
- * CodeGenC stream data collector detects undefined 
- * variable and create channels for them
- *
- * */
-class StreamCollector final : public IRVisitor {
- public:
-  Array<Var> host_undefined_;
-  std::unordered_map<const Variable*, int> host_use_count_;
-  std::unordered_map<const Variable*, int> host_def_count_;
-  StreamCollector(std::unordered_map<const Variable*, bool>& stream_table,
-                  std::string initial_scope)
-    : stream_table_(stream_table),
-      scope_(initial_scope) {};
-  void Visit_(const Allocate *op);
-  void Visit_(const Load *op);
-  void Visit_(const Store *op);
-  void Visit_(const StreamStmt *op);
-  void Visit_(const AttrStmt *op);
-  void HandleDef(const Variable* v);
-  void HandleUse(const Expr& v);
- private: 
-  std::unordered_map<const Variable*, bool>& stream_table_;
-  std::string scope_;
-  bool switch_on{true};
-};
-
 /*!
  * \brief A base class to generate C code.
  *
@@ -98,22 +44,12 @@ class CodeGenC :
    * \brief Add the function to the generated module.
    * \param f The function to be compiled.
    */
-  void AddFunction(LoweredFunc f, str2tupleMap<std::string, Type> map_arg_type);
+  void AddFunction(LoweredFunc f);
   /*!
    * \brief Finalize the compilation and return the code.
    * \return The code.
    */
   std::string Finish();
-  /*!
-   * \brief Finalize the compilation and return the code.
-   * \return The host code.
-   */
-  std::string GetHost();
-  /*!
-   * \brief Finalize the compilation and return the code.
-   * \return The device code.
-   */
-  std::string GetDevice();
   /*!
    * \brief Print the Stmt n to CodeGenC->stream
    * \param n The statement to be printed.
@@ -177,7 +113,6 @@ class CodeGenC :
   void VisitExpr_(const SetSlice* op, std::ostream& os) override;  // NOLINT(*)
   void VisitExpr_(const Quantize* op, std::ostream& os) override;  // NOLINT(*)
   void VisitExpr_(const KernelExpr* op, std::ostream& os) override;  // NOLINT(*)
-  void VisitExpr_(const StreamExpr* op, std::ostream& os) override;  // NOLINT(*)
   // statment
   void VisitStmt_(const LetStmt* op) override;
   void VisitStmt_(const Store* op) override;
@@ -191,7 +126,6 @@ class CodeGenC :
   void VisitStmt_(const ProducerConsumer* op) override;
   void VisitStmt_(const KernelDef* op) override;
   void VisitStmt_(const KernelStmt* op) override;
-  void VisitStmt_(const StreamStmt* op) override;
   void VisitStmt_(const Return* op) override;
   void VisitStmt_(const Break* op) override;
   void VisitStmt_(const While* op) override;
@@ -225,38 +159,10 @@ class CodeGenC :
   // print store of single element.
   virtual void PrintVecElemStore(
       const std::string& vec, Type t, int i, const std::string& value);
-  // get a cast type from to
+  // Get the string that casts `value` from type `from` to type `target`
   virtual std::string CastFromTo(std::string value, Type from, Type target);
 
-  // map from var to shape, range and type
-  std::map<const Variable*, Array<Expr> > var_shape_map_;
-  std::unordered_map<const Variable*, Expr> range_;
-  str2tupleMap<std::string, Type> map_arg_type_;
-
-  // save for kernel 
-  std::map<const Variable*, Array<Expr> > var_shape_map_save;
-  std::unordered_map<const Variable*, Expr> range_save;
-
-  // index into ap_arg_type
-  size_t arg_count{0};
-  // map {var : (vid, Type, shape)}
-  var2nameType arg_top_vars;
-  // vector {vars} in top function 
-  std::vector<const Variable*> arg_vars;
-  // vector of top function arg dimension 
-  std::vector<std::vector<int>> arg_shapes;
-  // whether the function arg is streamed
-  std::unordered_map<const Variable*, bool> stream_table;
-  // map from kernel name to set of streamed arg position index
-  std::unordered_map<std::string, std::unordered_set<int>> stream_arg_pos;
-  // pre and post processing device code
-  virtual void PreProcess(std::ostringstream& os) {};
-  virtual void PostProcess(std::ostringstream& os) {};
-
  protected:
-  void SaveFuncState(LoweredFunc f);
-  void RestoreFuncState(LoweredFunc f);
-
   // Print reference to struct location
   std::string GetStructRef(
       Type t, const Expr& buffer, const Expr& index, int kind);
@@ -280,22 +186,12 @@ class CodeGenC :
       const std::string& target, const std::string& src, Type t) final;
   /*! \brief restrict keyword */
   std::string restrict_keyword_{""};
-  /*! \brief the func arg decl stream */
-  std::ostringstream arg_stream;
   /*! \brief the storage scope of allocation */
   std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const Variable*, Type> handle_data_type_;
   std::unordered_map<const Variable*, int> buf_length_map_;
 
-  // save for kernel gen
-  std::unordered_map<const Variable*, std::string> alloc_storage_scope_save;
-  std::unordered_map<const Variable*, Type> handle_data_type_save;
-  std::unordered_map<const Variable*, std::string> var_idmap_save;
-  std::unordered_map<std::string, int> name_alloc_map_save;
-  std::unordered_map<std::string, SSAEntry> ssa_assign_map_save;
-  std::vector<bool> scope_mark_save;
-
  private:
   /*! \brief whether to print in SSA form */
   bool print_ssa_form_{false};
diff --git a/tvm/src/codegen/codegen_cuda.cc b/tvm/src/codegen/codegen_cuda.cc
index 3c675ad06..badbf2849 100644
--- a/tvm/src/codegen/codegen_cuda.cc
+++ b/tvm/src/codegen/codegen_cuda.cc
@@ -25,10 +25,9 @@ void CodeGenCUDA::Init(bool output_ssa) {
   CHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state);
 }
 
-void CodeGenCUDA::AddFunction(LoweredFunc f,
-         str2tupleMap<std::string, Type> map_arg_type) {
+void CodeGenCUDA::AddFunction(LoweredFunc f) {
   this->stream << "extern \"C\" __global__ ";
-  CodeGenC::AddFunction(f, map_arg_type);
+  CodeGenC::AddFunction(f);
 }
 
 void CodeGenCUDA::VisitStmt_(const ir::For* op) {
diff --git a/tvm/src/codegen/codegen_cuda.h b/tvm/src/codegen/codegen_cuda.h
index e0c4f1a41..e49a47ae3 100644
--- a/tvm/src/codegen/codegen_cuda.h
+++ b/tvm/src/codegen/codegen_cuda.h
@@ -10,7 +10,6 @@
 #include <tvm/packed_func_ext.h>
 #include <string>
 #include "./codegen_c.h"
-#include "./merlinc/codeanalys_merlinc.h"
 
 namespace TVM {
 namespace codegen {
@@ -19,8 +18,7 @@ class CodeGenCUDA final : public CodeGenC {
  public:
   CodeGenCUDA();
   void Init(bool output_ssa);
-  void AddFunction(LoweredFunc f, 
-      str2tupleMap<std::string, Type> map_arg_type);
+  void AddFunction(LoweredFunc f);
   // override behavior
   void VisitStmt_(const ir::For* op) final;
   void PrintStorageSync(const Call* op) final;
diff --git a/tvm/src/codegen/opencl/codegen_opencl.cc b/tvm/src/codegen/codegen_opencl.cc
old mode 100755
new mode 100644
similarity index 53%
rename from tvm/src/codegen/opencl/codegen_opencl.cc
rename to tvm/src/codegen/codegen_opencl.cc
index 979a19e0f..d0297a1d9
--- a/tvm/src/codegen/opencl/codegen_opencl.cc
+++ b/tvm/src/codegen/codegen_opencl.cc
@@ -1,239 +1,206 @@
-# include <tvm/runtime/config.h>
-# include <tvm/packed_func_ext.h>
-# include <vector>
-# include <string>
-# include <cmath>
-# include <regex>
-# include "./codegen_opencl.h"
-# include "../../runtime/thread_storage_scope.h"
-
-namespace TVM{
-namespace codegen{
-  
-CodeGenOpenCL::CodeGenOpenCL(){
-  restrict_keyword_ = "restrict";
-}
-
-std::string CodeGenOpenCL::Finish() {
-  // inject extension enable pragma for fp16 and fp64
-  if (enable_fp16_) {
-    decl_stream
-        << "#ifdef cl_khr_fp16\n"
-           "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
-           "#elif defined(cl_amd_fp16)\n"
-           "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n"
-           "#else\n"
-           "#error \"Half precision floating point not supported"
-                    "by OpenCL implementation on your device.\" \n"
-           "#endif\n\n";
-  }
-
-  if (enable_fp64_) {
-    decl_stream
-        << "#ifdef cl_khr_fp64\n"
-           "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-           "#elif defined(cl_amd_fp64)\n"
-           "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
-           "#else\n"
-           "#error \"Double precision floating point not supported"
-                    "by OpenCL implementation on your device.\" \n"
-           "#endif\n\n";
-  }
-
-  return CodeGenC::Finish();
-}
-
-void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) {
-  CHECK(!var_idmap_.count(iv->var.get()));
-  runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag);
-  std::ostringstream os;
-  if (ts.rank == 1) {
-    os << "get_local_id(" << ts.dim_index << ")";
-  } else {
-    os << "get_group_id(" << ts.dim_index << ")";
-  }
-  var_idmap_[iv->var.get()] =
-      CastFromTo(os.str(), UInt(64), iv->var.type());
-}
-
-
-void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t,
-                                 Expr base, std::ostream& os) {  // NOLINT(*)
-  if (!HandleTypeMatch(buffer, t.element_of())) {
-    os << '(';
-    auto it = alloc_storage_scope_.find(buffer);
-    if (it != alloc_storage_scope_.end()) {
-      PrintStorageScope(it->second, os);
-    }
-    os << ' ';
-    PrintType(t.element_of(), os);
-    os << "*)";
-  }
-  os << GetVarID(buffer) << " + ";
-  PrintExpr(base, os);
-}
-std::string CodeGenOpenCL::GetVecLoad(
-    Type t, const Variable* buffer, Expr base) {
-  std::ostringstream os;
-  os << "vload" << t.lanes() << "(0, ";
-  PrintVecAddr(buffer, t, base, os);
-  os << ")";
-  return os.str();
-}
-
-void CodeGenOpenCL::PrintVecStore(const Variable* buffer,
-                                  Type t, Expr base,
-                                  const std::string& value) {
-  this->PrintIndent();
-  stream << "vstore" << t.lanes() << "(" << value << ", 0, ";
-  PrintVecAddr(buffer, t, base, stream);
-  stream << ");\n";
-}
-
-void CodeGenOpenCL::PrintStorageSync(const Call* op) {
-  const std::string& sync = op->args[0].as<StringImm>()->value;
-  if (sync == "warp") {
-    LOG(FATAL) << "warp sync not supported in opencl";
-  } else if (sync == "shared") {
-    this->PrintIndent();
-    this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n";
-  } else if (sync == "global") {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CodeGenOpenCL::PrintStorageScope(
-    const std::string& scope, std::ostream& os) { // NOLINT(*)
-  if (scope == "global") {
-    // os << "global ";
-  } else if (scope == "shared") {
-    // os << "local ";
-  }
-}
-
-std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) {
-  if (from == target) return value;
-  std::ostringstream os;
-  if (target.lanes() == 1) {
-    os << "((";
-    this->PrintType(target, os);
-    os << ")" << value << ")";
-  } else {  // convert vector type
-    os << "(";
-    os << "convert_";
-    this->PrintType(target, os);
-    os << "(" << value << "))";
-  }
-  return os.str();
-}
-
-void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
-  std::string v = PrintExpr(op->value);
-  os << "((";
-  PrintType(op->type, os);
-  os << ")(";
-  for (int i = 0; i < op->lanes; ++i) {
-    if (i != 0) os << ", ";
-    os << v;
-  }
-  os << "))";
-}
-
-void CodeGenOpenCL::VisitExpr_(const Call * op, std::ostream& os) { // NOLINT(*)
-    if (op->is_intrinsic(intrinsic::tvm_if_then_else)) {
-        os << "(";
-        PrintType(op->args[2].type(), os);
-        os << ")";
-    }
-    CodeGenC::VisitExpr_(op, os);
-}
-
-void CodeGenOpenCL::VisitStmt_(const LetStmt* op) {
-  std::string value = PrintExpr(op->value);
-  // Skip the argument retrieving assign statement
-  std::string vid = AllocVarID(op->var.get());
-  if (op->var.type() != Handle() &&
-      value.find("TVMArray") == std::string::npos &&
-      value.find("arg") != 0) {
-    PrintIndent();
-    PrintType(op->var.type(), this->stream);
-    this->stream << ' '
-                 << vid
-                 << " = " << value << ";\n";
-  }
-  PrintStmt(op->body);
-}
-
-
-void CodeGenOpenCL::VisitExpr_(const FloatImm * op, std::ostream& os) { // NOLINT(*)
-    if (std::isinf(op->value)) {
-        if ( op->value < 0) {
-            os << "-";
-        }
-        os << "INFINITY";
-    } else if (std::isnan(op->value)) {
-        os << "NAN";
-    } else {
-        CodeGenC::VisitExpr_(op, os);
-    }
-}
-
-void CodeGenOpenCL::VisitExpr_(const Select * op, std::ostream& os ) { // NOINT(*)
-    os << "(";
-    PrintType(op->true_value.type(), os);
-    os << ")";
-    CodeGenC::VisitExpr_(op, os);
-} 
-
-void CodeGenOpenCL::VisitStmt_(const IfThenElse* op) {
-  std::string cond = PrintExpr(op->condition);
-  // Skip the buffer data checking
-  if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)")))
-      return ;
-  PrintIndent();
-  if (cond[0] == '(' && cond[cond.length() - 1] == ')') {
-    stream << "if " << cond << " {\n";
-  } else {
-    stream << "if (" << cond << ") {\n";
-  }
-  int then_scope = BeginScope();
-  PrintStmt(op->then_case);
-  this->EndScope(then_scope);
-  if (op->else_case.defined()) {
-    PrintIndent();
-    stream << "} else {\n";
-    int else_scope = BeginScope();
-    PrintStmt(op->else_case);
-    this->EndScope(else_scope);
-  }
-  PrintIndent();
-  stream << "}\n";
-}
-
-void CodeGenOpenCL::GenForStmt(const For* op, std::string pragma, bool before) {
-  std::string extent = PrintExpr(op->extent);
-  std::string vid = AllocVarID(op->loop_var.get());
-  CHECK(is_zero(op->min));
-  if (before && pragma.length() > 0) {
-    PrintIndent();
-    stream << pragma;
-  }
-  PrintIndent();
-  stream << "for (";
-  PrintType(op->loop_var.type(), stream);
-  stream << ' ' << vid << " = 0; "
-            << vid << " < " << extent
-            << "; ++" << vid << ") {\n";
-  if (!before && pragma.length() > 0) {
-    PrintIndent();
-    stream << pragma;
-  }
-  int for_scope = BeginScope();
-  PrintStmt(op->body);
-  this->EndScope(for_scope);
-  PrintIndent();
-  stream << "}\n";
-}
-
-} // namespace codegen
-} // namespace TVM
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_opencl.cc
+ */
+#include <tvm/runtime/config.h>
+#include <tvm/packed_func_ext.h>
+#include <vector>
+#include <string>
+#include "./codegen_opencl.h"
+#include "../runtime/thread_storage_scope.h"
+
+namespace TVM {
+namespace codegen {
+
+CodeGenOpenCL::CodeGenOpenCL() {  // Constructor: only sets the restrict keyword text.
+  restrict_keyword_ = "restrict";  // member is not declared in this class; presumably inherited from CodeGenC - confirm
+}
+
+void CodeGenOpenCL::InitFuncState(LoweredFunc f) {  // Base-class init, then tag every handle argument as "global".
+  CodeGenC::InitFuncState(f);
+  for (Var arg : f->args) {
+    if (arg.type().is_handle()) {
+      alloc_storage_scope_[arg.get()] = "global";  // PrintStorageScope spells the "global" scope as __global
+    }
+  }
+}
+
+void CodeGenOpenCL::AddFunction(LoweredFunc f) {  // Emits f through CodeGenC with a "__kernel " prefix.
+  this->stream << "__kernel ";  // written first so it precedes whatever CodeGenC::AddFunction appends to stream
+  CodeGenC::AddFunction(f);
+}
+
+std::string CodeGenOpenCL::Finish() {  // Returns CodeGenC::Finish() after queuing the extension pragmas below.
+  // Inject the extension-enable pragmas for fp16 / fp64 if PrintType has set the matching flag.
+  if (enable_fp16_) {
+    decl_stream
+        << "#ifdef cl_khr_fp16\n"
+           "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
+           "#elif defined(cl_amd_fp16)\n"
+           "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n"
+           "#else\n"
+           "#error \"Half precision floating point not supported "  // trailing space: joined with the next literal
+                    "by OpenCL implementation on your device.\" \n"
+           "#endif\n\n";
+  }
+
+  if (enable_fp64_) {
+    decl_stream
+        << "#ifdef cl_khr_fp64\n"
+           "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+           "#elif defined(cl_amd_fp64)\n"
+           "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+           "#else\n"
+           "#error \"Double precision floating point not supported "  // trailing space: joined with the next literal
+                    "by OpenCL implementation on your device.\" \n"
+           "#endif\n\n";
+  }
+
+  return CodeGenC::Finish();  // base class produces the final source string
+}
+
+void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) {  // Binds a thread IterVar to an OpenCL work-item id call.
+  CHECK(!var_idmap_.count(iv->var.get()));  // the variable must not already have a name bound
+  runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag);
+  std::ostringstream os;
+  if (ts.rank == 1) {
+    os << "get_local_id(" << ts.dim_index << ")";
+  } else {  // every rank other than 1 is emitted as a group id
+    os << "get_group_id(" << ts.dim_index << ")";
+  }
+  var_idmap_[iv->var.get()] =
+      CastFromTo(os.str(), UInt(64), iv->var.type());  // the call text is treated as UInt(64) and cast to the var's type
+}
+
+void CodeGenOpenCL::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
+  int lanes = t.lanes();  // Writes the OpenCL spelling of t to os; ends in LOG(FATAL) when no spelling exists.
+  if (t.is_handle()) {
+    CHECK_EQ(lanes, 1)
+        << "do not yet support vector types";
+    os << "void*"; return;
+  }
+  bool fail = false;  // set when the bit width has no case in the switch below
+  if (t.is_float()) {
+    switch (t.bits()) {
+      case 16:
+        os << "half";
+        enable_fp16_ = true;  // makes Finish() emit the fp16 extension pragma
+        break;
+      case 32: os << "float"; break;
+      case 64:
+        os << "double";
+        enable_fp64_ = true;  // makes Finish() emit the fp64 extension pragma
+        break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;  // scalar: the base name is the whole type
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;  // vector: lane count is appended to the base name
+    }
+  } else if (t.is_uint() || t.is_int()) {
+    if (t.is_uint()) {
+      os << 'u';  // unsigned spellings are the signed name with a 'u' prefix
+    }
+    if (t.bits() == 8 && t.lanes() == 4) {
+      // Four 8-bit lanes are emitted as one "int" (after the optional 'u' prefix above).
+      os << "int"; return;
+    }
+    switch (t.bits()) {
+      case 8: os << "char"; break;
+      case 16: os << "short"; break;
+      case 32: os << "int"; break;
+      case 64: os << "long"; break;
+      case 1: os << "int"; break;  // 1-bit types are spelled as int
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;  // scalar: the base name is the whole type
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;  // vector: lane count is appended to the base name
+    }
+  }
+  LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type";  // other type kinds, unknown widths, or lanes outside 1..16
+}
+
+void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t,
+                                 Expr base, std::ostream& os) {  // NOLINT(*)
+  if (!HandleTypeMatch(buffer, t.element_of())) {  // no match: prefix a "(<scope> <elem type>*)" pointer cast
+    os << '(';
+    auto it = alloc_storage_scope_.find(buffer);
+    if (it != alloc_storage_scope_.end()) {  // scope qualifier is printed only for buffers with a recorded scope
+      PrintStorageScope(it->second, os);
+    }
+    os << ' ';
+    PrintType(t.element_of(), os);
+    os << "*)";
+  }
+  os << GetVarID(buffer) << " + ";  // address operand has the form "<buffer id> + <base>"
+  PrintExpr(base, os);
+}
+std::string CodeGenOpenCL::GetVecLoad(
+    Type t, const Variable* buffer, Expr base) {  // Returns the text "vload<lanes>(0, <address>)".
+  std::ostringstream os;
+  os << "vload" << t.lanes() << "(0, ";  // the first call argument is always the literal 0
+  PrintVecAddr(buffer, t, base, os);  // appends the address operand
+  os << ")";
+  return os.str();
+}
+
+void CodeGenOpenCL::PrintVecStore(const Variable* buffer,
+                                  Type t, Expr base,
+                                  const std::string& value) {  // Writes "vstore<lanes>(<value>, 0, <address>);" to stream.
+  this->PrintIndent();
+  stream << "vstore" << t.lanes() << "(" << value << ", 0, ";  // the second call argument is always the literal 0
+  PrintVecAddr(buffer, t, base, stream);  // appends the address operand
+  stream << ");\n";
+}
+
+void CodeGenOpenCL::PrintStorageSync(const Call* op) {  // Emits a barrier for "shared" sync; "warp" and "global" are fatal.
+  const std::string& sync = op->args[0].as<StringImm>()->value;  // NOTE(review): as<>() result is dereferenced unchecked
+  if (sync == "warp") {
+    LOG(FATAL) << "warp sync not supported in opencl";
+  } else if (sync == "shared") {
+    this->PrintIndent();
+    this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n";
+  } else if (sync == "global") {
+    LOG(FATAL) << "not supported";
+  }  // any other sync string emits nothing
+}
+
+void CodeGenOpenCL::PrintStorageScope(
+    const std::string& scope, std::ostream& os) { // NOLINT(*)
+  if (scope == "global") {  // Maps a storage scope name to its OpenCL address-space qualifier.
+    os << "__global";  // no trailing space is written here
+  } else if (scope == "shared") {
+    os << "__local";  // no trailing space is written here
+  }  // any other scope string prints nothing
+}
+
+std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) {
+  if (from == target) return value;  // identical types: value is returned unchanged
+  std::ostringstream os;
+  if (target.lanes() == 1) {  // scalar target: "((<type>)<value>)"
+    os << "((";
+    this->PrintType(target, os);
+    os << ")" << value << ")";
+  } else {  // vector target: "(convert_<type>(<value>))"
+    os << "(";
+    os << "convert_";
+    this->PrintType(target, os);
+    os << "(" << value << "))";
+  }
+  return os.str();
+}
+
+void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  std::string v = PrintExpr(op->value);  // the broadcast value is printed once and its text reused per lane
+  os << "((";
+  PrintType(op->type, os);
+  os << ")(";
+  for (int i = 0; i < op->lanes; ++i) {  // output form: "((<type>)(v, v, ..., v))" with op->lanes copies
+    if (i != 0) os << ", ";
+    os << v;
+  }
+  os << "))";
+}
+}  // namespace codegen
+}  // namespace TVM
diff --git a/tvm/src/codegen/codegen_opencl.h b/tvm/src/codegen/codegen_opencl.h
new file mode 100644
index 000000000..088ab089a
--- /dev/null
+++ b/tvm/src/codegen/codegen_opencl.h
@@ -0,0 +1,51 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_opencl.h
+ * \brief Generate OpenCL device code.
+ */
+#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_
+#define TVM_CODEGEN_CODEGEN_OPENCL_H_
+
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <string>
+#include "./codegen_c.h"
+
+namespace TVM {
+namespace codegen {
+
+class CodeGenOpenCL final : public CodeGenC {  // CodeGenC subclass that generates OpenCL device code
+ public:
+  CodeGenOpenCL();
+  void AddFunction(LoweredFunc f);  // writes "__kernel " to stream, then calls CodeGenC::AddFunction
+  std::string Finish();  // queues fp16/fp64 extension pragmas on decl_stream, then returns CodeGenC::Finish()
+
+  // Member functions declared final in this class.
+  void InitFuncState(LoweredFunc f) final;
+  void BindThreadIndex(const IterVar& iv) final;  // NOLINT(*)
+  void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*)
+  void PrintStorageSync(const Call* op) final;  // NOLINT(*)
+  void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
+  std::string GetVecLoad(Type t, const Variable* buffer,
+                         Expr base) final;
+  void PrintVecStore(const Variable* buffer,
+                     Type t, Expr base,
+                     const std::string& value) final;  // NOLINT(*)
+  // Prints the address operand used by GetVecLoad and PrintVecStore.
+  void PrintVecAddr(const Variable* buffer, Type t,
+                    Expr base, std::ostream& os);  // NOLINT(*)
+  std::string CastFromTo(std::string value, Type from, Type target); // NOLINT(*)
+
+  // Expression visitor overload for Broadcast nodes.
+  void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*)
+
+ private:
+  // Whether the fp16 / fp64 extension pragmas must be emitted (set by PrintType, read by Finish).
+  bool enable_fp16_{false};
+  bool enable_fp64_{false};
+};
+
+}  // namespace codegen
+}  // namespace TVM
+
+#endif  // TVM_CODEGEN_CODEGEN_OPENCL_H_
diff --git a/tvm/src/codegen/codegen_source_base.cc b/tvm/src/codegen/codegen_source_base.cc
index 9fc6fc706..0df1ad276 100644
--- a/tvm/src/codegen/codegen_source_base.cc
+++ b/tvm/src/codegen/codegen_source_base.cc
@@ -8,79 +8,34 @@ namespace TVM {
 namespace codegen {
 
 void CodeGenSourceBase::ClearFuncState() {
-  host_name_alloc_map_.clear();
-  device_name_alloc_map_.clear();
+  name_alloc_map_.clear();
   ssa_assign_map_.clear();
   var_idmap_.clear();
   scope_mark_.clear();
 }
 
-void CodeGenSourceBase::SaveFuncState() {
-  host_name_alloc_map_save.clear();
-  device_name_alloc_map_save.clear();
-  ssa_assign_map_save.clear();
-  var_idmap_save.clear();
-  scope_mark_save.clear();
-  // save state into private member
-  host_name_alloc_map_save = host_name_alloc_map_;
-  device_name_alloc_map_save = device_name_alloc_map_;
-  ssa_assign_map_save = ssa_assign_map_;
-  var_idmap_save = var_idmap_;
-  scope_mark_save = scope_mark_;
-}
-
-void CodeGenSourceBase::RestoreFuncState() {
-  this->ClearFuncState();
-  host_name_alloc_map_ = host_name_alloc_map_save;
-  device_name_alloc_map_ = device_name_alloc_map_save;
-  ssa_assign_map_ = ssa_assign_map_save;
-  var_idmap_ = var_idmap_save;
-  scope_mark_ = scope_mark_save;
-}
-
 std::string CodeGenSourceBase::GetUniqueName(std::string prefix) {
   for (size_t i = 0; i < prefix.size(); ++i) {
     if (prefix[i] == '.') prefix[i] = '_';
   }
-  if (fpga_scope_) { 
-    auto it = device_name_alloc_map_.find(prefix);
-    if (it != device_name_alloc_map_.end()) {
-      while (true) {
-        std::ostringstream os;
-        os << prefix << (++it->second);
-        std::string name = os.str();
-        if (device_name_alloc_map_.count(name) == 0) {
-          prefix = name;
-          break;
-        }
+  auto it = name_alloc_map_.find(prefix);
+  if (it != name_alloc_map_.end()) {
+    while (true) {
+      std::ostringstream os;
+      os << prefix << (++it->second);
+      std::string name = os.str();
+      if (name_alloc_map_.count(name) == 0) {
+        prefix = name;
+        break;
       }
     }
-    device_name_alloc_map_[prefix] = 0;
-    return prefix;
-  } else {
-    auto it = host_name_alloc_map_.find(prefix);
-    if (it != host_name_alloc_map_.end()) {
-      while (true) {
-        std::ostringstream os;
-        os << prefix << (++it->second);
-        std::string name = os.str();
-        if (host_name_alloc_map_.count(name) == 0) {
-          prefix = name;
-          break;
-        }
-      }
-    }
-    host_name_alloc_map_[prefix] = 0;
-    return prefix;
   }
+  name_alloc_map_[prefix] = 0;
+  return prefix;
 }
 
 std::string CodeGenSourceBase::SSAGetID(std::string src, Type t) {
-  if (fpga_scope_) {
-    if (device_name_alloc_map_.count(src)) return src;
-  } else {
-    if (host_name_alloc_map_.count(src)) return src;
-  }
+  if (name_alloc_map_.count(src)) return src;
   auto it = ssa_assign_map_.find(src);
   if (it != ssa_assign_map_.end()) {
     if (scope_mark_.at(it->second.scope_id)) {
diff --git a/tvm/src/codegen/codegen_source_base.h b/tvm/src/codegen/codegen_source_base.h
index befc3f8ec..e140662c1 100644
--- a/tvm/src/codegen/codegen_source_base.h
+++ b/tvm/src/codegen/codegen_source_base.h
@@ -39,10 +39,6 @@ class CodeGenSourceBase {
   };
   /*! \brief Clear the states that might relates to function generation */
   void ClearFuncState();
-  /*! \brief Save the states that might relates to function generation */
-  void SaveFuncState();
-  /*! \brief Restore the states that might relates to function generation */
-  void RestoreFuncState();
   /*! \brief print the current indented value */
   void PrintIndent();
   /*!
@@ -93,36 +89,18 @@ class CodeGenSourceBase {
   std::ostringstream decl_stream;
   /*! \brief the stream to be printed */
   std::ostringstream stream;
-  /*! \brief the stream for mocule */
-  std::ostringstream module_stream;
-  /*! \brief the stream host */
-  std::ostringstream host_stream;
-  /*! \brief the stream device */
-  std::ostringstream device_stream;
   /*! \brief name of each variable */
   std::unordered_map<const Variable*, std::string> var_idmap_;
-  /*! \brief save states as copy */
-  std::unordered_map<const Variable*, std::string> var_idmap_save;
-  /*! \brief whether generate code for fpga */
-  bool fpga_scope_{false};
-  /*! \brief name allocation map for host */
-  std::unordered_map<std::string, int> host_name_alloc_map_;
-  /*! \brief name allocation map for device */
-  std::unordered_map<std::string, int> device_name_alloc_map_;
 
  private:
   /*! \brief assignment map of ssa */
   std::unordered_map<std::string, SSAEntry> ssa_assign_map_;
+  /*! \brief name allocation map */
+  std::unordered_map<std::string, int> name_alloc_map_;
   /*! \brief array to check whether we are inside certain scope */
   std::vector<bool> scope_mark_;
   /*! \brief The current indentation value */
   int indent_{0};
-  /*! \brief Save states as copy */
-  std::unordered_map<std::string, SSAEntry> ssa_assign_map_save;
-  std::unordered_map<std::string, int> host_name_alloc_map_save;
-  std::unordered_map<std::string, int> device_name_alloc_map_save;
-  std::vector<bool> scope_mark_save;
- 
 };
 
 /*!
diff --git a/tvm/src/codegen/hlsc/build_hlsc.cc b/tvm/src/codegen/hlsc/build_hlsc.cc
index 2494ee66f..42fb68089 100644
--- a/tvm/src/codegen/hlsc/build_hlsc.cc
+++ b/tvm/src/codegen/hlsc/build_hlsc.cc
@@ -24,6 +24,7 @@ runtime::Module BuildVivadoHLSCSim(Array<LoweredFunc> funcs) {
     cg.AddFunction(f, map_arg_type);
   }
   std::string code = cg.Finish();
+
   return runtime::CreateVivadoHLSModule(funcs[0], code);
 }
 
@@ -46,6 +47,7 @@ std::string BuildHLSC(Array<LoweredFunc> funcs) {
     cg.AddFunction(f, map_arg_type);
   }
   std::string code = cg.Finish();
+
   LOG(WARNING) << "HLS C doesn't have runtime, return kernel code";
   return code;
 }
diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.cc b/tvm/src/codegen/hlsc/codegen_hlsc.cc
index d7fc610d7..3e8696fba 100644
--- a/tvm/src/codegen/hlsc/codegen_hlsc.cc
+++ b/tvm/src/codegen/hlsc/codegen_hlsc.cc
@@ -15,50 +15,49 @@ namespace codegen {
 
 void CodeGenHLSC::AddFunction(LoweredFunc f,
         str2tupleMap<std::string, Type> map_arg_type) {
-  CodeGenC::AddFunction(f, map_arg_type);
-  // // Write header files
-  // // TODO: Insert header files here
-  // // Clear previous generated state
-  // this->InitFuncState(f);
-  // // Register alloc buffer type
-  // for (const auto & kv : f->handle_data_type) {
-  //   RegisterHandleType(kv.first.get(), kv.second.type());
-  // }
-  // // Write entry function name
-  // this->stream << "void " << f->name << "(";
-  // // Write arguments
-  // for (size_t i = 0; i < f->args.size(); ++i) {
-  //   Var v = f->args[i];
-  //   std::string vid = AllocVarID(v.get());
-  //   if (i != 0) this->stream << ", ";
-  //   if (map_arg_type.find(vid) == map_arg_type.end()) {
-  //     LOG(WARNING) << vid << " type not found\n";
-  //     PrintType(v.type(), this->stream);
-  //     this->stream << ' ' << vid;
-  //   }
-  //   else {
-  //     auto arg = map_arg_type[vid];
-  //     PrintType(std::get<1>(arg), this->stream);
-  //     this->stream << ' ' << std::get<0>(arg);
-  //     const BufferNode* buf = f->api_args[i].as<BufferNode>();
-  //     if (v.type().is_handle() && buf) {
-  //       var_shape_map_[buf->data.get()] = buf->shape;
-  //       for (size_t i = 0; i < buf->shape.size(); i++) {
-  //         this->stream << '[';
-  //         this->PrintExpr(buf->shape[i], this->stream);
-  //         this->stream << ']';
-  //       }
-  //     }
-  //     // this->stream << "*"; TODO: create an option for this
-  //   }
-  // }
-  // stream << ") {\n";
-  // int func_scope = this->BeginScope();
-  // range_ = CollectIterRange(f->body);
-  // this->PrintStmt(f->body);
-  // this->EndScope(func_scope);
-  // this->PrintIndent();
-  // this->stream << "}\n\n";
+  // Write header files
+  // TODO: Insert header files here
+  // Clear previous generated state
+  this->InitFuncState(f);
+  // Register alloc buffer type
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+  // Write entry function name
+  this->stream << "void " << f->name << "(";
+  // Write arguments
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) this->stream << ", ";
+    if (map_arg_type.find(vid) == map_arg_type.end()) {
+      LOG(WARNING) << vid << " type not found\n";
+      PrintType(v.type(), this->stream);
+      this->stream << ' ' << vid;
+    }
+    else {
+      auto arg = map_arg_type[vid];
+      PrintType(std::get<1>(arg), this->stream);
+      this->stream << ' ' << std::get<0>(arg);
+      const BufferNode* buf = f->api_args[i].as<BufferNode>();
+      if (v.type().is_handle() && buf) {
+        var_shape_map_[buf->data.get()] = buf->shape;
+        for (size_t i = 0; i < buf->shape.size(); i++) {
+          this->stream << '[';
+          this->PrintExpr(buf->shape[i], this->stream);
+          this->stream << ']';
+        }
+      }
+      // this->stream << "*"; TODO: create an option for this
+    }
+  }
+  stream << ") {\n";
+  int func_scope = this->BeginScope();
+  range_ = CollectIterRange(f->body);
+  this->PrintStmt(f->body);
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
 }
 
 std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index) {
@@ -69,16 +68,14 @@ std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index
         buf_length_map_[buffer] == 1);
     if (is_scalar) {
       os << vid;
-    } else { 
-      os << vid << "[";
-      PrintExpr(index, os);
-      os << "]";
-      // std::vector<Expr> indices = ExtractIndices(index, var_shape_map_[buffer], range_);
-      // for (size_t i = 0; i < indices.size(); i++) {
-      //   os << '[';
-      //   PrintExpr(indices[i], os);
-      //   os << ']';
-      // }
+    } else {     
+      os << vid;
+      std::vector<Expr> indices = ExtractIndices(index, var_shape_map_[buffer], range_);
+      for (size_t i = 0; i < indices.size(); i++) {
+        os << '[';
+        PrintExpr(indices[i], os);
+        os << ']';
+      }
     }
   }  
   return os.str();
@@ -91,7 +88,6 @@ void CodeGenHLSC::VisitExpr_(const Min *op, std::ostream& os) {  // NOLINT(*)
   PrintExpr(op->b, os);
   os << ")";
 }
-
 void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) {  // NOLINT(*)
   os << "std::max(";
   PrintExpr(op->a, os);
@@ -101,20 +97,19 @@ void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) {  // NOLINT(*)
 }
 
 void CodeGenHLSC::VisitStmt_(const LetStmt* op) {
-  CodeGenC::VisitStmt_(op);
-  // std::string value = PrintExpr(op->value);
-  // // Skip the argument retrieving assign statement
-  // std::string vid = AllocVarID(op->var.get());
-  // if (op->var.type() != Handle() &&
-  //     value.find("TVMArray") == std::string::npos &&
-  //     value.find("arg") != 0) {
-  //   PrintIndent();
-  //   PrintType(op->var.type(), this->stream);
-  //   this->stream << ' '
-  //                << vid
-  //                << " = " << value << ";\n";
-  // }
-  // PrintStmt(op->body);
+  std::string value = PrintExpr(op->value);
+  // Skip the argument retrieving assign statement
+  std::string vid = AllocVarID(op->var.get());
+  if (op->var.type() != Handle() &&
+      value.find("TVMArray") == std::string::npos &&
+      value.find("arg") != 0) {
+    PrintIndent();
+    PrintType(op->var.type(), this->stream);
+    this->stream << ' '
+                 << vid
+                 << " = " << value << ";\n";
+  }
+  PrintStmt(op->body);
 }
 
 void CodeGenHLSC::GenForStmt(const For* op, std::string pragma, bool before) {
@@ -169,10 +164,7 @@ void CodeGenHLSC::VisitStmt_(const IfThenElse* op) {
 
 void CodeGenHLSC::VisitStmt_(const Allocate* op) {
   CHECK(!is_zero(op->condition));
-  std::string vid; 
-  if (!var_idmap_.count(op->buffer_var.get())) 
-    vid = AllocVarID(op->buffer_var.get());
-  else vid = GetVarID(op->buffer_var.get());
+  std::string vid = AllocVarID(op->buffer_var.get());
   this->PrintIndent();
   int32_t constant_size = op->constant_allocation_size();
   CHECK_GT(constant_size, 0)
@@ -181,22 +173,16 @@ void CodeGenHLSC::VisitStmt_(const Allocate* op) {
   var_shape_map_[buffer] = op->extents;
   std::string scope = alloc_storage_scope_.at(buffer);
   PrintStorageScope(scope, stream);
-
-  if (vid.find("stream_") != std::string::npos) { 
-    void(0); // alloc stream channel in pre-processing
-  } else {
-    PrintType(op->type, stream);
-    stream << ' '<< vid;
-    if (constant_size > 1) {// Transfer length one array to scalar
-      stream << "[";
-      for (size_t i = 0; i < op->extents.size(); i++) {
-        PrintExpr(op->extents[i], stream);
-        if (i != op->extents.size()-1) stream << "*";
-      }
+  PrintType(op->type, stream);
+  stream << ' '<< vid;
+  if (constant_size > 1) {// Transfer length one array to scalar
+    for (size_t i = 0; i < op->extents.size(); i++) {
+      stream << '[';
+      PrintExpr(op->extents[i], stream);
       stream << "]";
     }
-    stream << ";\n";
   }
+  stream << ";\n";
   buf_length_map_[buffer] = constant_size;
   RegisterHandleType(op->buffer_var.get(), op->type);
   for (size_t i = 0; i < op->attrs.size(); i++) {
diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.h b/tvm/src/codegen/hlsc/codegen_hlsc.h
index fdd1747fa..c85cbc699 100644
--- a/tvm/src/codegen/hlsc/codegen_hlsc.h
+++ b/tvm/src/codegen/hlsc/codegen_hlsc.h
@@ -27,7 +27,9 @@ class CodeGenHLSC : public CodeGenC {
   void VisitStmt_(const Allocate* op) override;
 
   void GenForStmt(const For* op, std::string pragma, bool before);
-  
+
+  std::map<const Variable*, Array<Expr> > var_shape_map_;
+  std::unordered_map<const Variable*, Expr> range_;
  protected:
   std::string GetBufferRef(Type t, const Variable* buffer, Expr index);
 };
diff --git a/tvm/src/codegen/hlsc/codegen_vhls.cc b/tvm/src/codegen/hlsc/codegen_vhls.cc
index f944bef83..6a0977e40 100644
--- a/tvm/src/codegen/hlsc/codegen_vhls.cc
+++ b/tvm/src/codegen/hlsc/codegen_vhls.cc
@@ -21,83 +21,12 @@
 namespace TVM {
 namespace codegen {
 
-void CodeGenVivadoHLS::PreProcess(std::ostringstream& os) {
-  os << "\n";
-  int indent = 2;
-  for (size_t i = 0; i < arg_vars.size(); i++) {
-    auto v = arg_vars[i];
-    std::string arg_name;
-    if (stream_table[v]) 
-      arg_name = std::get<0>(arg_top_vars[v]);
-    else arg_name = GetVarID(v); 
-
-    // create local buffer saving result
-    auto shape = std::get<2>(arg_top_vars[v]);
-    auto dtype = std::get<1>(arg_top_vars[v]);
-    if (!stream_table[v]) { // unstreamed args 
-      // allocate local buffer 
-      for (int k = 0; k < indent; k++) os << ' ';
-      PrintType(dtype, os); 
-      os << " " << arg_name << "[";
-      for (size_t n = 0; n < shape.size(); n++) {
-        os << shape[n];
-        if (n != shape.size() - 1) os << "* ";
-      } 
-      os << "];\n";
- 
-      for (size_t j = 0; j < shape.size(); j++) {
-        for (int k = 0; k < indent; k++) os << ' ';
-        os << "for (int i" << j << " = 0; i"
-           << j << "< " << shape[j] << "; i" 
-           << j << "++) {\n";
-        // pass stream reference
-        if (j == shape.size() - 1) {
-          for (int k = 0; k < indent; k++) os << ' ';
-          os << "  " << arg_name << "["
-             << getIndex(shape) << "] = "  
-             << "fd_" << arg_name << ".read();\n";
-        }
-        indent += 2;
-      }
-      for (size_t m = 0; m < shape.size(); m++) {
-        indent -= 2;
-        for (int k = 0; k < indent; k++) os << ' ';
-        os << "}\n";
-      }
-    } else if (i == arg_vars.size() - 1 || true) { 
-      // allocate for return variable 
-      for (int k = 0; k < indent; k++) os << ' ';
-      PrintType(dtype, os); 
-      os << " " << arg_name << "[";
-      for (size_t n = 0; n < shape.size(); n++) {
-        os << shape[n];
-        if (n != shape.size() - 1) os << "* ";
-      } 
-      os << "];\n";
-    }
-  }
-}
-
-void CodeGenVivadoHLS::PostProcess(std::ostringstream& os) {
-//   os << "\n";
-//   int indent = 2;
-//   for (size_t i = 0; i < arg_vars.size(); i++) {
-//     auto v = arg_vars[i];
-//     std::string arg_name;
-//     if (stream_table[v]) 
-//       arg_name = std::get<0>(arg_top_vars[v]);
-//     else arg_name = GetVarID(v); 
-//     os  << arg_name << " = " << "fd_" 
-//         << arg_name << ".write();\n";
-}
-
 void CodeGenVivadoHLS::AddFunction(LoweredFunc f,
         str2tupleMap<std::string, Type> map_arg_type) {
   // Write header files
-  this->decl_stream << "#include <ap_int.h>\n";
-  this->decl_stream << "#include <ap_fixed.h>\n";
-  this->decl_stream << "#include <hls_stream.h>\n";
-  this->decl_stream << "#include <math.h>\n\n";
+  this->stream << "#include <ap_int.h>\n";
+  this->stream << "#include <ap_fixed.h>\n";
+  this->stream << "#include <math.h>\n\n";
   CodeGenHLSC::AddFunction(f, map_arg_type);
   if (soda_header_.is_open())
     soda_header_.close();
@@ -148,13 +77,6 @@ void CodeGenVivadoHLS::VisitStmt_(const Store* op) {
     this->stream << ref
                  << "[" << PrintExpr(sb->index)
                  << "] = " << PrintExpr(sb->value) << ";\n";
-  } else if (const StreamExpr* se = op->value.as<StreamExpr>()) {
-    std::string vid = GetVarID(se->buffer_var.get()); 
-    vid = vid.substr(0, vid.find("_stream_send")); 
-    PrintIndent();
-    this->stream << vid << "["
-                 << op->index << "] = "
-                 << "fd_" << vid << ".read();\n";
   } else {
     CodeGenC::VisitStmt_(op);
   }
@@ -221,30 +143,6 @@ void CodeGenVivadoHLS::VisitStmt_(const Partition* op) {
   stream << "\n";
 }
 
-void CodeGenVivadoHLS::VisitExpr_(const StreamExpr* op, std::ostream& os) {
-  CodeGenC::VisitExpr_(op, os);
-  std::string vid = GetVarID(op->buffer_var.get());
-  vid = vid.substr(0, vid.find("_stream_send")); 
-  os << vid << ".read()";
-}
-
-void CodeGenVivadoHLS::VisitStmt_(const StreamStmt* op) {
-  CodeGenC::VisitStmt_(op);
-  std::string vid = GetVarID(op->buffer_var.get());
-  switch (op->stream_type) {
-    case StreamType::Channel:
-      break;
-    case StreamType::FIFO:
-      break;
-    case StreamType::Pipe:
-      break;
-  }
-  vid = vid.substr(0, vid.find("_stream_send")); 
-  auto load = op->value.as<Load>();
-  stream << "fd_" << vid << ".write(" 
-         << vid << "["<< load->index << "]);\n";
-}
-
 class AllocateCollector final : public IRVisitor {
   public:
     AllocateCollector(std::vector<const Allocate*>& alloc_list,
@@ -262,144 +160,6 @@ class AllocateCollector final : public IRVisitor {
     VarExprUnorderedSet& outputs_;
 };
 
-void CodeGenVivadoHLS::VisitStmt_(const AttrStmt* op) {
-  if (op->attr_key == ir::attr::device_scope) {
-    // print top( ... in host and enter fpga scope 
-    if (op->value.as<StringImm>()->value == "fpga" && !fpga_scope_) {
-      fpga_scope_ = true;
-      PrintIndent();
-       
-      // track the stream usage
-      StreamCollector collector(stream_table, "cpu");
-      collector.Visit(op->body);
-
-      // update data type and name 
-      for (auto k : collector.host_undefined_) {
-        auto v = k.get();
-        arg_vars.push_back(v);
-        stream_table[v] = true;
-        auto tuple = arg_top_vars[v];
-        arg_top_vars[v] = std::make_tuple(v->name_hint,
-                                          std::get<1>(tuple),
-                                          std::get<2>(tuple)); 
-      }
-      TypeCollector visitor(arg_top_vars);
-      visitor.Visit(op->body);
-  
-      // generte function calls 
-      stream << "top(";
-      for (size_t i = 0; i < arg_vars.size(); i++) {
-        auto v = arg_vars[i];
-        std::string arg_name;
-        if (stream_table[v]) 
-          arg_name = std::get<0>(arg_top_vars[v]);
-        else arg_name = GetVarID(v); 
-        if (i != 0) stream << ", ";
-        stream << "fd_" << arg_name;
-
-        // generate kernel func definition
-        if (i != 0) arg_stream << ", ";
-        arg_stream << "hls::stream<";
-        PrintType(std::get<1>(arg_top_vars[v]), arg_stream);
-        auto shape = std::get<2>(arg_top_vars[v]);
-        arg_stream << ">& fd_" << arg_name;
-      }
-      stream << ");\n";
-  
-      // switch context to device scope
-      host_stream << this->stream.str();
-      this->stream.str("");
-      this->stream.clear();
-  
-    // swtich from device to host
-    } else if (op->value.as<StringImm>()->value == "cpu" && 
-               fpga_scope_) {
-      fpga_scope_ = false;
-      device_stream << this->stream.str();
-      this->stream.str("");
-      this->stream.clear();
-    }
-    this->PrintStmt(op->body);
-  } else {
-    CodeGenC::VisitStmt_(op);
-  }
-}
-
-void CodeGenVivadoHLS::VisitStmt_(const KernelStmt *op) {
-  PrintIndent();
-  stream << op->name << "(";
-  for (size_t i = 0; i < op->args.size(); i++) {
-    if (stream_arg_pos[op->name].count(i))
-      stream << "fd_";
-    PrintExpr(op->args[i], stream);
-    if (i < op->args.size() -1) stream << ", ";
-  }
-  stream << ");\n";
-}
-
-void CodeGenVivadoHLS::VisitStmt_(const KernelDef* op) {
-  LoweredFunc f;
-  // save func states
-  CodeGenC::SaveFuncState(f);
-  CodeGenC::InitFuncState(f);
-  std::ostringstream save;
-  save << this->stream.str();
-  this->stream.str("");
-  this->stream.clear();
-
-  // skip the first underscore
-  GetUniqueName("_");
-  // add to alloc buffer : type.
-  for (const auto & k : op->args) {
-    RegisterHandleType(k.get(), k.get()->type);
-  }
-  // print function signature
-  PrintType(op->ret_type, stream);
-  stream << " " << op->name << "(";
-  for (size_t k = 0; k < op->channels.size(); k+=2) {
-    int pos = op->channels[k].as<IntImm>()->value;  
-    stream_arg_pos[op->name].insert(pos);
-  }
-  for (size_t i = 0; i < op->args.size(); ++i) {
-    VarExpr v = op->args[i];
-    var_shape_map_[v.get()] = op->api_args[i];
-    std::string vid = AllocVarID(v.get());
-    if (i != 0) stream << ", ";
-    std::string str = PrintExpr(op->api_types[i]);
-    Type type = String2Type(str);
-
-    // pass the stream channel reference 
-    // TODO: broadcast in hlsc (one wr multi read) 
-    if (stream_arg_pos[op->name].count(i)) {
-      stream << "hls::stream<";
-      PrintType(type, stream);
-      stream << ">& " << vid;
-    } else {
-      PrintType(type, stream);
-      this->stream << " " << vid << "[";
-      int mul = 1;
-      for (size_t j = 0; j < op->api_args[i].size(); j++) {
-        auto dim = op->api_args[i][j].as<IntImm>()->value;
-        mul = mul * dim;
-      }
-      this->stream << mul << "]";
-    }
-  }  
-  stream << ") {\n";
-  int func_scope = BeginScope();
-  range_ = CollectIterRange(op->body);
-  PrintStmt(op->body);
-  EndScope(func_scope);
-  stream << "}\n\n";
-
-  // restore default stream
-  module_stream << this->stream.str();
-  this->stream.str(""); 
-  this->stream.clear();
-  this->stream << save.str();
-  RestoreFuncState(f);
-}
-
 void CodeGenVivadoHLS::VisitStmt_(const Stencil* op) {
   // Use SODA codegen for stencil analysis
   CodeGenSODA cg_soda;
diff --git a/tvm/src/codegen/hlsc/codegen_vhls.h b/tvm/src/codegen/hlsc/codegen_vhls.h
index 6462251db..5486be1dc 100644
--- a/tvm/src/codegen/hlsc/codegen_vhls.h
+++ b/tvm/src/codegen/hlsc/codegen_vhls.h
@@ -23,19 +23,11 @@ class CodeGenVivadoHLS final : public CodeGenHLSC {
   
   void VisitExpr_(const GetBit* op, std::ostream& os) override;
   void VisitExpr_(const GetSlice* op, std::ostream& os) override;
-  void VisitExpr_(const StreamExpr* op, std::ostream& os) override;
 
   void VisitStmt_(const Store* op) override;
   void VisitStmt_(const For* op) override;
   void VisitStmt_(const Partition* op) override;
   void VisitStmt_(const Stencil* op) override;
-  void VisitStmt_(const StreamStmt* op) override;
-  void VisitStmt_(const AttrStmt* op) override;
-  void VisitStmt_(const KernelDef* op) override;
-  void VisitStmt_(const KernelStmt* op) override;
-
-  void PreProcess(std::ostringstream& os);
-  void PostProcess(std::ostringstream& os);
  private:
   std::ofstream soda_header_;
 };
diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc
index d6fa1c6ba..56b4e1d97 100644
--- a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc
+++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc
@@ -652,9 +652,6 @@ void CodeAnalysMerlinC::VisitExpr_(const Broadcast* op, std::ostream& os) {   //
   LOG(FATAL) << "Broadcast: not supported ";
 }
 
-void CodeAnalysMerlinC::VisitExpr_(const StreamExpr* op, std::ostream& os) {   // NOLINT(*)
-}
-
 void CodeAnalysMerlinC::VisitExpr_(const Select* op, std::ostream& os) {  // NOLINT(*)
   os << "(";
   PrintExpr(op->condition, os);
@@ -719,8 +716,10 @@ void CodeAnalysMerlinC::VisitExpr_(const Quantize *op, std::ostream& os) { // NO
 }
 
 void CodeAnalysMerlinC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*)
+  LOG(FATAL) << "KernelExpr is not yet support";
 }
 
+
 void CodeAnalysMerlinC::VisitStmt_(const LetStmt* op) {
   // TODO comaniac
   //std::vector<Var> vec_var = GetNodesByType<Var>(op->value);
@@ -883,9 +882,11 @@ void CodeAnalysMerlinC::VisitStmt_(const ProducerConsumer *op) {
 }
 
 void CodeAnalysMerlinC::VisitStmt_(const KernelDef *op) {
+  LOG(FATAL) << "KernelDef is not yet support";
 }
 
 void CodeAnalysMerlinC::VisitStmt_(const KernelStmt *op) {
+  LOG(FATAL) << "KernelStmt is not yet support";
 }
 
 void CodeAnalysMerlinC::VisitStmt_(const Return *op) {
@@ -916,8 +917,6 @@ void CodeAnalysMerlinC::VisitStmt_(const Reuse *op) {
 
 void CodeAnalysMerlinC::VisitStmt_(const Partition *op) {}
 
-void CodeAnalysMerlinC::VisitStmt_(const StreamStmt *op) {}
-
 void CodeAnalysMerlinC::VisitStmt_(const Stencil *op) {
   PrintStmt(op->body);
 }
diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.h b/tvm/src/codegen/merlinc/codeanalys_merlinc.h
index 421f0d96f..6ba082f09 100644
--- a/tvm/src/codegen/merlinc/codeanalys_merlinc.h
+++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.h
@@ -112,7 +112,6 @@ class CodeAnalysMerlinC :
   void VisitExpr_(const SetSlice* op, std::ostream& os) override;  // NOLINT(*)
   void VisitExpr_(const Quantize* op, std::ostream& os) override;  // NOLINT(*)
   void VisitExpr_(const KernelExpr* op, std::ostream& os) override;  // NOLINT(*)
-  void VisitExpr_(const StreamExpr* op, std::ostream& os) override;  // NOLINT(*)
   // statment
   void VisitStmt_(const LetStmt* op) override;
   void VisitStmt_(const Store* op) override;
@@ -132,7 +131,6 @@ class CodeAnalysMerlinC :
   void VisitStmt_(const Reuse* op) override;
   void VisitStmt_(const Partition* op) override;
   void VisitStmt_(const Stencil* op) override;
-  void VisitStmt_(const StreamStmt* op) override;
   /*!
    * Print Type represetnation of type t.
    * \param t The type representation.
diff --git a/tvm/src/codegen/opencl/build_opencl.cc b/tvm/src/codegen/opencl/build_opencl.cc
deleted file mode 100755
index f5b1352a7..000000000
--- a/tvm/src/codegen/opencl/build_opencl.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "./codegen_aocl.h"
-#include "./codegen_sdaccel.h"
-#include "../build_common.h"
-#include "./sdaccel_module.h"
-#include "../merlinc/codeanalys_merlinc.h"
-
-namespace TVM {
-namespace codegen {
-
-#if HCL_SDACCEL_RUNTIME
-runtime::Module BuildSDAccelSim(Array<LoweredFunc> funcs) {
-  CodeAnalysMerlinC ca;
-  CodeGenSDACCEL cg;
-  for (LoweredFunc f : funcs) {
-    // 1st pass: Analyze AST and collect necessary information
-    ca.AddFunction(f);
-    str2tupleMap<std::string, Type> map_arg_type;
-    map_arg_type = ca.Finish();
-    // 2nd pass: Generate kernel code
-    cg.AddFunction(f, map_arg_type);
-  }
-  std::string code = cg.Finish();
-  return runtime::CreateSDAccelModule(funcs[0], code);
-}
-
-TVM_REGISTER_API("codegen.build_sdaccel_csim")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildSDAccelSim(args[0]);
-  });
-#endif
-
-
-template<class CodeGen>
-std::string BuildOpenCL(Array<LoweredFunc> funcs){
-    using TVM::runtime::Registry;
-    CodeAnalysMerlinC ca;
-    CodeGen cg;
-    for(LoweredFunc f: funcs){
-        ca.AddFunction(f);
-        str2tupleMap<std::string, Type>map_arg_type;
-        map_arg_type = ca.Finish();
-        cg.AddFunction(f, map_arg_type);
-    }
-    std::string code = cg.Finish();
-
-    LOG(WARNING) << "OpenCL doesn't have runtime, return kernel code";
-    return code;
-}
-
-
-TVM_REGISTER_API("codegen.build_sdaccel")
-.set_body([]( TVMArgs args, TVMRetValue * rv ) {
-    * rv = BuildOpenCL<CodeGenSDACCEL>(args[0]);
-    });
-
-TVM_REGISTER_API("codegen.build_aocl")
-.set_body([]( TVMArgs args, TVMRetValue * rv ) {
-    * rv = BuildOpenCL<CodeGenAOCL>(args[0]);
-    });
-} // namespace codegen
-} // namespace TVM
diff --git a/tvm/src/codegen/opencl/codegen_aocl.cc b/tvm/src/codegen/opencl/codegen_aocl.cc
deleted file mode 100644
index 6d3247d02..000000000
--- a/tvm/src/codegen/opencl/codegen_aocl.cc
+++ /dev/null
@@ -1,354 +0,0 @@
-#include <tvm/ir_pass.h>
-#include <tvm/runtime/config.h>
-#include <tvm/packed_func_ext.h>
-#include <vector>
-#include <string>
-#include "./codegen_aocl.h"
-#include "../../runtime/thread_storage_scope.h"
-
-namespace TVM {
-namespace codegen {
-
-inline Type String2Type(std::string& s) {
-  if (s.front() == '\"' && s.back() == '\"') {
-    s.erase(0, 1);
-    s.pop_back();
-  }
-  std::istringstream is(s);
-  halideir_type_code_t code = Type::Int;
-  if (s.substr(0, 3) == "int") {
-    code = Type::Int; s = s.substr(3);
-  } else if (s.substr(0, 4) == "uint") {
-    code = Type::UInt; s = s.substr(4);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s == "handle") {
-    return Handle();
-  } else {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  int bits = 32, lanes = 1;
-  if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  return Type(code, bits, lanes);
-}
-
-void CodeGenAOCL::AddFunction(LoweredFunc f,
-        str2tupleMap<std::string, Type> map_arg_type) {
-  // Clear previous generated state
-  this->InitFuncState(f);
-  for (Var arg: f->args) {
-      if (arg.type().is_handle()) {
-          alloc_storage_scope_[arg.get()] = "global";
-      }
-  }
-
-  // Skip the first underscore, so SSA variable starts from _1
-  GetUniqueName("_");
-
-  // Register alloc buffer type
-  for (const auto & kv : f->handle_data_type) {
-    RegisterHandleType(kv.first.get(), kv.second.type());
-  }
-
-  this->decl_stream << "#include \"ihc_apint.h\"" << "\n";
-  this->decl_stream << "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable\n";
-  this->stream << "__kernel " << "void " << f->name << "(";
-
-  // Write arguments
-  for (size_t i = 0; i < f->args.size(); ++i) {
-    // alloc or get var name
-    Var v = f->args[i];
-    std::string vid;
-    if (!var_idmap_.count(v.get())) 
-      vid = AllocVarID(v.get());
-    else vid = GetVarID(v.get());
-
-    if (i != 0) this->stream << ", ";
-    if (map_arg_type.find(vid) == map_arg_type.end()) {
-      LOG(WARNING) << vid << " type not found\n";
-      PrintType(v.type(), this->stream);
-      this->stream << ' ' << vid;
-    }
-    else {
-      auto arg = map_arg_type[vid];
-      this->stream << "__global ";
-      PrintType(std::get<1>(arg), this->stream);
-      if (v.type().is_handle())
-        this->stream << "*";
-      this->stream << ' ' << "restrict ";
-      this->stream << std::get<0>(arg);
-    }
-  }
-  stream << ") {\n";
-  int func_scope = this->BeginScope();
-  this->PrintStmt(f->body);
-  this->EndScope(func_scope);
-  this->PrintIndent();
-  // this->stream << ' '<< ' ' << "return;\n";
-  this->stream << "}\n\n";
-}
-
-void CodeGenAOCL::PrintType(Type t, std::ostream &os)
-{
-  int lanes = t.lanes();
-  if(t.is_handle()) {
-    os << "void*";return;
-  }
-  if(t == Bool()) {
-    os <<"bool"; return;
-  }
-  CHECK_EQ(lanes, 1)
-      << "do not yet support vector types";
-  
-  bool fail = false;
-  if(t.is_float()) {
-    switch(t.bits())
-    {
-      case 16:
-        os<<"half";
-        // enable_fp16_ = true;
-        break;
-      case 32:
-        os<<"float";
-        break;
-      case 64:
-        os<< "double";
-        // enable_fp64_ = true;
-        break;
-      default:
-        fail = true;
-        break;
-    }
-    if(!fail && lanes ==1) return;
-    if(!fail&&(lanes >= 2 && lanes <=16))
-    {
-      os<<lanes; return;
-    }
-  } else if(t.is_uint() || t.is_int()) {
-    fail = true;
-    if(!fail && lanes == 1) return;
-    if(!fail && (lanes >=2 && lanes <= 16)) {
-      os  <<  lanes; return;
-    }
-    if(fail && lanes==1) {
-      if(t.is_uint()) {
-        if (t.bits() > 64) {
-          os << "uint" << "64" << "_t"; return;
-        } else {
-          os<< "ap_uint<"<< t.bits() <<"> uintd_t"; return;
-        }
-      }
-      if(t.is_int()) {
-        if (t.bits() > 64) {
-          os << "int" << "64" << "_t"; return;
-        } else {
-          os << "ap_int<" << t.bits() << "> intd_t"; return;
-        }
-      }
-    }
-  }
-
-  LOG(FATAL) << "Cannot convert type"<<t<<"to AOCL type";
-}
-
-void CodeGenAOCL::VisitStmt_(const For* op) {
-  std::ostringstream os;
-  if (op->for_type == ForType::Unrolled) {
-    int unroll_factor = 0, i = 0;
-    for (auto key : op->annotate_keys) {
-      if (auto str = key.as<StringImm>()) {
-        auto factor = op->annotate_values[i].as<IntImm>();
-        if (str->value == "factor" && factor != nullptr && factor->value > 1) {
-          unroll_factor = factor->value;
-          break;
-        }
-      }
-      i++;
-    }
-    os << "#pragma unroll";
-    if (unroll_factor > 0) os << " " << unroll_factor << "\n";
-    else                   os << "\n";
-  }
-  else if (op->for_type == ForType::Pipelined) {
-    int II = 1, i = 0;
-    for (auto key : op->annotate_keys) {
-      if (auto str = key.as<StringImm>()) {
-        auto initiation_interval = op->annotate_values[i].as<IntImm>();
-        if (str->value == "initiation_interval" &&
-            initiation_interval != nullptr &&
-            initiation_interval->value > 1) {
-          II = initiation_interval->value;
-          break;
-        }
-      }
-      i++;
-    }
-    os << "#pragma";
-    os << " ii " << II << "\n";
-  }
-  CodeGenAOCL::GenForStmt(op, os.str(), true);
-}
-
-void CodeGenAOCL::VisitExpr_(const StreamExpr* op, std::ostream& os) {
-  std::string vid;
-  if (!var_idmap_.count(op->buffer_var.get())) 
-    vid = AllocVarID(op->buffer_var.get());
-  else vid = GetVarID(op->buffer_var.get());
-  int i = 0;
-  for (auto key : op->annotate_keys) {
-    auto str = key.as<StringImm>();
-    auto val = op->annotate_values[i].as<StringImm>();
-    if (str->value == "name" && val != nullptr) {
-        vid = val->value;
-        decl_stream << "channel ";
-        PrintType(op->type, decl_stream);
-        decl_stream << " " << vid << ";\n";
-    }
-    i++;
-  }
-  switch (op->stream_type) {
-    case StreamType::Channel:
-      os << "read_channel_intel(";
-      os << vid << ")";
-      break;
-    case StreamType::Pipe:
-      os << "read_pipe(";
-      break;
-    case StreamType::FIFO:
-      // buffered channel  
-      os << "fifo";
-      break;
-  }
-}
-
-void CodeGenAOCL::VisitStmt_(const KernelDef* op) {
-  LoweredFunc f;
-  SaveFuncState(f);
-  InitFuncState(f);
-  std::ostringstream save;
-  save << this->stream.str();
-  this->stream.str("");
-  this->stream.clear();
-
-  // skip the first underscore
-  GetUniqueName("_");
-  // add to alloc buffer : type.
-  for (const auto & k : op->args) {
-    RegisterHandleType(k.get(), k.get()->type);
-  }
-  stream << "__kernel ";
-  const UIntImm* is_void = op->ret_void.as<UIntImm>();
-  if (is_void) stream << "void";
-  else PrintType(op->ret_type, stream);
-  stream << " " << op->name << "(";
-
-  // streamed arg position to channel index
-  std::unordered_map<int, int> stream_args;
-  for (size_t j = 0; j < op->channels.size(); j=j+2) {
-    int pos = op->channels[j].as<IntImm>()->value;
-    int idx = op->channels[j+1].as<IntImm>()->value;
-    stream_args[pos] = idx;
-  } 
-  for (size_t i = 0; i < op->args.size(); ++i) {
-    VarExpr v = op->args[i];
-    var_shape_map_[v.get()] = op->api_args[i];
-    std::string vid = AllocVarID(v.get());
-    if (stream_args.count(i)) { 
-      stream_arg_pos[op->name].insert(i); 
-      if (!stream_pragma) {
-        decl_stream << "#pragma OPENCL EXTENSION cl_intel_channels : enable\n";
-        stream_pragma = true;
-      }
-    } else {
-      if (i != 0) {
-        if (stream_args.count(i-1)) void(0);
-        else stream << ", ";
-      } // un-streamed argument 
-      this->stream << "__global ";
-      std::string str = PrintExpr(op->api_types[i]);
-      Type type = String2Type(str);
-      PrintType(type, stream);
-      this->stream << "* restrict " << vid;
-    }
-  }  
-  stream << ") {\n";
-  int func_scope = BeginScope();
-  range_ = CollectIterRange(op->body);
-  PrintStmt(op->body);
-  EndScope(func_scope);
-  stream << "}\n\n";
-
-  // restore default stream
-  module_stream << this->stream.str();
-  this->stream.str(""); 
-  this->stream.clear();
-  this->stream << save.str();
-  RestoreFuncState(f);
-}
-
-void CodeGenAOCL::VisitStmt_(const KernelStmt *op) {
-  PrintIndent();
-  stream << op->name << "(";
-  for (size_t i = 0; i < op->args.size(); i++) {
-    std::string str = op->name + "." + PrintExpr(op->args[i]);
-    if (!stream_arg_pos[op->name].count(i)) {
-      if (i != 0) {
-        if (stream_arg_pos[op->name].count(i-1)) void(0);
-        else stream << ", ";
-      }
-      PrintExpr(op->args[i], stream);
-    }
-  }
-  stream << ");\n";
-}
-
-void CodeGenAOCL::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*)
-  os << op->name << "(";
-  for (size_t i = 0; i < op->args.size(); ++i) {
-    if (!stream_arg_pos[op->name].count(i)) {
-      if (i != 0) {
-        if (stream_arg_pos[op->name].count(i-1)) void(0);
-        else stream << ", ";
-      }
-      PrintExpr(op->args[i], stream);
-    }
-  }
-  os << ")";
-}
-
-void CodeGenAOCL::VisitStmt_(const StreamStmt* op) {
-  std::string vid;
-  if (!var_idmap_.count(op->buffer_var.get())) 
-    vid = AllocVarID(op->buffer_var.get());
-  else vid = GetVarID(op->buffer_var.get());
-  PrintIndent();
-  int i = 0;
-  for (auto key : op->annotate_keys) {
-    auto str = key.as<StringImm>();
-    auto val = op->annotate_values[i].as<StringImm>();
-    if (str->value == "name" && val != nullptr) vid = val->value;
-    i++;
-  }
-  switch (op->stream_type) {
-    case StreamType::Channel:
-      stream << "write_channel_intel(";
-      stream << vid << ", ";
-      break;
-    case StreamType::Pipe:
-      stream << "write_pipe(";
-      stream << vid << ", ";
-      break;
-    case StreamType::FIFO:
-      stream << "fifo(";
-      break;
-  }
-  PrintExpr(op->value, stream);
-  stream << ");\n";
-}
-
-} // namespace codegen
-} // namespace TVM
diff --git a/tvm/src/codegen/opencl/codegen_aocl.h b/tvm/src/codegen/opencl/codegen_aocl.h
deleted file mode 100755
index 5778b70ec..000000000
--- a/tvm/src/codegen/opencl/codegen_aocl.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef TVM_CODEGEN_CODEGEN_AOCL_H_
-#define TVM_CODEGEN_CODEGEN_AOCL_H_
-
-# include <tvm/codegen.h>
-# include <tvm/packed_func_ext.h>
-# include "./codegen_opencl.h"
-
-namespace TVM {
-namespace codegen {
-
-class CodeGenAOCL : public CodeGenOpenCL {
- public:
-  CodeGenAOCL(){}
-  void AddFunction(LoweredFunc f, str2tupleMap<std::string, Type> map_arg_type);
-  void PrintType(Type t, std::ostream& os) override; //NOLINT(*)
-
-  void VisitStmt_(const For* op) override; //NOLINT(*)
-  void VisitStmt_(const StreamStmt* op) override; //NOLINT(*)
-  void VisitStmt_(const KernelDef* op) override; //NOLINT(*)
-  void VisitStmt_(const KernelStmt* op) override; //NOLINT(*)
-
-  void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*)
-  void VisitExpr_(const KernelExpr* op, std::ostream& os) override; //NOLINT(*)
-
- private:
-  // whether to enable streaming
-  bool stream_pragma{false}; 
-  // map from kernel name to set of streamed arg position index
-  std::unordered_map<std::string, std::unordered_set<int>> stream_arg_pos;
-};
-} // namespace codegen
-} // namespace TVM
-
-#endif // TVM_CODEGEN_CODEGEN_AOCL_H_
diff --git a/tvm/src/codegen/opencl/codegen_opencl.h b/tvm/src/codegen/opencl/codegen_opencl.h
deleted file mode 100755
index 4f9a15fe5..000000000
--- a/tvm/src/codegen/opencl/codegen_opencl.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_
-#define TVM_CODEGEN_CODEGEN_OPENCL_H_
-
-# include <tvm/codegen.h>
-# include <tvm/packed_func_ext.h>
-# include <string>
-# include "../codegen_c.h"
-
-namespace TVM{
-namespace codegen{
-
-class CodeGenOpenCL : public CodeGenC{
-  public:
-    // void AddFunction(LoweredFunc f);
-    CodeGenOpenCL();
-    virtual void AddFunction(LoweredFunc f, str2tupleMap<std::string, Type> map_arg_type) = 0;
-    std::string Finish();
-    void BindThreadIndex(const IterVar& iv) override; // NOLINT(*)
-    void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*)
-    void PrintStorageSync(const Call* op) override; //NOLINT(*)
-    // void PrintType(Type t, std::ostream& os) override; //NOLINT(*)
-    virtual void PrintType(Type t, std::ostream& os) = 0; //NOLINT
-    std::string GetVecLoad(Type t, const Variable * buffer, 
-                           Expr base) override; // NOLINT(*)
-    void PrintVecStore(const Variable * buffer, Type t,
-                       Expr base, const std::string& value) override; //NOLINT(*)
-    void PrintVecAddr(const Variable * buffer, Type t,
-                      Expr base, std::ostream& os); //NOLINT(*)
-    std::string CastFromTo(std::string value, Type from, Type target) override; //NOLINT(*)
-
-    //overload visitor
-    void VisitExpr_(const Broadcast * op, std::ostream& os) override; //NOLINT(*)
-    void VisitExpr_(const Call * op, std::ostream& os) override; //NOLINT(*)
-    void VisitExpr_(const Select * op, std::ostream& os) override; //NOLINT(*)
-    void VisitExpr_(const FloatImm * op, std::ostream& os) override; //NOLINT(*)
-    void VisitStmt_(const IfThenElse* op) override; //NOLINT(*)
-    void VisitStmt_(const LetStmt* op) override; // NOLINT
-    void GenForStmt(const For* op, std::string pragma, bool before);
-    virtual void VisitStmt_(const For* op) = 0;
-
-protected:
-  // fp16 and fp64 extension
-  bool enable_fp16_{false};
-  bool enable_fp64_{false};
-};
-
-} // namespace codegen
-} // namespace TVM
-
-#endif
diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.cc b/tvm/src/codegen/opencl/codegen_sdaccel.cc
deleted file mode 100644
index cba08fa2d..000000000
--- a/tvm/src/codegen/opencl/codegen_sdaccel.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-# include <tvm/runtime/config.h>
-# include <tvm/packed_func_ext.h>
-# include <vector>
-# include <string>
-# include "./codegen_sdaccel.h"
-# include "../../runtime/thread_storage_scope.h"
-
-namespace TVM {
-namespace codegen {
-
-void CodeGenSDACCEL::AddFunction(LoweredFunc f,
-        str2tupleMap<std::string, Type> map_arg_type) {
-  // Clear previous generated state
-  this->InitFuncState(f);
-  for (Var arg: f->args) {
-      if (arg.type().is_handle()) {
-          alloc_storage_scope_[arg.get()] = "global";
-      }
-  }
-
-  // Skip the first underscore, so SSA variable starts from _1
-  GetUniqueName("_");
-
-  // Register alloc buffer type
-  for (const auto & kv : f->handle_data_type) {
-    RegisterHandleType(kv.first.get(), kv.second.type());
-  }
-
-  this->stream << "__kernel " << "void " << f->name << "(";
-
-  // Write arguments
-  for (size_t i = 0; i < f->args.size(); ++i) {
-    Var v = f->args[i];
-    std::string vid = AllocVarID(v.get());
-    if (i != 0) this->stream << ", ";
-    if (map_arg_type.find(vid) == map_arg_type.end()) {
-      LOG(WARNING) << vid << " type not found\n";
-      PrintType(v.type(), this->stream);
-      this->stream << ' ' << vid;
-    }
-    else {
-      auto arg = map_arg_type[vid];
-      this->stream << "__global ";
-      // this->stream << "global ";
-      PrintType(std::get<1>(arg), this->stream);
-      if (v.type().is_handle())
-        this->stream << "*";
-      this->stream << ' ' << std::get<0>(arg);
-    }
-  }
-  stream << ") {\n";
-  int func_scope = this->BeginScope();
-  this->PrintStmt(f->body);
-  this->EndScope(func_scope);
-  this->PrintIndent();
-  // this->stream << ' '<< ' ' << "return;\n";
-  this->stream << "}\n\n";
-}
-
-void CodeGenSDACCEL::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
-  int lanes = t.lanes();
-  if (t.is_handle()) {
-    //LOG(FATAL) << "The buffer shouldn't call PrintType for printing type";
-    os << "void*";
-    return ;
-  }
-  bool fail = false;
-  if (t.is_float()) {
-    switch (t.bits()) {
-      case 16: os << "half"; break;
-      case 32: os << "float"; break;
-      case 64: os << "double"; break;
-      // case 128: os << "double double"; break;
-      default: fail = true; break;
-    }
-    if (!fail && lanes == 1) return;
-    if (!fail && (lanes >= 2 && lanes <= 16)) {
-      os << lanes; return;
-    }
-  } else if (t.is_uint() || t.is_int()) {
-    if (t.is_uint()) {
-      os << "unsigned ";
-    }
-    if (t.bits() == 8 && t.lanes() == 4) {
-      // directly 4 8 bit int in integer.
-      os << "int"; return;
-    }
-
-    int target_bit = 1;
-    while (target_bit < t.bits())
-      target_bit <<= 1;
-
-    switch (target_bit) {
-      case 1: os << "int"; break;
-      case 2: os << "char"; break;
-      case 4: os << "char"; break;
-      case 8: os << "char"; break;
-      case 16: os << "short"; break;
-      case 32: os << "int"; break;
-      case 64: os << "long"; break;
-      case 128: os << "long"; break; // FIXME: Should use long long
-      default: fail = true; break;
-    }
-    if (!fail && lanes == 1) return;
-    // FIXME: Not yet support multiple lanes
-    //if (!fail && (lanes >= 2 && lanes <= 16)) {
-    //  os << lanes; return;
-    //}
-  }
-  os << t;
-  LOG(WARNING) << "Cannot convert type " << t ;
-  return ;
-}
-
-void CodeGenSDACCEL::PrintStorageScope(
-    const std::string& scope, std::ostream& os) { // NOLINT(*)
-  if (scope == "global" || scope == "shared") {
-    os << "__local ";
-  }
-}
-
-void CodeGenSDACCEL::VisitStmt_(const For* op) {
-  std::ostringstream os;
-  if (op->for_type == ForType::Unrolled) {
-    int unroll_factor = 0, i = 0;
-    for (auto key : op->annotate_keys) {
-      if (auto str = key.as<StringImm>()) {
-        auto factor = op->annotate_values[i].as<IntImm>();
-        if (str->value == "factor" && factor != nullptr && factor->value > 1) {
-          unroll_factor = factor->value;
-          break;
-        }
-      }
-      i++;
-    }
-    if (unroll_factor > 0) {
-        os << "__attribute__((opencl_unroll_hint(";
-        os << unroll_factor << ")))\n";
-    } else {
-      os << "\n";
-    }
-  }
-  else if (op->for_type == ForType::Pipelined) {
-    int II = 1, i = 0;
-    for (auto key : op->annotate_keys) {
-      if (auto str = key.as<StringImm>()) {
-        auto initiation_interval = op->annotate_values[i].as<IntImm>();
-        if (str->value == "initiation_interval" &&
-            initiation_interval != nullptr &&
-            initiation_interval->value > 1) {
-          II = initiation_interval->value;
-          break;
-        }
-      }
-      i++;
-    }
-    os << "__attribute__((xcl_pipeline_loop(";
-    os << II << ")))\n";
-  }
-  CodeGenSDACCEL::GenForStmt(op, os.str(), true);
-}
-
-void CodeGenSDACCEL::VisitStmt_(const Partition* op) {
-  std::string vid = GetVarID(op->buffer_var.get());
-  stream << vid << " ";
-  if (op->partition_type != PartitionType::Complete) {
-    stream << "__attribute__((xcl_array_partition(";
-    switch (op->partition_type) {
-      // case PartitionType::Complete:
-      //   stream << "complete,";
-      //   break;
-      case PartitionType::Block:
-        stream << "block,";
-        break;
-      case PartitionType::Cyclic:
-        stream << "cyclic,";
-        break;
-      }
-    stream << op->factor << ",";
-    stream << op->dim << ")))\n";
-  } else {
-    if (op->dim == 0) {
-      stream << "__attribute__((xcl_array_partition))\n";
-    } else {
-      stream << "__attribute__((xcl_array_partition(";
-      stream << "complete,";
-      stream << op->factor << ",";
-      stream << op->dim << ")))\n";
-      }
-    }
-}
-
-void CodeGenSDACCEL::VisitStmt_(const StreamStmt* op) {
-  std::string vid = GetVarID(op->buffer_var.get());
-  PrintIndent();
-  stream << vid;
-  switch (op->stream_type) {
-    case StreamType::Channel:
-      stream << "[channel]";
-      break;
-    case StreamType::FIFO:
-      stream << "[fifo]";
-      break;
-    case StreamType::Pipe:
-      stream << "[pipe]";
-      break;
-  }
-  stream << ".write";
-  PrintExpr(op->value, stream);
-  stream << ";\n";
-}
-
-void CodeGenSDACCEL::VisitExpr_(const StreamExpr* op, std::ostream& os) {
-  std::string vid = GetVarID(op->buffer_var.get());
-  os << vid << ".read()";
-}
-
-} // namespace codegen
-} // namespace TVM
diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.h b/tvm/src/codegen/opencl/codegen_sdaccel.h
deleted file mode 100755
index 4f1cfa053..000000000
--- a/tvm/src/codegen/opencl/codegen_sdaccel.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef TVM_CODEGEN_CODEGEN_SDACCEL_H_
-#define TVM_CODEGEN_CODEGEN_SDACCEL_H_
-
-# include <tvm/codegen.h>
-# include <tvm/packed_func_ext.h>
-# include "./codegen_opencl.h"
-
-namespace TVM {
-namespace codegen {
-
-class CodeGenSDACCEL : public CodeGenOpenCL {
-  public:
-    CodeGenSDACCEL(){}
-    void AddFunction(LoweredFunc f, str2tupleMap<std::string, Type> map_arg_type);
-
-    void PrintType(Type t, std::ostream& os) override; //NOLINT(*)
-    void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*)
-
-    void VisitStmt_(const For* op) override; //NOLINT(*)
-    void VisitStmt_(const Partition* op) override; //NOLINT(*)
-    void VisitStmt_(const StreamStmt* op) override; //NOLINT(*)
-
-    void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*)
-   
-};
-} // namespace codegen
-} // namespace TVM
-
-#endif // TVM_CODEGEN_CODEGEN_SDACCEL_H_
diff --git a/tvm/src/codegen/opencl/sdaccel_module.cc b/tvm/src/codegen/opencl/sdaccel_module.cc
deleted file mode 100644
index 63f12e86b..000000000
--- a/tvm/src/codegen/opencl/sdaccel_module.cc
+++ /dev/null
@@ -1,645 +0,0 @@
-#include "./sdaccel_module.h"
-#include <fstream>
-#include <unistd.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <iostream>
-#include <cstring>
-#include <typeinfo>
-
-namespace TVM {
-namespace runtime {
-
-namespace {
-
-void PrintIndent(std::ofstream& stream, int indent) {
-  for (int i = 0; i < indent; i++)
-    stream << ' ';
-}
-
-inline size_t GetTypeSize(TVMType t) {
-  size_t byte = (t.bits + 7) / 8;
-  if (byte > 2){
-    if (byte <= 4) byte = 4;
-    else if (byte <= 8) byte = 8;
-    else byte = 16;
-  }
-  return byte;
-}
-
-inline size_t GetDataSize(TVMArray* arr) {
-  size_t size = 1;
-  for (tvm_index_t i = 0; i < arr->ndim; ++i) {
-    size *= arr->shape[i];
-  }
-  size_t byte = (arr->dtype.bits + 7) / 8;
-  if (byte > 2){
-    if (byte <= 4) byte = 4;
-    else if (byte <= 8) byte = 8;
-    else byte = 16;
-  }
-  size *= (byte * 8 * arr->dtype.lanes + 7) / 8;
-  return size;
-}
-
-inline TVMType Type2TVMType(Type t) {
-  TVMType tt;
-  if (t.is_int())        tt.code = kDLInt;
-  else if (t.is_uint())  tt.code = kDLUInt;
-  else if (t.is_float()) tt.code = kDLFloat;
-  else                   LOG(FATAL) << "Unacceptable type: " << t;
-  tt.bits = static_cast<uint8_t>(t.bits());
-  tt.fracs = static_cast<uint8_t>(t.fracs());
-  return tt;
-}
-
-inline std::string Type2Str(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLInt) {
-    str += "int";
-  } else if (t.code == kDLUInt) {
-    str += "unsigned int";
-  } else if (t.code == kDLFloat) {
-    str += "float";
-  } else {
-    LOG(FATAL) << "Unknown type";
-  }
-  return str;
-}
-
-inline std::string Type2ExtStr(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLInt) {
-    if (t.fracs > 0) str += "ap_fixed<";
-    else             str += "ap_int<";
-    str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLUInt) {
-    if (t.fracs > 0) str += "ap_ufixed<";
-    else             str += "ap_uint<";
-    str += std::to_string(static_cast<int>(t.bits + t.fracs));
-    if (t.fracs > 0) str += ", " + std::to_string(static_cast<int>(t.bits)) + ">";
-    else             str += ">";
-  } else if (t.code == kDLFloat) {
-    str += "float";
-  } else {
-    LOG(FATAL) << "Unknown type";
-  }
-  return str;
-}
-
-inline std::string Type2Byte(TVMType t) {
-  std::string str = "";
-  if (t.code == kDLFloat) {
-    str += "float";
-  } else if (t.code == kDLInt || t.code == kDLUInt) {
-    if (t.code == kDLUInt) str += "unsigned";
-    str += "int";
-    if      (t.bits <= 8)  str += "8";
-    else if (t.bits <= 16) str += "16";
-    else if (t.bits <= 32) str += "32";
-    else                   str += "64";
-    // str += "_t";
-  }
-  return str;
-}
-
-void CollectArgInfo(TVMArgs& args, 
-                    LoweredFunc func,
-                    std::vector<size_t>& arg_sizes,
-                    std::vector<TVMType>& arg_types) {
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      arg_sizes.push_back(GetDataSize(arr));
-      arg_types.push_back(arr->dtype);
-    } else {
-      const Variable* var = func->api_args[i].as<Variable>();
-      TVMType t = Type2TVMType(var->type);
-      arg_sizes.push_back(GetTypeSize(t));
-      arg_types.push_back(t);
-    }
-  }
-}
-
-void GenSharedMem(TVMArgs& args,
-                  std::vector<int>& shmids,
-                  std::vector<size_t>& arg_sizes) {
-  for (int i = 0; i < args.size(); i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      // generate shared memory key and id
-      // TODO: maybe get the current path??
-      key_t key = ftok("/", i+1);
-      int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT);
-      shmids.push_back(shmid);
-      // copy mem from TVM args to the shared memory
-      void* mem = shmat(shmid, nullptr, 0);
-      memcpy(mem, arr->data, arg_sizes[i]);
-    } else {
-      shmids.push_back(0);
-    }
-  }
-}
-
-void FreeSharedMem(TVMArgs& args, 
-                   const std::vector<int>& shmids,
-                   std::vector<size_t>& arg_sizes) {
-  for (size_t i = 0; i < shmids.size(); i++) {
-      TVMArray* arr = args[i];
-      int shmid = shmids[i];
-      void* mem = shmat(shmid, nullptr, 0);
-      memcpy(arr->data, mem, arg_sizes[i]);
-      shmdt(mem);
-      shmctl(shmid, IPC_RMID, nullptr);
-  }
-}
-
-// copy values from the shared mem to local mem
-void PrintCopy(TVMArray* arr, 
-               std::ofstream& stream, 
-               int indent, size_t nth_arr) {
-  for (int i = 0; i < arr->ndim; i++) {
-    PrintIndent(stream, indent);
-    stream << "for (size_t i" << i << " = 0; ";
-    stream << "i" << i << " < " << arr->shape[i] << "; ";
-    stream << "i" << i << "++) {\n";
-    indent += 2;
-    if (i == arr->ndim-1) {
-      PrintIndent(stream, indent);
-      stream << "source_" << nth_arr;
-      stream << "[i" << arr->ndim-1;
-      int mul = 1;
-      for (int j = arr->ndim-2;j >= 0;j--) {
-        mul *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul;
-      }
-      stream << "] = ";
-      stream << "arg_" << nth_arr;
-      stream << "[i" << arr->ndim - 1;
-
-      int mul2 = 1;
-      for (int j = arr->ndim-2;j >= 0;j--) {
-        mul2 *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul2;
-      }
-      stream << "]";
-      if (arr->dtype.fracs > 0)
-        stream << " >> " << static_cast<int>(arr->dtype.fracs);
-      stream << ";\n";
-    }
-  }
-  for (int i = 0; i < arr->ndim; i++) {
-    indent -= 2;
-    PrintIndent(stream, indent);
-    stream << "}\n";
-  }
-}
-
-// copy values from local mem back to shared mem
-void PrintCopyBack(TVMArray* arr, 
-                   std::ofstream& stream, 
-                   int indent, size_t nth_arr) {
-  for (int i = 0; i < arr->ndim; i++) {
-    PrintIndent(stream, indent);
-    stream << "for (size_t i" << i << " = 0; ";
-    stream << "i" << i << " < " << arr->shape[i] << "; ";
-    stream << "i" << i << "++) {\n";
-    indent += 2;
-    if (i == arr->ndim-1) {
-      PrintIndent(stream, indent);
-      stream << "arg_" << nth_arr;
-      stream << "[i" << arr->ndim-1;
-      int mul = 1;
-      for (int j = arr->ndim-2; j >= 0; j--) {
-        mul *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul;
-      }
-      stream << "] = ";
-      // stream << Type2ExtStr(arr->dtype);
-      stream << "source_" << nth_arr;
-      stream << "[i" << arr->ndim - 1;
-      int mul2 = 1;
-      for (int j = arr->ndim-2;j >=0;j--) {
-        mul2 *= arr->shape[j+1];
-        stream << " + i" << j << "*" << mul2;
-      }
-      stream << "]";
-      if (arr->dtype.fracs > 0)
-        stream << " << " << static_cast<int>(arr->dtype.fracs);
-      stream << ";\n";
-    }
-  }
-  for (int i = 0; i < arr->ndim; i++) {
-    indent -= 2;
-    PrintIndent(stream, indent);
-    stream << "}\n";
-  }
-}
-
-void GenMakFile() {
-  int indent = 0;
-  std::ofstream stream;
-  stream.open("sdaccel.mk");
-  indent += 4;
-
-  stream << "ifndef XILINX_SDX\n";
-  stream << "$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)\n";
-  stream << "endif\n";
-
-  stream << "SDA_FLOW = cpu_emu\n";
-  stream << "HOST_SRCS = host.cpp\n";
-  stream << "HOST_EXE_DIR=.\n";
-  stream << "HOST_EXE = host\n";
-  stream << "HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL\n";
-  stream << "HOST_LFLAGS = \n";
-  stream << "KERNEL_SRCS = default_function.cl\n";
-  stream << "KERNEL_NAME = default_function\n";
-  stream << "KERNEL_DEFS =\n";
-  stream << "KERNEL_INCS =\n";
-  stream << "XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0\n";
-  stream << "XDEVICE_REPO_PATH=\n";
-  stream << "KEEP_TEMP=1\n";
-  stream << "KERNEL_DEBUG=\n";
-  stream << "XCLBIN_NAME=bin_krnl\n";
-  stream << "HOST_CFLAGS+=-DTARGET_DEVICE=\\\"${XDEVICE}\\\"\n";
-  stream << "BOARD_SETUP_FILE=setup.sh\n";
-  stream << "ifeq (${SDA_FLOW},cpu_emu)\n";
-  PrintIndent(stream, indent);
-  stream << "CLCC_OPT += -t sw_emu\n";
-  PrintIndent(stream, indent);
-  stream << "XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin\n"; 
-  stream << "else ifeq (${SDA_FLOW},hw_emu)\n";
-  PrintIndent(stream, indent);
-  stream << "CLCC_OPT += -t hw_emu\n";
-  PrintIndent(stream, indent);
-  stream << "XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin\n";
-  stream << "else ifeq (${SDA_FLOW},hw)\n";
-  PrintIndent(stream, indent);
-  stream << "XCLBIN = ${XCLBIN_NAME}_hw.xclbin\n";
-  stream << "CLCC_OPT += -t hw\n";
-  stream << "endif\n";
-
-  stream << "HOST_ARGS = ${XCLBIN}\n";
-  stream << "COMMON_DIR = ./common\n";
-  stream << "include ${COMMON_DIR}/common.mk\n";
-
-  stream.close();
-}
-
-void GenCommonFile() {
-  int indent = 0;
-  std::ofstream stream;
-  stream.open("./common/common.mk");
-  indent += 4;
-  stream << "SHELL = /bin/bash\n";
-  stream << "VPATH = ./\n";
-  stream << "CC = xcpp\n";
-  stream << "CLCC = xocc\n";
-  stream << "ifeq ($(XDEVICE_REPO_PATH),)\n";
-  PrintIndent(stream, indent);
-  stream << "DEVICE_REPO_OPT = \n";
-  stream << "else\n";
-  stream << "DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}\n";
-  stream << "endif\n";
-  stream << "HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2\n";
-  stream << "HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread\n";
-  stream << "CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}\n";
-  stream << "ifeq (${KEEP_TEMP},1)\n";
-  PrintIndent(stream, indent);
-  stream << "CLCC_OPT += -s\n";
-  stream << "endif\n";
-  stream << "ifeq (${KERNEL_DEBUG},1)\n";
-  PrintIndent(stream, indent);
-  stream << "CLCC_OPT += -g\n";
-  stream << "endif\n";
-  stream << "CLCC_OPT += --kernel ${KERNEL_NAME}\n";
-  stream << "OBJECTS := $(HOST_SRCS:.cpp=.o)\n";
-  stream << ".PHONY: all\n";
-  stream << "all: run\n";
-
-  stream << "host: ${HOST_EXE_DIR}/${HOST_EXE}\n";
-  stream << "xbin_cpu_em:\n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=cpu_emu xbin -f sdaccel.mk\n";
-  stream << "xbin_hw_em:\n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=hw_emu xbin -f sdaccel.mk\n";
-  stream << "xbin_hw :\n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=hw xbin -f sdaccel.mk\n";
-  stream << "xbin: ${XCLBIN}\n";
-  stream << "run_cpu_em: \n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=cpu_emu run_em -f sdaccel.mk\n";
-  stream << "run_hw_em: \n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=hw_emu run_em -f sdaccel.mk\n";
-  stream << "run_hw : \n";
-  PrintIndent(stream, indent);
-  stream << "make SDA_FLOW=hw run_hw_int -f sdaccel.mk\n";
-  stream << "run_em: xconfig host xbin\n";
-  PrintIndent(stream, indent);
-  stream << "XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n";
-  stream << "run_hw_int : host xbin_hw\n";
-  PrintIndent(stream, indent);
-  stream << "source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n";
-  stream << "estimate : \n";
-  PrintIndent(stream, indent);
-  stream << "${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}\n";
-  stream << "xconfig : emconfig.json\n";
-  stream << "emconfig.json :\n";
-  PrintIndent(stream, indent);
-  stream << "emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .\n";
-  stream << "${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}\n";
-  PrintIndent(stream, indent);
-  stream << "${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@\n";
-  stream << "${XCLBIN}:\n";
-  PrintIndent(stream, indent);
-  stream << "${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}\n";
-  stream << "%.o: %.cpp\n";
-  PrintIndent(stream, indent);
-  stream << "${CC} ${HOST_CFLAGS} -c $< -o $@\n";
-  stream << "clean:\n";
-  PrintIndent(stream, indent);
-  stream << "${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil\n";
-  stream << "cleanall: clean\n";
-  PrintIndent(stream, indent);
-  stream << "${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou\n";
-
-  stream.close();
-}
-
-void GenHostCode(TVMArgs& args,
-                 const std::vector<int>& shmids,
-                 const std::vector<TVMType>& arg_types,
-                 LoweredFunc func,
-                 std::string test_file) {
-  int indent = 0;
-  std::ofstream stream;
-  stream.open("host.cpp");
-  indent += 2;
-
-  stream << "#define CL_HPP_CL_1_2_DEFAULT_BUILD\n";
-  stream << "#define CL_HPP_TARGET_OPENCL_VERSION 120\n";
-  stream << "#define CL_HPP_MINIMUM_OPENCL_VERSION 120\n";
-  stream << "#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1\n";
-  stream << "#include <CL/cl2.hpp>\n";
-  stream << "#include <fstream>\n";
-  stream << "#include <sys/types.h>\n";
-  stream << "#include <sys/stat.h>\n";
-  stream << "#include <fcntl.h>\n";
-  stream << "#include <unistd.h>\n";
-  stream << "#include <stdlib.h>\n";
-  stream << "#include <stdio.h>\n";
-  stream << "#include <cstring>\n";
-  stream << "#include <iostream>\n";
-  stream << "#include <iomanip>\n";
-  // stream << "#include <math.h>\n";
-  stream << "#include <cmath>\n";
-  stream << "#include <sys/ipc.h>\n";
-  stream << "#include <sys/shm.h>\n";
-  stream << "#pragma once\n";
-  stream << "\n\n";
-  
-  // stream << test_file;
-  stream << "\n\n";
-
-  stream << "int main(void) { \n";
-
-  stream << "#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)\n";
-  indent += 2;
-  stream << "  #define STR_VALUE(arg) #arg\n";
-  stream << "  #define GET_STRING(name) STR_VALUE(name)\n";
-  stream << "  #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)\n";
-  stream << "#endif\n";
-
-  // get the krnl code
-  PrintIndent(stream, indent);
-  stream << "char* xclbinFilename = argv[1];\n";
-  stream << "\n";
-
-  for (int i = 0;i < args.size();i++) {
-    PrintIndent(stream, indent);
-    stream << "std::vector<" << Type2Str(arg_types[i]);
-    stream << "> ";
-    stream << "source_" << i << "(";
-    TVMArray* arr = args[i];
-    for (int j = 0;j < arr->ndim;j++) {
-      if (j == arr->ndim-1) {
-        stream << arr->shape[j] << ")";
-      } else {
-        // stream << " * " << arr->shape[j] << ")";
-        stream << arr->shape[j] << " * ";
-      }
-    }
-    stream << ";\n";
-  }
-  stream << "\n";
-
-  for (int i = 0;i < args.size();i++) {
-    PrintIndent(stream, indent);
-    stream << "size_t vector_size_bytes_" << i;
-    stream << " = sizeof(" << Type2Str(arg_types[i]);
-    stream << ")";
-    TVMArray* arr = args[i];
-    for (int j = 0;j < arr->ndim;j++) {
-      stream << " * " << arr->shape[j];
-    }
-    stream << ";\n";
-  }
-  stream << "\n";
-
-  for (int i = 0;i < args.size();i++ ) {
-    // read from the shared memory
-    PrintIndent(stream, indent);
-    stream << Type2Str(arg_types[i]) << "* ";
-    stream << "arg_" << i << " = ";
-    stream << "(" << Type2Str(arg_types[i]) << "*)";
-    stream << "shmat(" << shmids[i] << ", nullptr, 0);\n";
-    TVMArray* arr = args[i];
-    // copy from shared mem  
-    PrintCopy(arr, stream, indent, i);
-  }
-
-  // Getting First Platform
-  PrintIndent(stream, indent);
-  stream << "std::vector<cl::Platform> platforms;\n";
-  PrintIndent(stream, indent);
-  stream << "cl::Platform::get(&platforms);\n";
-  PrintIndent(stream, indent);
-  stream << "cl::Platform platform = platforms[0];\n";
-  stream << "\n";
-
-  // Getting ACCELERATOR Devices and selecting 1st such device
-  PrintIndent(stream, indent);
-  stream << "std::vector<cl::Device> devices;\n";
-  PrintIndent(stream, indent);
-  stream << "platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);\n";
-  PrintIndent(stream, indent);
-  stream << "cl::Device device = devices[0];\n";
-  stream << "\n";
-
-  // Creating Context and Command Queue for selected Device
-  PrintIndent(stream, indent);
-  stream << "cl::Context context(device);\n";
-  PrintIndent(stream, indent);
-  stream << "cl::CommandQueue q(context, device);\n";
-  stream << "\n";
-
-  // Loading XCL Bin into char buffer
-  PrintIndent(stream, indent);
-  stream << "std::ifstream bin_file(xclbinFilename, std::ifstream::binary);\n";
-  PrintIndent(stream, indent);
-  stream << "bin_file.seekg (0, bin_file.end);\n";
-  PrintIndent(stream, indent);
-  stream << "unsigned nb = bin_file.tellg();\n";
-  PrintIndent(stream, indent);
-  stream << "bin_file.seekg (0, bin_file.beg);\n";
-  PrintIndent(stream, indent);
-  stream << "char *buf = new char [nb];\n";
-  PrintIndent(stream, indent);
-  stream << "bin_file.read(buf, nb);\n";
-  stream << "\n";
-
-  // Creating Program from Binary File
-  PrintIndent(stream, indent);
-  stream << "cl::Program::Binaries bins;\n";
-  PrintIndent(stream, indent);
-  stream << "bins.push_back({buf,nb});\n";
-  PrintIndent(stream, indent);
-  stream << "devices.resize(1);\n";
-  PrintIndent(stream, indent);
-  stream << "cl::Program program(context, devices, bins);\n";
-  stream << "\n";
-
-  // Creating Kernel and Functor of Kernel
-  PrintIndent(stream, indent);
-  stream << "int err1;\n";
-  PrintIndent(stream, indent);
-  stream << "cl::Kernel kernel(program, \"default_function\", &err1);\n";
-  PrintIndent(stream, indent);
-  stream << "auto default_function = cl::KernelFunctor<";
-  for (int i = 0;i < args.size();i++) {
-    if (i == args.size() - 1) {
-      stream << "cl::Buffer&>(kernel);\n";
-    } else {
-      stream << "cl::Buffer&, ";
-    }
-  }
-  stream << "\n";
-
-  // Creating Buffers inside Device
-  for (int i = 0;i < args.size();i++) {
-    PrintIndent(stream, indent);
-    stream << "cl::Buffer buffer_" << i;
-    stream << "(context, CL_MEM_READ_WRITE, vector_size_bytes_" << i << ");\n";
-  }
-  stream << "\n";
-
-  // Copying input data to Device buffer from host memory
-  for (int i = 0;i < args.size();i++) {
-    PrintIndent(stream, indent);
-    stream << "q.enqueueWriteBuffer(buffer_" << i;
-    stream << ", CL_TRUE, 0, vector_size_bytes_" << i;
-    stream << ", source_" << i << ".data());\n"; 
-  }
-  stream << "\n";
-
-  // Running Kernel
-  PrintIndent(stream, indent);
-  stream << func->name << "(";
-  stream << "cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),";
-  for (int i = 0; i < args.size(); i++) {
-    stream << "buffer_" << i;
-    if (i != args.size()-1) 
-      stream << ", ";
-  }
-  stream << ");\n";
-  PrintIndent(stream, indent);
-  stream << "q.finish();\n";
-  stream << "\n";
-
-  // Copying Device result data to Host memory
-  for (int i = 0;i < args.size(); i++) {
-    PrintIndent(stream, indent);
-    stream << "q.enqueueReadBuffer(buffer_" << i;
-    stream << ", CL_TRUE, 0, vector_size_bytes_" << i;
-    stream << ", source_" << i << ".data());\n";
-  }
-  stream << "\n";
-
-  // copy to shared mem
-  for (int i = 0;i < args.size();i++) {
-    if (args[i].type_code() == kArrayHandle) {
-      TVMArray* arr = args[i];
-      PrintCopyBack(arr, stream, indent, i);
-      PrintIndent(stream, indent);
-      stream << "shmdt(";
-      stream << "arg_" << i << ");\n";
-    }
-  }
-
-  stream << "}\n";
-  stream.close();
-}
-} // namespace
-
-
-class SDAccelModuleNode final : public ModuleNode {
- public:
-  SDAccelModuleNode(LoweredFunc func, std::string test_file) 
-    : func_(func), test_file_(test_file) {}
-
-  const char* type_key() const {
-    return "sdaccel_sw_emu";
-  }
-
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
-    return PackedFunc([this](TVMArgs args, TVMRetValue* rv){
-
-        if (args.size() != (int)func_->args.size())
-          LOG(FATAL) << "The function should take in " << func_->args.size() 
-                     << " inputs but get " << args.size();
-        std::vector<size_t> arg_sizes;
-        std::vector<TVMType> arg_types;
-        std::vector<int> shmids;
-        CollectArgInfo(args, func_, arg_sizes, arg_types);
-        GenSharedMem(args, shmids, arg_sizes);
-        LOG(CLEAN) << "Creating a Host file for SDAccel Runtime ...";
-        GenHostCode(args, shmids, arg_types, func_, test_file_);
-
-        LOG(CLEAN) << "Creating a Common folder for common.mk ...";
-        system("mkdir common");
-        GenCommonFile();
-
-        LOG(CLEAN) << "Creating a Makfile for compling the SDAccel OpenCL Code ...";
-        GenMakFile();
-        // TODO: find a better way to do the following
-        LOG(CLEAN) << "Compiling the generated SDAccel OpenCL Code ...";
-        // system("make -f ./sdaccel.mk run_cpu_em");
-        LOG(CLEAN) << "Running SDAccel OpenCL Software Simulation ...";
-        LOG(CLEAN) << "Finished SDAccel OpenCL Software Simulation ...";
-        // system("make -f sdaccel.mk cleanall");
-        FreeSharedMem(args, shmids, arg_sizes);
-      });
-  }
-
- private:
-  LoweredFunc func_;
-  std::string test_file_;
-};
-
-Module CreateSDAccelModule(LoweredFunc func,
-                           std::string code) {
-  std::shared_ptr<SDAccelModuleNode> n =
-    std::make_shared<SDAccelModuleNode>(func, code);
-
-  return Module(n);
-}
-
-} // namespace runtime
-} // namespace TVM
diff --git a/tvm/src/codegen/opencl/sdaccel_module.h b/tvm/src/codegen/opencl/sdaccel_module.h
deleted file mode 100644
index 01f361dba..000000000
--- a/tvm/src/codegen/opencl/sdaccel_module.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef SDACCEL_MODULE_H
-#define SDACCEL_MODULE_H
-
-#include <tvm/runtime/module.h>
-#include <tvm/runtime/packed_func.h>
-#include "../build_common.h"
-
-namespace TVM {
-namespace runtime {
-
-Module CreateSDAccelModule(
-    LoweredFunc func,
-    std::string code);
-
-} // namespace runtime
-} // namespace TVM
-
-#endif
diff --git a/tvm/src/codegen/ppac/build_rv64_ppac.cc b/tvm/src/codegen/ppac/build_rv64_ppac.cc
deleted file mode 100644
index c14a1cdf3..000000000
--- a/tvm/src/codegen/ppac/build_rv64_ppac.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * \file build_rv64_ppac.cc
- */
-
-#include "./codegen_rv64_ppac.h"
-#include "../build_common.h"
-
-namespace TVM{
-namespace codegen{
-
-std::string BuildRV64PPAC(Array<LoweredFunc> funcs) {
-    CodeAnalysMerlinC ca;
-    CodeGenRV64PPAC cg;
-    for (LoweredFunc f: funcs) {
-        ca.AddFunction(f);
-        str2tupleMap<std::string, Type> map_arg_type;
-        map_arg_type = ca.Finish();
-        cg.AddFunction(f, map_arg_type);
-    }
-    std::string code = cg.Finish();
-
-    LOG(WARNING) << "RV64_PPAC backend doesn't have runtime, return kernel code";
-    return code;
-}
-
-TVM_REGISTER_API("codegen.build_rv64_ppac")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildRV64PPAC(args[0]);
-  });
-
-}  // namespace codegen
-}  // namespace TVM
\ No newline at end of file
diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.cc b/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
deleted file mode 100644
index 1fd5e2b6e..000000000
--- a/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * \file codegen_rv64_ppac.cc
- */
- 
-#include <tvm/build_module.h>
-#include <tvm/ir_pass.h>
-#include <vector>
-#include <string>
-#include <regex>
-#include <fstream>
-#include <sys/types.h>
-#include "./codegen_rv64_ppac.h"
-#include "../build_common.h"
-
-namespace TVM {
-namespace codegen {
-
-void CodeGenRV64PPAC::AddFunction(LoweredFunc f, 
-        str2tupleMap<std::string, Type> map_arg_type) {
-  // Clear previous generated state
-  this->InitFuncState(f);
-  // Register alloc buffer type
-  for (const auto & kv : f->handle_data_type) {
-    RegisterHandleType(kv.first.get(), kv.second.type());
-  }
-  // Write entry function name
-  this->stream << "void " << f->name << "(";
-  // Write arguments
-  for (size_t i = 0; i < f->args.size(); ++i) {
-    Var v = f->args[i];
-    std::string vid = AllocVarID(v.get());
-    if (i != 0) this->stream << ", ";
-    if (map_arg_type.find(vid) == map_arg_type.end()) {
-      LOG(WARNING) << vid << " type not found\n";
-      PrintType(v.type(), this->stream);
-      this->stream << ' ' << vid;
-    }
-    else {
-      auto arg = map_arg_type[vid];
-      PrintType(std::get<1>(arg), this->stream);
-      this->stream << "*"; 
-      this->stream << ' ' << std::get<0>(arg);
-    }
-  }
-  stream << ") {\n";
-  int func_scope = this->BeginScope();
-  this->PrintStmt(f->body);
-  this->EndScope(func_scope);
-  this->PrintIndent();
-  this->stream << "}\n\n";
-}
-
-void CodeGenRV64PPAC::VisitStmt_(const For* op) {
-  std::string func_name;
-  bool is_ppac_func = false;
-  uint8_t i = 0;
-  for (auto key: op->annotate_keys) {
-    if (auto str = key.as<StringImm>()) {
-      if (str->value == "_ppac_func_name") {
-        auto name = op->annotate_values[i].as<StringImm>();
-        func_name = name->value;
-        is_ppac_func = true;
-        break;
-      }
-    }
-    ++i;
-  }
-  if (is_ppac_func) {
-    // scan along the annotate list to find parameters
-    std::string ret, arg0, arg1;
-    int batch_num, in_block_num, out_channel_num;
-    i = 0;
-    uint8_t param_num = 0;
-    for (auto key: op->annotate_keys) {
-      if (auto str = key.as<StringImm>()) {
-        if (str->value == "_ret") {
-          auto v = op->annotate_values[i].as<StringImm>();
-          ret = v->value;
-          ++param_num;         
-        } else if (str->value == "_arg0") {
-          auto v = op->annotate_values[i].as<StringImm>();
-          arg0 = v->value;
-          ++param_num;         
-        } else if (str->value == "_arg1") {
-          auto v = op->annotate_values[i].as<StringImm>();
-          arg1 = v->value;
-          ++param_num;         
-        } else if (str->value == "_batch_num") {
-          auto v = op->annotate_values[i].as<IntImm>();
-          batch_num = v->value;
-          ++param_num;
-        } else if (str->value == "_in_block_num") {
-          auto v = op->annotate_values[i].as<IntImm>();
-          in_block_num = v->value;
-          ++param_num;
-        } else if (str->value == "_out_channel_num") {
-          auto v = op->annotate_values[i].as<IntImm>();
-          out_channel_num = v->value;
-          ++param_num;
-        }
-      }
-      ++i;
-    }
-    if (param_num != 6) {
-      LOG(FATAL) << "PPAC function call need exactly 6 parameters but found " << param_num;
-    }
-    // print ppac function call
-    PrintIndent();
-    stream << func_name << "(" 
-           << ret << ", "
-           << arg0 << ", " 
-           << arg1 << ", "
-           << batch_num << ", " 
-           << in_block_num << ", "
-           << out_channel_num 
-           << ");\n";
-    return;
-  }
-  CodeGenC::VisitStmt_(op);
-}
-
-void CodeGenRV64PPAC::VisitStmt_(const LetStmt* op) {
-  std::string value = PrintExpr(op->value);
-  // Skip the argument retrieving assign statement
-  std::string vid = AllocVarID(op->var.get());
-  if (op->var.type() != Handle() &&
-      value.find("TVMArray") == std::string::npos &&
-      value.find("arg") != 0) {
-    PrintIndent();
-    PrintType(op->var.type(), this->stream);
-    this->stream << ' '
-                 << vid
-                 << " = " << value << ";\n";
-  }
-  PrintStmt(op->body);
-}
-
-void CodeGenRV64PPAC::VisitStmt_(const IfThenElse* op) {
-  std::string cond = PrintExpr(op->condition);
-  // Skip the buffer data checking
-  if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)")))
-      return ;
-  PrintIndent();
-  if (cond[0] == '(' && cond[cond.length() - 1] == ')') {
-    stream << "if " << cond << " {\n";
-  } else {
-    stream << "if (" << cond << ") {\n";
-  }
-  int then_scope = BeginScope();
-  PrintStmt(op->then_case);
-  this->EndScope(then_scope);
-  if (op->else_case.defined()) {
-    PrintIndent();
-    stream << "} else {\n";
-    int else_scope = BeginScope();
-    PrintStmt(op->else_case);
-    this->EndScope(else_scope);
-  }
-  PrintIndent();
-  stream << "}\n";
-}
-
-void CodeGenRV64PPAC::PrintType(Type t, std::ostream& os) {
-  CHECK_EQ(t.lanes(), 1)
-      << "do not support vector types";
-  if (t.is_uint() || t.is_int()) {
-    if (t.is_uint())  {
-      if (t.bits() <= 8) {
-        os << "uint8_t"; return;
-      } else if (t.bits() <= 16) {
-        os << "uint16_t"; return;
-      } else if (t.bits() <= 32) {
-        os << "uint32_t"; return;
-      } else if (t.bits() <= 64) {
-        os << "uint64_t"; return;
-      } else {
-        LOG(WARNING) << "Casting type " << t << " to uint64_t";
-        os << "uint64_t"; 
-        return;
-      }
-    }
-    else if (t.is_int()) {
-      if (t.bits() <= 8) {
-        os << "int8_t"; return;
-      } else if (t.bits() <= 16) {
-        os << "int16_t"; return;
-      } else if (t.bits() <= 32) {
-        os << "int32_t"; return;
-      } else if (t.bits() <= 64) {
-        os << "int64_t"; return;
-      } else {
-        LOG(WARNING) << "Casting type " << t << " to int64_t";
-        os << "int64_t"; 
-        return;
-      }
-    }
-  }
-  os << t;
-}
-
-} //namespace codegen
-} //namespace TVM
\ No newline at end of file
diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.h b/tvm/src/codegen/ppac/codegen_rv64_ppac.h
deleted file mode 100644
index 881bdea05..000000000
--- a/tvm/src/codegen/ppac/codegen_rv64_ppac.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * \file codegen_rv64_ppac.h
- */
- 
-#ifndef TVM_CODEGEN_CODEGEN_RV64_PPAC_H_
-#define TVM_CODEGEN_CODEGEN_RV64_PPAC_H_
-
-#include <tvm/codegen.h>
-#include <string>
-#include "../codegen_c.h"
-#include "../merlinc/codeanalys_merlinc.h"
-
-namespace TVM {
-namespace codegen {
-
-class CodeGenRV64PPAC : public CodeGenC {
-  public:
-    void AddFunction(LoweredFunc f, str2tupleMap<std::string, Type> map_arg_type);
-    void PrintType(Type t, std::ostream& os) override;
-    void VisitStmt_(const LetStmt* op) override;
-    void VisitStmt_(const IfThenElse* op) override;
-    void VisitStmt_(const For* op) override;
-};
-
-}  // namespace codegen
-}  // namespace TVM
-
-#endif   //TVM_CODEGEN_CODEGEN_RV64_PPAC_H_
\ No newline at end of file
diff --git a/tvm/src/lang/ir.cc b/tvm/src/lang/ir.cc
index c88f8ea94..3589de195 100644
--- a/tvm/src/lang/ir.cc
+++ b/tvm/src/lang/ir.cc
@@ -149,8 +149,6 @@ TVM_REGISTER_NODE_TYPE(Quantize);
 TVM_REGISTER_NODE_TYPE(KernelDef);
 TVM_REGISTER_NODE_TYPE(KernelExpr);
 TVM_REGISTER_NODE_TYPE(KernelStmt);
-TVM_REGISTER_NODE_TYPE(StreamStmt);
-TVM_REGISTER_NODE_TYPE(StreamExpr);
 TVM_REGISTER_NODE_TYPE(Return);
 TVM_REGISTER_NODE_TYPE(Break);
 TVM_REGISTER_NODE_TYPE(While);
diff --git a/tvm/src/pass/ir_mutator.cc b/tvm/src/pass/ir_mutator.cc
index 89485e723..ec67aa314 100644
--- a/tvm/src/pass/ir_mutator.cc
+++ b/tvm/src/pass/ir_mutator.cc
@@ -202,15 +202,6 @@ Stmt IRMutator::Mutate_(const Store *op, const Stmt& s) {
   }
 }
 
-Stmt IRMutator::Mutate_(const StreamStmt *op, const Stmt& s) {
-  Expr value = this->Mutate(op->value);
-  if (value.same_as(op->value)) {
-    return s;
-  } else {
-    return StreamStmt::make(op->buffer_var, value, op->stream_type, op->depth);
-  }
-}
-
 Stmt IRMutator::Mutate_(const Provide* op, const Stmt& s) {
   auto new_args = MutateArray(op->args, this);
   auto new_value = this->Mutate(op->value);
@@ -330,8 +321,7 @@ Stmt IRMutator::Mutate_(const KernelDef *op, const Stmt &s) {
   if (body.same_as(op->body) && ret_void.same_as(op->ret_void)) {
     return s;
   } else {
-    return KernelDef::make(op->args, op->api_args, op->api_types,
-                           body, ret_void, op->ret_type, op->name, op->channels);
+    return KernelDef::make(op->args, body, ret_void, op->ret_type, op->name);
   }
 }
 
@@ -412,7 +402,6 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt)
 .DISPATCH_TO_MUTATE_STMT(Prefetch)
 .DISPATCH_TO_MUTATE_STMT(KernelDef)
 .DISPATCH_TO_MUTATE_STMT(KernelStmt)
-.DISPATCH_TO_MUTATE_STMT(StreamStmt)
 .DISPATCH_TO_MUTATE_STMT(Return)
 .DISPATCH_TO_MUTATE_STMT(Break)
 .DISPATCH_TO_MUTATE_STMT(While)
@@ -441,10 +430,6 @@ Expr IRMutator::Mutate_(const Load *op, const Expr& e) {
   }
 }
 
-Expr IRMutator::Mutate_(const StreamExpr *op, const Expr& e) {
-  return e;
-}
-
 Expr IRMutator::Mutate_(const Let *op, const Expr& e) {
   Expr value = this->Mutate(op->value);
   Expr body = this->Mutate(op->body);
@@ -680,7 +665,6 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_expr)
 .DISPATCH_TO_MUTATE_EXPR(SetBit)
 .DISPATCH_TO_MUTATE_EXPR(SetSlice)
 .DISPATCH_TO_MUTATE_EXPR(Quantize)
-.DISPATCH_TO_MUTATE_EXPR(StreamExpr)
 .DISPATCH_TO_MUTATE_EXPR(KernelExpr);
 
 }  // namespace ir
diff --git a/tvm/src/pass/ir_visitor.cc b/tvm/src/pass/ir_visitor.cc
index 6346c6262..160cb906e 100644
--- a/tvm/src/pass/ir_visitor.cc
+++ b/tvm/src/pass/ir_visitor.cc
@@ -252,13 +252,6 @@ void IRVisitor::Visit_(const KernelStmt *op) {
   }
 }
 
-void IRVisitor::Visit_(const StreamStmt *op) {
-  this->Visit(op->value);
-}
-
-void IRVisitor::Visit_(const StreamExpr *op) {
-}
-
 void IRVisitor::Visit_(const Return *op) {
   this->Visit(op->value);
 }
@@ -345,8 +338,6 @@ TVM_STATIC_IR_FUNCTOR(IRVisitor, vtable)
 .DISPATCH_TO_VISIT(KernelDef)
 .DISPATCH_TO_VISIT(KernelExpr)
 .DISPATCH_TO_VISIT(KernelStmt)
-.DISPATCH_TO_VISIT(StreamStmt)
-.DISPATCH_TO_VISIT(StreamExpr)
 .DISPATCH_TO_VISIT(Return)
 .DISPATCH_TO_VISIT(Break)
 .DISPATCH_TO_VISIT(While)
diff --git a/tvm/src/pass/split_host_device.cc b/tvm/src/pass/split_host_device.cc
index fdcd0c56f..534e0b695 100644
--- a/tvm/src/pass/split_host_device.cc
+++ b/tvm/src/pass/split_host_device.cc
@@ -81,14 +81,6 @@ class IRUseDefAnalysis : public IRMutator {
     return IRMutator::Mutate_(op, s);
   }
 
-  Stmt Mutate_(const StreamStmt *op, const Stmt& s) final {
-    if (!def_count_.count(op->buffer_var.get())) {
-      def_count_[op->buffer_var.get()] = 0;
-      use_count_[op->buffer_var.get()] = 0;
-    }
-    return IRMutator::Mutate_(op, s);
-  }
-
   Expr Mutate_(const Let *op, const Expr& e) final {
     this->HandleDef(op->var.get());
     Expr body = this->Mutate(op->body);
@@ -117,14 +109,6 @@ class IRUseDefAnalysis : public IRMutator {
     return IRMutator::Mutate_(op, e);
   }
 
-  Expr Mutate_(const StreamExpr *op, const Expr& e) final {
-    if (!def_count_.count(op->buffer_var.get())) {
-      def_count_[op->buffer_var.get()] = 0;
-      use_count_[op->buffer_var.get()] = 0;
-    }
-    return IRMutator::Mutate_(op, e);
-  }
-
   Stmt Mutate_(const KernelDef *op, const Stmt& s) {
     for (auto arg : op->args) {
       this->HandleDef(arg.get());
diff --git a/tvm/src/pass/stream_inference.cc b/tvm/src/pass/stream_inference.cc
deleted file mode 100644
index ec18b1871..000000000
--- a/tvm/src/pass/stream_inference.cc
+++ /dev/null
@@ -1,345 +0,0 @@
-/*!
- *  Copyright (c) 2019 by Contributors
- * \file remove_no_op.cc
- * \brief Remove no op from the stmt
- */
-#include <tvm/ir.h>
-#include <tvm/ir_pass.h>
-#include <tvm/ir_mutator.h>
-#include <unordered_map>
-
-namespace TVM {
-namespace ir {
-
-// use/def analysis to capture host xcel deps 
-class StreamUseDefAnalysis : public IRMutator {
- public:
-  Stmt Mutate_(const AttrStmt *op, const Stmt& s) final {
-    if (op->attr_key == attr::device_scope) {
-      if (op->value.as<StringImm>()->value == "fpga")
-        host_scope_ = false;
-      return IRMutator::Mutate_(op, s);
-    } else {
-      return IRMutator::Mutate_(op, s);
-    }
-  }
-
-  Stmt Mutate_(const LetStmt *op, const Stmt& s) final {
-    this->HandleDef(op->var.get());
-    Stmt body = this->Mutate(op->body);
-    Expr value = this->Mutate(op->value);
-    if (body.same_as(op->body) &&
-        value.same_as(op->value)) {
-      return s;
-    } else {
-      return LetStmt::make(op->var, value, body);
-    }
-  }
-
-  Stmt Mutate_(const For *op, const Stmt& s) final {
-    this->HandleDef(op->loop_var.get());
-    return IRMutator::Mutate_(op, s);
-  }
-
-  Stmt Mutate_(const Allocate *op, const Stmt& s) final {
-    this->HandleDef(op->buffer_var.get());
-    return IRMutator::Mutate_(op, s);
-  }
-
-  Stmt Mutate_(const Store *op, const Stmt& s) final {
-    this->HandleUse(op->buffer_var);
-    return IRMutator::Mutate_(op, s);
-  }
-
-  Stmt Mutate_(const StreamStmt *op, const Stmt& s) final {
-    this->HandleUse(op->buffer_var);
-    return IRMutator::Mutate_(op, s);
-  }
-
-  Expr Mutate_(const Let *op, const Expr& e) final {
-    this->HandleDef(op->var.get());
-    Expr body = this->Mutate(op->body);
-    Expr value = this->Mutate(op->value);
-    if (body.same_as(op->body) &&
-        value.same_as(op->value)) {
-      return e;
-    } else {
-      return Let::make(op->var, value, body);
-    }
-  }
-
-  Expr Mutate_(const Variable *op, const Expr& e) final {
-    this->HandleUse(e);
-    return IRMutator::Mutate_(op, e);
-  }
-
-  Expr Mutate_(const Load *op, const Expr& e) final {
-    this->HandleUse(op->buffer_var);
-    return IRMutator::Mutate_(op, e);
-  }
-
-  Expr Mutate_(const StreamExpr *op, const Expr& e) final {
-    this->HandleUse(op->buffer_var);
-    return IRMutator::Mutate_(op, e);
-  }
-
-  Stmt Mutate_(const KernelDef *op, const Stmt& s) {
-    for (auto arg : op->args) {
-      this->HandleDef(arg.get());
-    }
-    Stmt body = this->Mutate(op->body);
-    for (auto arg : op->args) {
-      xcel_def_count_[arg.get()] = 0;
-    }
-    return s;
-  }
-
-  void HandleDef(const Variable* v) {
-    if (host_scope_) {
-      CHECK(!host_def_count_.count(v))
-          << "variable " << v->name_hint
-          << " has already been defined, the Stmt is not SSA";
-      CHECK(!host_use_count_.count(v))
-          << "variable " << v->name_hint
-          << " has been used before definition!";
-      host_use_count_[v] = 0;
-      host_def_count_[v] = 1;
-    } else {
-      CHECK(!xcel_def_count_.count(v))
-          << "variable " << v->name_hint
-          << " has already been defined, the Stmt is not SSA";
-      CHECK(!xcel_use_count_.count(v))
-          << "variable " << v->name_hint
-          << " has been used before definition!";
-      xcel_use_count_[v] = 0;
-      xcel_def_count_[v] = 1;
-    }
-  }
-
-  void HandleUse(const Expr& v) {
-    CHECK(v.as<Variable>());
-    Var var(v.node_);
-    if (host_scope_) {
-      auto it = host_use_count_.find(var.get());
-      if (it != host_use_count_.end()) {
-        if (it->second >= 0) {
-          ++it->second;
-        }
-      } else {
-        host_undefined_.push_back(var);
-        host_use_count_[var.get()] = -1;
-      }
-    } else {
-      auto it = xcel_use_count_.find(var.get());
-      if (it != xcel_use_count_.end()) {
-        if (it->second >= 0) {
-          ++it->second;
-        }
-      } else {
-        xcel_undefined_.push_back(var);
-        xcel_use_count_[var.get()] = -1;
-      }
-    }
-  }
-
-  bool host_scope_{true};
-  Array<Var> host_undefined_;
-  Array<Var> xcel_undefined_;
-  std::unordered_map<const Variable*, int> host_use_count_;
-  std::unordered_map<const Variable*, int> host_def_count_;
-  std::unordered_map<const Variable*, int> xcel_use_count_;
-  std::unordered_map<const Variable*, int> xcel_def_count_;
-};
-
-
-class StreamMutator : public IRMutator {
- public:
-  explicit StreamMutator(int bus_bandwidth) {
-    bus_bandwidth_ = bus_bandwidth;
-  }
-  // move device attr to allocate level
-  Stmt Mutate_(const AttrStmt* op, const Stmt& s) final {
-    Stmt stmt = IRMutator::Mutate_(op, s);
-    // if (op->attr_key == attr::device_scope)
-    //   return stmt.as<AttrStmt>()->body;
-    return stmt;
-  }
-
-  Stmt Mutate_(const For* op, const Stmt& s) final {
-    Stmt stmt = IRMutator::Mutate_(op, s);
-    op = stmt.as<For>();
-    auto extent = op->extent.as<IntImm>()->value;
-    auto min = op->min.as<IntImm>()->value;
-    // mutate sender: split and block inner loop
-    if (auto stream_op = op->body.as<StreamStmt>()) {
-      if (extent - min > bus_bandwidth_) {
-        LOG(WARNING) << "large";
-      } else {
-      }
-    // mutate receiver : (StreamExpr + For(Store = GetSlice))
-    } else if (auto store_op = op->body.as<Store>()) {
-      if (store_op->value.as<StreamExpr>() == nullptr) return stmt;
-      if (extent - min > bus_bandwidth_) {
-        LOG(WARNING) << "large";
-      } else {
-        return stmt;
-        // allocate intermediate buffer
-        VarExpr new_var(store_op->buffer_var.get()->name_hint + "_save");
-        Expr new_load = Load::make(store_op->buffer_var.type(), new_var, 0, const_true());
-        Stmt new_store = Store::make(store_op->buffer_var, new_load,
-                                     store_op->index, store_op->predicate);
-        Stmt new_for = For::make(op->loop_var, op->min, op->extent, op->for_type,
-                                 op->device_api, new_store);
-        // save stream data into intermediate buffer
-        Stmt read_in = Store::make(new_var, store_op->value, 
-                                   Expr(0), const_true());
-        // allocate intermediate buffer
-        return Allocate::make(new_var, 
-                              store_op->value.type(),
-                              {make_const(Int(bus_bandwidth_), 1)}, 
-                              const_true(), Block::make(read_in, new_for));
-      }
-    }
-    return stmt;
-  }
-
-  Stmt Mutate_(const StreamStmt* op, const Stmt& s) final {
-    Stmt stmt = IRMutator::Mutate_(op, s);
-    op = stmt.as<StreamStmt>();
-    const Variable* v = op->buffer_var.get();
-    stream_type_map_[v] = op->buffer_var.type();
-    return stmt;
-  }
-
-  Expr Mutate_(const StreamExpr* op, const Expr& e) final {
-    Expr expr = IRMutator::Mutate_(op, e);
-    op = expr.as<StreamExpr>();
-    const Variable* v = op->buffer_var.get();
-    stream_type_map_[v] = op->buffer_var.type();
-    return expr;
-  }
- private:
-   int bus_bandwidth_;
-   bool is_host_{true}; 
-   std::unordered_map<const Variable*, Type> stream_type_map_;
-};
-
-// Mark the statment scope of each stage.
-class StreamInferer : public IRMutator {
- public:
-  explicit StreamInferer(int bus_bandwidth) {
-    bus_bandwidth_ = bus_bandwidth;
-  }
-
-  Stmt Mutate_(const Allocate* op, const Stmt& s) final {
-    Stmt stmt = IRMutator::Mutate_(op, s);
-    op = stmt.as<Allocate>();
-    if (auto block = op->body.as<Block>()) {
-      if (auto producer = block->first.as<ProducerConsumer>()){
-        if (const AttrStmt* attr_stmt = producer->body.as<AttrStmt>()) {
-          if (const AttrStmt* device_attr = attr_stmt->body.as<AttrStmt>()) {
-            if (device_attr->attr_key == attr::device_scope) {
-              // mutate allocate body
-              StreamMutator mutator(bus_bandwidth_);
-              // allocate stream for host 
-              Stmt new_body = mutator.Mutate(op->body);
-              Stmt new_stmt = Allocate::make(op->buffer_var,
-                                             op->type,
-                                             op->extents,
-                                             op->condition,
-                                             new_body);
-              return AttrStmt::make(device_attr->node,
-                                    attr::device_scope,
-                                    device_attr->value,
-                                    new_stmt);
-            }
-          }
-        }
-      }
-    }
-    return stmt;
-  }
-
-  // Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) final {
-  //   Stmt stmt = IRMutator::Mutate_(op, s);
-  //   op = stmt.as<ProducerConsumer>();
-  //   return is_no_op(op->body) ? op->body : stmt;
-  // }
-
-  // Stmt Mutate_(const Store* op, const Stmt& s) final {
-  //   Stmt stmt = IRMutator::Mutate_(op, s);
-  //   op = stmt.as<Store>();
-  //   auto it = var_remap_.find(op->buffer_var.get());
-  //   if (it != var_remap_.end() &&
-  //       !it->second.same_as(op->buffer_var)) {
-  //     CHECK(it->second.as<Variable>());
-  //     VarExpr buf_var(it->second.node_);
-  //     if (has_stencil_) outputs_.insert(buf_var);
-  //     return Store::make(buf_var, op->value, op->index, op->predicate);
-  //   } else {
-  //     return stmt;
-  //   }
-  // }
-
-  // Stmt Mutate_(const AttrStmt* op, const Stmt& s) final {
-  //   if (op->attr_key == attr::realize_scope) {
-  //     storage_scope_[op->node.get()] = op->value.as<StringImm>()->value;
-  //     return this->Mutate(op->body);
-  //   } else if (op->attr_key == attr::double_buffer_scope) {
-  //     Operation func(op->node.node_);
-  //     Stmt body = Mutate(op->body);
-  //     for (int i = 0; i < func->num_outputs(); ++i) {
-  //       TensorKey key{func, i};
-  //       auto it = buf_map_.find(key);
-  //       CHECK(it != buf_map_.end())
-  //           << "Cannot find allocated buffer for " << key.f;
-  //       body = AttrStmt::make(
-  //           it->second.buffer->data, op->attr_key, op->value, body);
-  //     }
-  //     return body;
-  //   } else if (op->attr_key == attr::thread_extent) {
-  //     IterVar iv(op->node.node_);
-  //     ThreadScope ts = ThreadScope::make(iv->thread_tag);
-  //     curr_thread_scope_.push_back(ts);
-  //     Stmt stmt = IRMutator::Mutate_(op, s);
-  //     curr_thread_scope_.pop_back();
-  //     return stmt;
-  //   } else if (op->attr_key == attr::buffer_bind_scope) {
-
-  // Stmt Mutate_(const For* op, const Stmt& s) final {
-  //   Stmt stmt = IRMutator::Mutate_(op, s);
-  //   op = stmt.as<For>();
-  //   return is_no_op(op->body) ? MakeEvaluate({op->min, op->extent}) : stmt;
-  // }
-
- private:
-  int bus_bandwidth_;
-  Stmt MakeEvaluate(Expr value) {
-    if (HasSideEffect(value)) {
-      return Evaluate::make(value);
-    } else {
-      return Evaluate::make(0);
-    }
-  }
-  Stmt MakeEvaluate(const Array<Expr>& values) {
-    Stmt stmt;
-    for (Expr e : values) {
-      if (HasSideEffect(e)) {
-        if (stmt.defined()) {
-          stmt = Block::make(stmt, Evaluate::make(e));
-        } else {
-          stmt = Evaluate::make(e);
-        }
-      }
-    }
-    return stmt.defined() ? stmt : Evaluate::make(0);
-  }
-};
-
-Stmt InferStream(Stmt stmt, 
-                 int bus_bandwidth) {
-  return StreamInferer(bus_bandwidth).Mutate(stmt); 
-}
-
-}  // namespace ir
-}  // namespace TVM
diff --git a/tvm/src/schedule/compute_primitive.h b/tvm/src/schedule/compute_primitive.h
index e7167257c..e65885462 100644
--- a/tvm/src/schedule/compute_primitive.h
+++ b/tvm/src/schedule/compute_primitive.h
@@ -33,14 +33,6 @@ Stmt PerformComputeAt(Stmt& producer,
                       size_t& attach_level,
                       std::unordered_map<const Variable*, Expr>& sub);
 
-Stmt StreamFromProducer(Stmt& stmt, 
-                        Buffer& producer_buf, 
-                        ir::StreamType& type);
-
-Stmt StreamToConsumer(Stmt& stmt, 
-                      Buffer& producer_buf,
-                      ir::StreamType& type);
-
 Stmt UpdateIterVarAttr(Stmt& stmt,
                       const IterVar& var,
                       const IterVarAttrNode* node);
diff --git a/tvm/src/schedule/schedule_dataflow_rewrite.cc b/tvm/src/schedule/schedule_dataflow_rewrite.cc
index a7fc8ee72..b2bd520e7 100644
--- a/tvm/src/schedule/schedule_dataflow_rewrite.cc
+++ b/tvm/src/schedule/schedule_dataflow_rewrite.cc
@@ -8,7 +8,6 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
-#include <regex>
 #include <unordered_set>
 #include "./message_passing.h"
 #include "../pass/ir_util.h"
@@ -28,33 +27,12 @@ size_t FindNodeRef(ArrayNode* array_node, const T& v) {
   return array_node->data.size();
 }
 
-// The replacer of cache.
-class LoadReplacer : public ir::IRMutator {
- public:
-  explicit LoadReplacer(
-      const std::unordered_map<const Variable*, VarExpr>& vsub)
-      : vsub_(vsub) {}
-
-  Expr Mutate_(const Load* op, const Expr& e) {
-    const Variable* var = op->buffer_var.as<Variable>();
-    auto it = vsub_.find(var);
-    if (it != vsub_.end())
-      return Load::make(op->type, it->second, 
-                        op->index, op->predicate); 
-    return e;
-  }
-
- private:
-  const std::unordered_map<const Variable*, VarExpr>& vsub_;
-};
-
 // The replacer of cache.
 class VarReplacer : public ir::IRMutator {
  public:
   explicit VarReplacer(
       const std::unordered_map<const Variable*, Expr>& vsub)
       : vsub_(vsub) {}
-
   Expr Mutate_(const Variable* op, const Expr& e) {
     auto it = vsub_.find(op);
     if (it != vsub_.end()) return it->second;
@@ -65,17 +43,6 @@ class VarReplacer : public ir::IRMutator {
   const std::unordered_map<const Variable*, Expr>& vsub_;
 };
 
-// create indices for store 
-Expr getIndex(std::vector<Expr> indices, const Array<Expr> shape) {
-  Expr ret = indices[0];
-  Expr mul = 1;
-  for (size_t i = 1; i < indices.size(); i++) {
-    ret = Simplify(ret + indices[i] * mul);
-    mul = Simplify(mul * shape[i]);
-  }
-  return ret;
-}
-
 Expr InjectPredicate(const Array<Expr>& predicates,
                      Expr body) {
   using ir::Reduce;
@@ -107,120 +74,6 @@ void ReplaceDataFlow(const Array<Stage>& stages,
   }
 }
 
-class StreamConsumer final : public IRMutator {
-  public: 
-    StreamConsumer(
-        const std::string& target,
-        const ir::StreamType& type,
-        int channel_index) 
-      : target_(target), type_(type),
-        channel_index_(channel_index) {} 
-
-    Expr Mutate_(const Load* op, const Expr& e) {
-      Expr index = op->index;
-      std::string target_name = op->buffer_var.get()->name_hint;
-      if (target_ == target_name) {
-        Array<Expr> keys, values;
-        keys.push_back(StringImm::make("index"));
-        values.push_back(IntImm::make(Int(32), channel_index_));
-        return StreamExpr::make(op->type, op->buffer_var, 
-                                type_, 10, keys, values);
-      } else {
-        return Load::make(op->type, op->buffer_var, 
-                          index, op->predicate);
-      }
-   }
-
-  private:
-    const std::string target_;
-    const ir::StreamType type_;
-    const int channel_index_;
-};
-
-class StreamProducer final : public IRMutator {
-  public: 
-    StreamProducer(
-        const std::string& target,
-        const ir::StreamType& type,
-        int channel_index) 
-      : target_(target), type_(type),
-        channel_index_(channel_index) {} 
-
-    Stmt Mutate_(const Store* op, const Stmt& s) {
-      Expr index = op->index;
-      Expr value = this->Mutate(op->value);
-      std::string target_name = op->buffer_var.get()->name_hint;
-      if (target_name == target_) {
-        Array<Expr> keys, values;
-        keys.push_back(StringImm::make("index"));
-        values.push_back(IntImm::make(Int(32), channel_index_));
-        return StreamStmt::make(op->buffer_var, value, 
-                                type_, 10, keys, values); 
-      } else {
-        return Store::make(op->buffer_var, value, 
-                           index, op->predicate);
-      }
-    }
-
-  private:
-    const std::string target_;
-    const ir::StreamType type_;
-    const int channel_index_;
-};
-
-class KernelUpdater final : public IRMutator {
-  public: 
-    static int channelCount;
-    KernelUpdater(
-        const int arg_pos,
-        const ir::StreamType& type,
-        const bool is_producer,
-        const bool kernel_channel) 
-      : arg_pos_(arg_pos), type_(type), 
-        is_producer_(is_producer),
-        kernel_channel_(kernel_channel) {
-          if (kernel_channel_) channel_index_ = getIndex();
-    }
-
-    Stmt Mutate_(const KernelDef* op, const Stmt& s) {
-      Stmt stmt = op->body;
-      // arr saves arg_pos and common channel idx
-      Array<Expr> arr = op->channels;
-      CHECK(op->channels.size() % 2 == 0)
-        << "arg_pos, index pair number mismatch";
-      arr.push_back(IntImm::make(Int(32), arg_pos_));
-      arr.push_back(IntImm::make(Int(32), channel_index_));
-      std::string target_ = op->args[arg_pos_].get()->name_hint;
-      if (is_producer_) { // mutate target load
-        StreamProducer mutator(target_, type_, channel_index_); 
-        stmt = mutator.Mutate(stmt);
-      } else { // replace load consumer
-        StreamConsumer mutator(target_, type_, channel_index_);
-        stmt = mutator.Mutate(stmt);
-      }
-      // update kernel arg signature
-      return KernelDef::make(op->args, op->api_args, 
-                             op->api_types, stmt, op->ret_void,
-                             op->ret_type, op->name, arr);
-   }
-  private:
-    const int arg_pos_;
-    const ir::StreamType type_;
-    const bool is_producer_;
-    const bool kernel_channel_;
-    int channel_index_{0}; 
-    int getIndex() {
-      channelCount += 1; 
-      int channel_num = channelCount;
-      if (channelCount % 2 == 0) 
-        channel_num = channelCount - 1;
-      return channel_num;
-    }
-};
-
-// Initialize static channel count
-int KernelUpdater::channelCount = 0;
-
 class ParentStmtCollector final : public IRMutator {
   public:
     ParentStmtCollector(
@@ -264,369 +117,6 @@ class ParentStmtCollector final : public IRMutator {
     const IterVar& axis_;
 };
 
-// initialize static split bound
-int Schedule::split_bound = 0;
-
-// stream buffer data to kernel stage 
-void Schedule::to_stage(const Tensor& target,
-                        /*kernel def stage*/ Stage dest,
-                        /*position index*/int arg_pos,
-                        StreamType stream_type,
-                        int channel_depth,
-                        std::string name) {
-  Stage target_stage = (*this)[target];
-  Buffer target_buffer;
-
-  // target stage as kernel def operator 
-  if (const ExternOpNode* op = target_stage->op.as<ExternOpNode>()) {
-    target_buffer = op->output_placeholders[0];
-    // remove the receiver buffer (keep the device scope) 
-    const AttrStmt* attr = op->body.as<AttrStmt>();
-    Stmt scope_attr = AttrStmt::make(attr->node, attr->attr_key, 
-                                     attr->value, Evaluate::make(0));
-    target_stage->op = ExternOpNode::make(op->name,
-                                          "",
-                                          Array<IterVar>(),
-                                          op->inputs,
-                                          op->input_placeholders,
-                                          op->output_placeholders,
-                                          scope_attr);
-    // update dest stage body for data stream in 
-    const ExternOpNode* destOp = dest->op.as<ExternOpNode>();
-    KernelUpdater mutator(arg_pos, stream_type, 
-                          /*is producer*/false, 
-                          /*inter module channel*/false);
-    auto new_body = mutator.Mutate(destOp->body);
-    dest->op = ExternOpNode::make(destOp->name, destOp->tag,
-                                  destOp->axis, destOp->inputs,
-                                  destOp->input_placeholders,
-                                  Array<Buffer>(),
-                                  new_body);
-  }
-}
-
-// stream data between hardware modules  
-void Schedule::stream_to(const Tensor& target,
-                         Stage dest,
-                         Stage source,
-                         StreamType stream_type,
-                         int channel_depth, 
-                         std::string new_name) {
-  Stage target_stage = (*this)[target];
-  std::vector<Stage> consumers; 
-  size_t num_stage = (*this)->stages.size();
-  Buffer target_buffer;
-  std::unordered_map<Stage, int, NodeHash, NodeEqual> pos;
-  const ExternOpNode* destOp = dest->op.as<ExternOpNode>();
-  const ExternOpNode* srcOp = source->op.as<ExternOpNode>();
-
-  // update kernel def and scope 
-  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
-  bool is_placeholder = op ? true : false;
-  if (is_placeholder) {
-    for (size_t i = 0; i < num_stage; i++) {
-      Stage s = (*this)->stages[i];
-      // name matching to locate kernels 
-      if (const ExternOpNode* op = s->op.as<ExternOpNode>()) {
-        for (size_t j = 0; j < op->inputs.size(); j++) {
-          if (target == op->inputs[j]) {
-            target_buffer = op->input_placeholders[j];
-            consumers.push_back(s);
-            // record streamed data pos in kernel call
-            if (std::regex_match(op->name, 
-                    std::regex(destOp->name + "(\\d)")))
-              pos[dest] = j;
-            else if (std::regex_match(op->name, 
-                         std::regex(destOp->name + "(\\d)")))
-              pos[source] = j;
-            break;
-          }
-        }
-      }
-    }
-  } else { // only consumed by self stage 
-    const ExternOpNode* op = target_stage->op.as<ExternOpNode>();
-    target_buffer = op->output_placeholders[0];
-    consumers.push_back(target_stage);
-  }
-  // mutator (is_producer false, kernel_channel true)
-  KernelUpdater destMutator(0, //target_buffer->name, 
-                            stream_type, false, true);
-  // mutate kernel def and repalce lw / st 
-  dest->op = ExternOpNode::make(destOp->name,
-                                destOp->tag,
-                                destOp->axis,
-                                destOp->inputs,
-                                destOp->input_placeholders,
-                                Array<Buffer>(),
-                                destMutator.Mutate(destOp->body));
-  // mutator (is_producer true, kernel_channel true)
-  KernelUpdater srcMutator(0, //target_buffer->name,
-                           stream_type, true, true);
-  source->op = ExternOpNode::make(srcOp->name,
-                                  srcOp->tag,
-                                  srcOp->axis,
-                                  srcOp->inputs,
-                                  srcOp->input_placeholders,
-                                  Array<Buffer>(),
-                                  srcMutator.Mutate(srcOp->body));
-  // update kernel call ops
-  for (auto s : consumers) {
-    const ExternOpNode* op = s->op.as<ExternOpNode>();
-    Stmt body = AttrStmt::make(VarExpr(),
-                               "device_scope",
-                               StringImm::make("fpga"),
-                               op->body);
-    // not alloc buffer for kernel call
-    s->op = ExternOpNode::make(op->name,
-                               op->tag,
-                               op->axis,
-                               op->inputs,
-                               op->input_placeholders,
-                               Array<Buffer>(),
-                               body);
-  }
-}
-
-// move data to device
-Tensor Schedule::move_to(const Tensor& target,
-                         DeviceType device_type,
-                         StreamType stream_type,
-                         int channel_depth, 
-                         std::string new_name) {
-  Stage target_stage = (*this)[target];
-  std::vector<Stage> consumers; 
-  size_t num_stage = (*this)->stages.size();
-  size_t min_pos = num_stage;
-  ArrayNode* stages = (*this)->stages.CopyOnWrite();
-  Buffer target_buffer;
-
-  // create producer and consumer stages for placeholder
-  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
-  bool is_placeholder = op ? true : false;
-  if (is_placeholder) {
-    min_pos = 0;
-    for (size_t i = 0; i < num_stage; i++) {
-      Stage s = (*this)->stages[i];
-      if (const ExternOpNode* op = s->op.as<ExternOpNode>()) {
-        for (size_t j = 0; j < op->inputs.size(); j++) {
-          if (target == op->inputs[j]) {
-            target_buffer = op->input_placeholders[j];
-            consumers.push_back(s);
-            break;
-          }
-        }
-      }
-    }
-  } else { // move data generated by extern op 
-    min_pos = FindNodeRef(stages, target_stage) + 1;
-    const ExternOpNode* op = target_stage->op.as<ExternOpNode>();
-    target_buffer = op->output_placeholders[0];
-    for (size_t i = 0; i < num_stage; i++) {
-      Stage s = (*this)->stages[i];
-      if (const ExternOpNode* stage_op = s->op.as<ExternOpNode>()) {
-        for (size_t j = 0; j < stage_op->inputs.size(); j++) {
-          if (op->output_placeholders[0] == stage_op->input_placeholders[j]) {
-            consumers.push_back(s);
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  // create sender and write into streaming channel 
-  Array<Tensor> consumer_inputs;
-  Array<Buffer> consumer_input_placeholders;
-  Array<Buffer> consumer_output_placeholders;
-  std::string consumer_name = target_buffer->name + ".stream_send";
-  Buffer consumer_buffer = BufferNode::make(Var(consumer_name, Handle()),
-                                            target->dtype,
-                                            target->shape,
-                                            Array<Expr>(),
-                                            Expr(),
-                                            consumer_name,
-                                            "", 0, 0);
-  consumer_inputs.push_back(target);
-  consumer_input_placeholders.push_back(target_buffer);
-  consumer_output_placeholders.push_back(consumer_buffer);
-
-  // create statement index
-  std::vector<Expr> csm_indices;
-  std::vector<VarExpr> csm_loop_vars;
-  for (size_t i = 0; i < target->shape.size(); i++) {
-    VarExpr iter(target_buffer->name + std::to_string(i));
-    csm_indices.push_back(iter);
-    csm_loop_vars.push_back(iter);
-  }
-  Expr csm_index = getIndex(csm_indices, target->shape); 
-  Expr load_expr = Load::make(target->dtype,
-                              target_buffer->data, 
-                              csm_index, 
-                              UIntImm::make(UInt(1), 1));
-  Stmt consumer_body = StreamStmt::make(consumer_buffer->data,
-                                        load_expr,
-                                        stream_type,
-                                        channel_depth);
-
-  Expr sender_scope, receiver_scope; 
-  size_t consumer_pos = min_pos;
-  switch (device_type) {
-    case DeviceType::CPU:
-      consumer_pos = num_stage; 
-      sender_scope = StringImm::make("fpga");
-      receiver_scope = StringImm::make("cpu");
-      break;
-    case DeviceType::FPGA:
-      sender_scope = StringImm::make("cpu");
-      receiver_scope = StringImm::make("fpga");
-      break;
-    case DeviceType::GPU:
-      sender_scope = StringImm::make("cpu");
-      receiver_scope = StringImm::make("gpu");
-      break;
-  }
-  
-  for (size_t j = 0; j < target->shape.size(); j++) {
-    consumer_body = For::make(
-      VarExpr(csm_loop_vars[j]),
-      0, target->shape[j],
-      ForType::Serial,
-      DeviceAPI::None,
-      consumer_body);
-  }
-
-  consumer_body = AttrStmt::make(
-      consumer_buffer->data,
-      "device_scope", sender_scope, consumer_body);
-
-  // create new stage and return stream tensors 
-  // auto n = std::make_shared<ExternOpNode>();
-  // n->name = consumer_name;
-  // n->body = consumer_body; 
-  // n->inputs = consumer_inputs;
-  // n->input_placeholders = consumer_input_placeholders;
-  // n->output_placeholders = consumer_output_placeholders;
-  // Operation consumer_op(n);
-
-  Operation consumer_op = ExternOpNode::make(consumer_name, 
-                                             "",
-                                             Array<IterVar>(),
-                                             consumer_inputs,
-                                             consumer_input_placeholders,
-                                             consumer_output_placeholders,
-                                             consumer_body);
-  Stage consumer_stage = Stage(consumer_op);
-  // insert sender before bound for (host,xcel <- host) case
-  if (device_type == DeviceType::FPGA) {
-    if (split_bound == 0) {
-      split_bound = consumer_pos + 1;
-    } else { // insert host sender before bound
-      consumer_pos = split_bound;
-      split_bound += 1;
-    }
-  }
-  stages->data.insert(stages->data.begin() + consumer_pos, consumer_stage.node_);
-  (*this)->stage_map.Set(consumer_op, consumer_stage);
-
-  // build producer (receiver) stage which takes in data from streaming
-  // channel and provide data to orginal consumers
-  Array<Tensor> producer_inputs;
-  Array<Buffer> producer_input_placeholders;
-  Array<Buffer> producer_output_placeholders;
-  std::string producer_name = target_buffer->name + ".stream_recv";
-  Buffer producer_buffer = BufferNode::make(Var(producer_name, Handle()),
-                                            target->dtype,
-                                            target->shape,
-                                            Array<Expr>(),
-                                            Expr(),
-                                            producer_name,
-                                            "", 0, 0);
-  // producer_inputs.push_back(consumer_op.output(0));
-  // producer_input_placeholders.push_back(consumer_buffer);
-  producer_output_placeholders.push_back(producer_buffer);
-  // streaming producer tensor reading from placeholder 
-  Expr stream = StreamExpr::make(target->dtype,
-                                 consumer_buffer->data,
-                                 stream_type,
-                                 channel_depth);
-  // create for loops for tensor init
-  std::vector<Expr> indices;
-  std::vector<VarExpr> loop_vars;
-  for (size_t i = 0; i < target->shape.size(); i++) {
-    VarExpr iter(target_buffer->name + std::to_string(i));
-    indices.push_back(iter);
-    loop_vars.push_back(iter);
-  }
-  Expr index = getIndex(indices, target->shape); 
-  // store op initialized with variable node
-  Stmt for_stmt = Store::make(producer_buffer->data,
-                              stream, index,
-                              UIntImm::make(UInt(1), 1));
-  for (size_t j = 0; j < target->shape.size(); j++) {
-    for_stmt = For::make(
-      VarExpr(loop_vars[j]),
-      0, target->shape[j],
-      ForType::Serial,
-      DeviceAPI::None,
-      for_stmt);
-  }
-
-  // attr annotates new scope
-  Stmt body = AttrStmt::make(
-      target_buffer->data,
-      "device_scope", receiver_scope, for_stmt);
-  Tensor producer = ExternOpNode::make(producer_buffer->name, 
-                                       "",
-                                       Array<IterVar>(),
-                                       producer_inputs,
-                                       producer_input_placeholders,
-                                       producer_output_placeholders,
-                                       body).output(0);
-
-  // recv stage creation + return tensor 
-  Stage producer_stage = Stage(producer->op);
-  size_t pos = FindNodeRef(stages, consumer_stage);
-  if (split_bound == 0 || device_type == DeviceType::CPU) 
-    pos = pos + 1;
-  else pos = split_bound + 1; 
-  stages->data.insert(stages->data.begin() + pos, producer_stage.node_);
-  (*this)->stage_map.Set(producer->op, producer_stage);
-
-  // update consumer stages with new tensor and buffer
-  std::unordered_map<const Variable*, VarExpr> vsub;
-  vsub[target_buffer->data.as<Variable>()] = producer_buffer->data;
-  for (size_t i = 0; i < consumers.size(); i++) {
-    Stage s = consumers[i];
-    Array<Tensor> new_inputs;
-    Array<Buffer> new_input_placeholders;
-    const ExternOpNode* op = s->op.as<ExternOpNode>();
-    new_inputs.push_back(producer);
-    new_input_placeholders.push_back(producer_buffer);
-    for (size_t j = 0; j < op->inputs.size(); j++) {
-      if (target != op->inputs[j]) {
-        new_inputs.push_back(op->inputs[j]);
-        new_input_placeholders.push_back(op->input_placeholders[j]);
-      }
-    }
-    Stmt body = LoadReplacer(vsub).Mutate(op->body);
-    Stmt new_body = AttrStmt::make(
-        target_buffer->data,
-        "device_scope",
-        receiver_scope,
-        op->body);
-    s->op = ExternOpNode::make(
-        op->name,
-        op->tag,
-        op->axis,
-        new_inputs,
-        new_input_placeholders,
-        op->output_placeholders,
-        body);
-  }
-  return producer;
-}
-
 Tensor Schedule::reuse_at(const Tensor& target,
                           Stage parent,
                           IterVar axis,
diff --git a/tvm/src/schedule/schedule_ops.cc b/tvm/src/schedule/schedule_ops.cc
index 8156844f5..b4f8e7468 100644
--- a/tvm/src/schedule/schedule_ops.cc
+++ b/tvm/src/schedule/schedule_ops.cc
@@ -349,7 +349,7 @@ Stmt ScheduleOps(
         << "call schedule.normalize before scheduleops";
     CHECK(s->op.defined());
     // no need to specify place holder op.
-    if (auto op = s->op.as<PlaceholderOpNode>()) continue;
+    if (s->op.as<PlaceholderOpNode>()) continue;
     // Remove grouping sugar, get the real attach spec.
     Stage attach_spec = s.GetAttachSpec();
 
diff --git a/tvm/src/template/sdaccel/CLKernel.cpp b/tvm/src/template/sdaccel/CLKernel.cpp
deleted file mode 100644
index 84cf29465..000000000
--- a/tvm/src/template/sdaccel/CLKernel.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                         CLKernel.cpp                          */
-/*                                                               */
-/*          Defines the object class for an OpenCL kernel        */
-/*                                                               */
-/*===============================================================*/
-
-#include "CLKernel.h"
-#include <stdlib.h>
-
-namespace rosetta
-{
-  // initialize the kernel from binary file
-  CLKernel::CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id) 
-  {
-    printf("Creating kernel %s ... ", kernel_name.c_str());
-
-    int err;
-
-    // set the name and device ID
-    this->device_id = device_id;
-    this->kernel_name = kernel_name;
-
-    // Create the compute kernel in the program we wish to run
-    kernel = clCreateKernel(program, kernel_name.c_str(), &err);
-    if (!kernel || err != CL_SUCCESS)
-    {
-      printf("Error: Failed to create compute kernel!\n");
-      printf("Error Code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-
-    printf("Done!\n");  
-  }
- 
-  void CLKernel::set_global(int global_work_size[3]) 
-  {
-    printf("Set global work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), 
-           global_work_size[0], global_work_size[1], global_work_size[2]);
-
-    for (int i = 0; i < 3; i ++ )
-      this->global_size[i] = global_work_size[i];
-  }
-  
-  void CLKernel::set_local(int local_work_size[3]) 
-  {
-    printf("Set local work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), 
-           local_work_size[0], local_work_size[1], local_work_size[2]);
-
-    for (int i = 0; i < 3; i ++ )
-      this->local_size[i] = local_work_size[i];
-  }
-
-  std::string CLKernel::get_name()
-  {
-    return this->kernel_name;
-  }
-
-  void CLKernel::releaseKernel()
-  {
-    printf("Release kernel %s ... ", kernel_name.c_str());
-    // release kernel
-    clReleaseKernel(kernel);
-    printf("Done!\n");
-  }
-}
diff --git a/tvm/src/template/sdaccel/CLKernel.h b/tvm/src/template/sdaccel/CLKernel.h
deleted file mode 100644
index 2933913c8..000000000
--- a/tvm/src/template/sdaccel/CLKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                         CLKernel.h                            */
-/*                                                               */
-/*          Defines the object class for an OpenCL kernel        */
-/*                                                               */
-/*===============================================================*/
-
-
-#ifndef __CLKernel__Harness__
-#define __CLKernel__Harness__
-
-// standard headers
-#include <cstdio>
-#include <vector>
-#include <string>
-// opencl header
-#include <CL/cl.h>
-// CLMemObj is a member of this class
-#include "CLMemObj.h"
-
-namespace rosetta
-{
-  
-  // wrapper class around an OpenCL kernel
-  class CLKernel 
-  {
-
-    friend class CLWorld;
-
-    public:
-
-      // constructor
-      // compiles the kernel
-      CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id);
-
-      // set global/local work group size
-      void set_global(int global_work_size[3]);
-      void set_local(int local_work_size[3]);
-
-      // get kernel name
-      std::string get_name();
-
-    protected:
-
-      // set cl_mem argument
-      int set_mem_arg(int id, cl_mem mem_obj)
-      {
-        int err;
-        err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj);
-        if (err != CL_SUCCESS)
-        {
-          printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str());
-          printf("Error Code %d\n", err);
-          return EXIT_FAILURE;
-        }
-
-        return err;
-      }
-
-      // set memory arguments for this kernel
-      template<typename T>
-      int set_const_arg(int id, T& mem_obj)
-      {
-        int err;
-	// printf("%d\n", mem_obj);
-        err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj);
-	printf("****************\n");
-	printf("%d\n", err);
-        if (err != CL_SUCCESS)
-        {
-          printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str());
-          printf("Error Code %d\n", err);
-          return EXIT_FAILURE;
-        }
-
-        return err;
-      }
-
-      void releaseKernel();
-
-    private:
-
-      // global and local work group size
-      size_t global_size[3];
-      size_t local_size[3];
-
-      // kernel information and objects
-      std::string kernel_name;
-      cl_device_id device_id;		// target device id
-      cl_kernel kernel;                 // compute kernel
-
-  };
-
-}
-#endif /* defined(__CLKernel__Harness__) */
diff --git a/tvm/src/template/sdaccel/CLMemObj.cpp b/tvm/src/template/sdaccel/CLMemObj.cpp
deleted file mode 100644
index a6fdecf4a..000000000
--- a/tvm/src/template/sdaccel/CLMemObj.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                        CLMemObj.cpp                           */
-/*                                                               */
-/*       Implements the member functions of CLMemObj class       */
-/*                                                               */
-/*===============================================================*/
-
-
-#include "CLMemObj.h"
-
-namespace rosetta
-{
-  // default constructor, initializes everything to 0
-  CLMemObj::CLMemObj() 
-  {
-    this->mem_data = nullptr;
-    this->elt_size = 0;
-    this->length   = 0;
-    this->flags    = 0;
-    this->bank     = nullptr;
-  }
-  
-  // meaningful constructor, initialize data info constants
-  CLMemObj::CLMemObj(void *mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext ) 
-  {
-    this->mem_data = mem_data;
-    this->elt_size = elt_size;
-    this->length   = length;
-    this->flags    = flags;
-    // can use Xilinx mem extensions to specify DDR bank
-    if (xil_ext != nullptr)
-    {
-      this->bank = new cl_mem_ext_ptr_t;
-      this->bank->flags = xil_ext->flags;
-      this->bank->obj = xil_ext->obj;
-      this->bank->param = 0;
-    }
-    else
-      this->bank = nullptr;
-  }
-  
-  // return the pointer to data
-  void * CLMemObj::get_data()  { return mem_data; }
-  
-  // get size of each element
-  int CLMemObj::get_element_size() { return elt_size; }
-  
-  // get the number of elements in the buffer
-  int CLMemObj::get_length() { return length; }
-  
-  // get OpenCL memory flags
-  cl_mem_flags CLMemObj::get_flags() { return flags; }
-
-  // get xilinx memory extension pointer
-  cl_mem_ext_ptr_t* CLMemObj::get_xil_ext_ptr() { return bank; }
-}
diff --git a/tvm/src/template/sdaccel/CLMemObj.h b/tvm/src/template/sdaccel/CLMemObj.h
deleted file mode 100644
index 30e564aff..000000000
--- a/tvm/src/template/sdaccel/CLMemObj.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                         CLMemObj.h                            */
-/*                                                               */
-/*     Defines the object class for an OpenCL memory buffer      */
-/*                                                               */
-/*===============================================================*/
-
-
-#ifndef __CLMemObj__Harness__
-#define __CLMemObj__Harness__
-
-// standard header for command line output
-#include <cstdio>
-// opencl header
-#include <CL/cl.h>
-// xilinx opencl extension header
-#include <CL/cl_ext.h>
-
-namespace rosetta
-{
-  // wrapper class around cl_mem
-  class CLMemObj 
-  {
- 
-    friend class CLWorld;
-
-    public:
-  
-      // default constructor
-      CLMemObj ();
-      // a meaningful constructor
-      CLMemObj (void* mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext = nullptr);
-  
-      // get information about the buffer
-      void* get_data();
-      int get_element_size();
-      int get_length();
-      cl_mem_flags get_flags();
-      cl_mem_ext_ptr_t* get_xil_ext_ptr();
- 
-    private:
-  
-      // pointer to data
-      void *mem_data;
-      // size of each element
-      int elt_size;
-      // number of elements
-      int length;
-      // OpenCL memory flag
-      cl_mem_flags flags;
-      // Xilinx extension describing bank assignment
-      cl_mem_ext_ptr_t* bank;
-  };
-}
-
-#endif /* defined(__CLMemObj__Harness__) */
diff --git a/tvm/src/template/sdaccel/CLWorld.cpp b/tvm/src/template/sdaccel/CLWorld.cpp
deleted file mode 100644
index 7be386df2..000000000
--- a/tvm/src/template/sdaccel/CLWorld.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                         CLWorld.cpp                           */
-/*                                                               */
-/*             Implementation of the CLWorld class               */
-/*                                                               */
-/*===============================================================*/
-
-#include "CLWorld.h"
-
-namespace rosetta
-{
-  // default constructor
-  // make sure it does something meaningful
-  CLWorld::CLWorld()
-  {
-    // default: run on alpha data 7v3 board
-    this->target_device_name = "xilinx:adm-pcie-7v3:1ddr:3.0";
-    this->device_type = CL_DEVICE_TYPE_ACCELERATOR;
-
-    // configure the OpenCL runtime
-    createWorld();
-  }
-
-  // meaningful constructor
-  // user specifies device
-  CLWorld::CLWorld(std::string target_device_name, cl_device_type device_type)
-  {
-    this->target_device_name = target_device_name;
-    this->device_type = device_type;
-    createWorld();
-  }
-
-  // get the compute device
-  cl_device_id CLWorld::getDevice()
-  {
-    return this->device_id;
-  }
-
-  // get context
-  cl_context CLWorld::getContext()
-  {
-    return this->context;
-  }
-
-  // get compute program
-  cl_program CLWorld::getProgram()
-  {
-    return this->program;
-  }
-
-  // insert a new memory object
-  int CLWorld::addMemObj(CLMemObj &new_mem_obj)
-  {
-    int err;
-
-    printf("Adding memory object into the world ... ");
-
-    // first push the CLMemObj object into our vector
-    mem_objs.push_back(new_mem_obj);
-
-    // then create the actual cl_mem buffer, push it into another vector
-    cl_mem buf;
-
-    buf = clCreateBuffer(context, new_mem_obj.flags, new_mem_obj.elt_size * new_mem_obj.length, new_mem_obj.bank, &err);
-    if (err != CL_SUCCESS)
-    {
-      printf("Error creating buffer for memory object %d!\n", mem_objs.size()-1);
-      printf("Error Code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-
-    cl_mem_buffers.push_back(buf);
-
-    // write the buffer onto the device if needed
-    if ((new_mem_obj.flags != CL_MEM_WRITE_ONLY) && (new_mem_obj.mem_data != nullptr))
-    {
-      err = clEnqueueWriteBuffer(cmd_queue, buf, true, 0, new_mem_obj.elt_size * new_mem_obj.length, 
-                                 new_mem_obj.mem_data, 0, NULL, NULL);
-      if (err != CL_SUCCESS)
-      {
-        printf("Error writing buffer %d onto the device!\n", mem_objs.size()-1);
-        printf("Error Code %d\n", err);
-        exit(EXIT_FAILURE);
-      }
-    }
-
-    printf("Done!\n");
-
-    return (mem_objs.size() - 1);
-  }
-
-  int CLWorld::updateMemObj(int mem_idx)
-  {
-    printf("Updating mem object %d ... ", mem_idx);
-
-    // write the buffer onto the device if needed
-    if (mem_objs[mem_idx].flags != CL_MEM_WRITE_ONLY)
-    {
-      int err = clEnqueueWriteBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0, 
-                                     mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, 
-                                     mem_objs[mem_idx].mem_data, 0, NULL, NULL);
-      if (err != CL_SUCCESS)
-      {
-        printf("Error writing buffer %d onto the device!\n", mem_idx);
-        printf("Error Code %d\n", err);
-        exit(EXIT_FAILURE);
-      }
-    }
-    else
-      printf("Buffer %d is write_only! Not updating it ... \n", mem_idx);
-    
-    return EXIT_SUCCESS;
-  }
-   
-  int CLWorld::readMemObj(int mem_idx)
-  {
-    printf("Reading mem object %d into host buffers ... ", mem_idx);
-
-    int err = clEnqueueReadBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0,
-                                  mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, 
-				  mem_objs[mem_idx].mem_data, 0, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-      printf("Error reading kernel buffer %d!\n", mem_idx);
-      printf("Error code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-
-    printf("Done!\n");
-
-    return err;
-  }
-     
-
-  // create compute program from a file
-  // return error code
-  int CLWorld::addProgram(std::string filename)
-  {
-    printf("Adding binary program into the world ... ");
-
-    // load the file
-    size_t code_size = (size_t) load_file_to_memory(filename.c_str());
-
-    // start to compile
-    int err;
-    cl_int create_binary_status;
-
-    // Create the compute program from the source buffer
-    program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *) &code_size, 
-                                        (const unsigned char **) &kernel_code, &create_binary_status, &err);
-    if (!program)
-    {
-      printf("Error: Failed to create compute program!\n");
-      printf("Error Code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
- 
-    // Build the program executable
-    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
-    if (err != CL_SUCCESS)
-    {
-      size_t len;
-      char buffer[2048];
- 
-      printf("Error: Failed to build program executable!\n");
-      printf("Error Code %d\n", err);
-      clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
-      printf("%s\n", buffer);
-      exit(EXIT_FAILURE);
-    }
-
-    printf("Done!\n");
-
-    return err;
-  }
-
-  // insert a kernel into the world
-  // return the position of the kernel in the vector
-  int CLWorld::addKernel(CLKernel &new_kernel)
-  {
-    printf("Adding kernel %s into the world ... ", new_kernel.get_name().c_str());
-
-    kernels.push_back(new_kernel);
-
-    printf("Done!\n");
-
-    return (kernels.size() - 1);
-  }
-
-  // methods to set kernel arguments
-  // memory argument
-  int CLWorld::setMemKernelArg(int kernel_id, int pos, int arg_id)
-  {
-    printf("Set mem arg %d for kernel %d with mem object %d ... ", pos, kernel_id, arg_id);
-
-    int err = kernels[kernel_id].set_mem_arg(pos, cl_mem_buffers[arg_id]);
-    if (err != CL_SUCCESS)
-    {
-      printf("Error setting kernel argument!\n");
-      printf("Error code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-
-    printf("Done!\n");
-
-    return err;
-  }
-   
-  // run all kernels
-  // return error code
-  int CLWorld::runKernels(bool flush)
-  {
-    printf("Start kernel execution ... ");
-
-    int err;
-
-    // wait for previous write buffer tasks to finish
-    printf("Waiting for queue... \n");
-    clFinish(cmd_queue);
-
-    // enqueue all the kernels
-    // temporarily we assume kernels won't have any dependency between them
-    // or the dependency is handled inside kernels (such as pipes, etc. )
-    for (int i = 0; i < kernels.size(); i ++ )
-    {
-      printf("Start kernel %d!\n", i);
-      err = clEnqueueNDRangeKernel(cmd_queue, kernels[i].kernel, 3, NULL, kernels[i].global_size, kernels[i].local_size, 
-                                   0, NULL, NULL);
-      if (err != CL_SUCCESS)
-      {
-        printf("Error enqueuing kernel %d!\n", i);
-	printf("Error Code %d\n", err);
-	exit(EXIT_FAILURE);
-      }
-    }
-
-    // wait for them to finish
-    printf("Waiting for kernels ... \n");
-    clFinish(cmd_queue);
-
-    // remove all of them from the vector
-    // so that this function can be called multiple times
-    // at a cost that kernels won't be released automatically
-    if (flush)
-    {
-      int total_size = kernels.size();
-      for (int i = 0; i < total_size; i ++ )
-        kernels.pop_back();
-    }
-
-    printf("Done!\n");
-
-    return err;
-  }
-
-  // create runtime environment
-  int CLWorld::createWorld()
-  {
-    printf("Initializing OpenCL runtime environment ... ");
-
-    int err;
-
-    // scan the machine for available OpenCL platforms
-    cl_uint platform_cnt;
-    cl_platform_id platforms[16];
-    err = clGetPlatformIDs(16, platforms, &platform_cnt);
-    if (err != CL_SUCCESS)
-    {
-      printf("Error: Failed to find an OpenCL platform!\n");
-      printf("Error Code %d\n", err);
-      printf("Test failed\n");
-      exit(EXIT_FAILURE);
-    }
-    printf("INFO: Found %d platforms\n", platform_cnt);
-
-
-    // find the target device
-    char device_name[1024];
-    cl_device_id devices[16];
-    cl_uint device_cnt;
-    bool found_device = false;
-    // scan all platforms
-    for (int p = 0; (p < platform_cnt) & (!found_device); p ++ )
-    {
-      err = clGetDeviceIDs(platforms[p], this->device_type, 16, devices, &device_cnt);
-      if (err != CL_SUCCESS)
-      {
-        printf("Error: Failed to create a device group for platform %d!\n", p);
-        printf("Error Code %d\n", err);
-        printf("Test failed\n");
-        exit(EXIT_FAILURE);
-      }
-      // iterate through all devices on the platform
-      for (int d = 0; (d < device_cnt) & (!found_device); d ++ )
-      {
-        err = clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 1024, device_name, 0);
-        if (err != CL_SUCCESS) 
-	{
-	  printf("Error: Failed to get device name for device %d on platform %d!\n", d, p);
-	  printf("Error Code %d\n", err);
-	  printf("Test failed\n");
-	  exit(EXIT_FAILURE);
-	}
-
-        if (std::string(device_name) == this->target_device_name)
-	{
-	  this->platform = platforms[p];
-	  this->device_id = devices[d];
-          found_device = true;
-	  printf("Selected device %d on platform %d as target device!\n", d, p);
-	}
-      }
-    }
-
-    if (!found_device)
-    {
-      printf("Error: Target device %s is not found!\n", (this->target_device_name).c_str());
-      exit(EXIT_FAILURE);
-    }
-
-    // create context and command queue
-    this->context = clCreateContext(0, 1, &(this->device_id), 0, 0, &err);
-    if (!(this->context))
-    {
-      printf("Error: Failed to create a compute context!\n");
-      printf("Error Code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-    this->cmd_queue = clCreateCommandQueue(this->context, this->device_id, 
-                                           CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
-					   &err);
-    if (!(this->cmd_queue))
-    {
-      printf("Error: Failed to create a command queue!\n");
-      printf("Error Code %d\n", err);
-      exit(EXIT_FAILURE);
-    }
-
-    printf("Done!\n");
-
-    return err;
-  }
-
-  // read kernel binary file into memory
-  int CLWorld::load_file_to_memory(const char *filename) 
-  {
-    int size = 0;
-    FILE *f = fopen(filename, "rb");
-    if (f == NULL)
-    {
-      kernel_code = NULL;
-      printf("Can not open kernel file!\n");
-      exit(-1);
-    }
-    fseek(f, 0, SEEK_END);
-    size = ftell(f);
-    printf("Size of the file is %ld\n", size);
-    fseek(f, 0, SEEK_SET);
-    kernel_code = new char[size+1];
-    if ((unsigned int) size != fread(kernel_code, sizeof(char), size, f))
-    {
-      delete []kernel_code;
-      printf("Reading kernel failed!\n");
-      exit(-2);
-    }
-    fclose(f);
-    (kernel_code)[size] = 0;
-    return size;
-  }
-
-
-  // release all runtime constructs
-  void CLWorld::releaseWorld()
-  {
-    printf("Cleaning up OpenCL opjects ... ");
-
-    // release memory objects
-    for (int i = 0; i < cl_mem_buffers.size(); i ++ )
-      clReleaseMemObject(cl_mem_buffers[i]);
-
-    // release program
-    delete []kernel_code;
-    clReleaseProgram(program);
-
-    // release kernels
-    for (int i = 0; i < kernels.size(); i ++ )
-      kernels[i].releaseKernel();
-
-    // release device and context
-    clReleaseCommandQueue(cmd_queue);
-    clReleaseContext(context);
-
-    printf("Done!\n");
-  }
-
-}
-
-
-
-
diff --git a/tvm/src/template/sdaccel/CLWorld.h b/tvm/src/template/sdaccel/CLWorld.h
deleted file mode 100644
index 9624687aa..000000000
--- a/tvm/src/template/sdaccel/CLWorld.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                          CLWorld.h                            */
-/*                                                               */
-/*          Defines the object class for OpenCL context          */
-/*                                                               */
-/*===============================================================*/
-
-
-#ifndef __CLWorld__Harness__
-#define __CLWorld__Harness__
-
-// standard headers
-#include <cstdio>
-#include <string>
-#include <vector>
-// opencl header
-#include <CL/cl.h>
-// CLKernel and CLMemObj are members of this class
-#include "CLKernel.h"
-#include "CLMemObj.h"
-
-namespace rosetta
-{
-
-  class CLWorld
-  {
-    
-    public:
-
-      // default constructor
-      CLWorld();
-
-      // meaningful constructor
-      CLWorld(std::string target_device_name, cl_device_type device_type);
-
-      // get the compute device associated with this world
-      cl_device_id getDevice();
-
-      // get the compute context associated with this world
-      cl_context getContext();
-
-      // get the binary program 
-      cl_program getProgram();
-
-      // insert a compute program 
-      int addProgram(std::string filename);
-
-      // insert a kernel
-      int addKernel(CLKernel &new_kernel);
-
-      // insert a memory object
-      int addMemObj(CLMemObj &new_mem_obj);
-
-      // update a memory object (write new value)
-      int updateMemObj(int mem_id);
-
-      // read a memory object
-      int readMemObj(int mem_id);
-
-      // set memory kernel argument
-      int setMemKernelArg(int kernel_id, int pos, int mem_id);
-
-      // set constant kernel argument
-      template<typename T>
-      int setConstKernelArg(int kernel_id, int pos, T& arg)
-      {
-	// printf("%lu\n", arg);
-        printf("Set const arg %d for kernel %d ... ", pos, kernel_id);
-    
-        int err = kernels[kernel_id].set_const_arg(pos, arg);
-        if (err != CL_SUCCESS)
-        {
-          printf("Error setting kernel argument!\n");
-          printf("Error code %d\n", err);
-          exit(EXIT_FAILURE);
-        }
-    
-        printf("Done!\n");
-    
-        return err;
-      }
-
-      // run kernels
-      int runKernels(bool flush = false);
-
-      // clean up
-      void releaseWorld();
-
-    private:
-
-      // OpenCL runtime variables
-
-      // the platform we will use
-      cl_platform_id platform;
-
-      // the device we will use
-      std::string target_device_name;	// device name
-      cl_device_type device_type;       // device type
-      cl_device_id device_id;           // device id
-
-      // compute context
-      cl_context context;
-
-      // command queue
-      cl_command_queue cmd_queue;        
-
-      // binary program for the device
-      char* kernel_code;
-      cl_program program;
-
-      // kernels
-      std::vector<CLKernel> kernels;
-
-      // memory objects
-      std::vector<CLMemObj> mem_objs;
-      // actual OpenCL memory buffers
-      std::vector<cl_mem>   cl_mem_buffers;
-
-      // function to create the OpenCL runtime
-      int createWorld();
-
-      // load binary file into memory
-      int load_file_to_memory(const char *filename);
-  };
-
-}
-
-#endif
diff --git a/tvm/src/template/sdaccel/Makefile b/tvm/src/template/sdaccel/Makefile
deleted file mode 100644
index 282f67921..000000000
--- a/tvm/src/template/sdaccel/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-# Set kernel name
-KERNEL_NAME = App
-
-# Set host source and headers
-# HOST_SRC_CPP = ./src/host/digit_recognition.cpp ./src/host/utils.cpp ./src/host/check_result.cpp
-HOST_SRC_CPP = host.cpp utils.cpp 
-# HOST_SRC_H   = ./src/host/utils.h ./src/host/check_result.h ./src/host/typedefs.h ./src/host/testing_data.h \
-               ./src/host/training_data.h
-HOST_SRC_H = utils.h
-# DATA         = ./data/*.dat
-
-
-# Set host code include paths
-HOST_INC = -I/opt/Xilinx/Vivado/2018.2.op2258646/include/
-HOST_LIB = -L/opt/Xilinx/Vivado/2018.2.op2258646/lib/
-
-# Set kernel file
-OCL_KERNEL_SRC = interface.cpp
-# OCL_KERNEL_H = ./src/host/typedefs.h
-# SDSOC_KERNEL_SRC = ./src/sdsoc/digitrec.cpp
-# SDSOC_KERNEL_H = ./src/host/typedefs.h
-# SW_KERNEL_SRC = ./src/sw/digitrec_sw.cpp
-# SW_KERNEL_H = ./src/host/typedefs.h ./src/sw/digitrec_sw.h
-
-# Set opencl kernel arguments
-# log: removed --report system
-OCL_KERNEL_ARGS = --max_memory_ports all 
-
-#-------------------------
-# Leave the rest to harness
-#-------------------------
-include harness.mk
-
diff --git a/tvm/src/template/sdaccel/harness.mk b/tvm/src/template/sdaccel/harness.mk
deleted file mode 100644
index 23856f9c7..000000000
--- a/tvm/src/template/sdaccel/harness.mk
+++ /dev/null
@@ -1,196 +0,0 @@
-# ======================================== Check Xilinx SDX Environment Settings ================================================== #
-ifndef XILINX_SDX
-  $(error Environment variable XILINX_SDX is required and should point to SDx install area)
-endif
-
-# =============================================== Tools Used in Rosetta =========================================================== #
-
-# sdaccel tools
-OCL_CXX   = xcpp
-XOCC      = xocc
-
-# sdsoc tools
-SDSXX     = sds++
-
-# default sw compiler
-SW_CXX = g++
-
-# ============================================= SDAccel Platform and Target Settings ============================================== #
-
-# Set Default OpenCL device and platform
-USR_PLATFORM = n
-OCL_DEVICE = xilinx:adm-pcie-7v3:1ddr:3.0
-OCL_PLATFORM = one_of_default_platforms
-
-# Check if the user specified opencl platform
-ifneq ($(OCL_PLATFORM), one_of_default_platforms)
-  USR_PLATFORM=y
-endif
-
-# Check OCL_TARGET value
-OCL_TARGET  = sw_emu
-ifeq ($(OCL_TARGET),sw_emu)
-else ifeq ($(OCL_TARGET),hw_emu)
-else ifeq ($(OCL_TARGET),hw)
-else
-  $(error "OCL_TARGET does not support the $(OCL_TARGET) value. Supported values are: sw_emu, hw_emu, hw")
-endif
-
-# Check opencl kernel file type
-OCL_KERNEL_TYPE = ocl
-
-ifeq ($(suffix $(OCL_KERNEL_SRC)),.cl)
-  OCL_KERNEL_TYPE=ocl
-else
-  OCL_KERNEL_TYPE=c
-endif
-
-# OpenCL runtime Libraries
-OPENCL_INC = $(XILINX_SDX)/runtime/include/1_2
-OPENCL_LIB = $(XILINX_SDX)/runtime/lib/x86_64
-
-# opencl harness files
-OCL_HARNESS_DIR     = .
-OCL_HARNESS_SRC_CPP = $(OCL_HARNESS_DIR)/CLKernel.cpp $(OCL_HARNESS_DIR)/CLMemObj.cpp $(OCL_HARNESS_DIR)/CLWorld.cpp
-OCL_HARNESS_SRC_H   = $(OCL_HARNESS_DIR)/CLKernel.h   $(OCL_HARNESS_DIR)/CLMemObj.h   $(OCL_HARNESS_DIR)/CLWorld.h
-
-# host compilation flags
-OCL_HOST_FLAGS = -DOCL -g -lxilinxopencl -I$(OPENCL_INC) $(HOST_INC) -L$(OPENCL_LIB) $(HOST_LIB) -I$(OCL_HARNESS_DIR) -I$(APPLICATION_DIR)
-
-# xclbin compilation flags
-XCLBIN_FLAGS = -s -t $(OCL_TARGET) -g 
-
-# change OCL_HOST_FLAG
-ifdef K_CONST
- OCL_HOST_FLAGS += -DK_CONST=$(K_CONST)
-endif
-ifdef NUM_ITER 
- OCL_HOST_FLAGS += -DNUM_ITER=$(NUM_ITER)
-endif
-ifdef FIXED_FLAG
- OCL_HOST_FLAGS += -DFIXED_TYPE
-endif
-
-
-ifneq ($(KERNEL_TYPE),ocl)
-  XCLBIN_FLAGS += --kernel $(KERNEL_NAME)
-endif
-
-ifeq ($(USR_PLATFORM),n)
-  XCLBIN_FLAGS += --xdevice $(OCL_DEVICE)
-else
-  XCLBIN_FLAGS += --platform $(OCL_PLATFORM)
-endif
-
-
-# change XCLBIN_FLAGS
-ifdef K_CONST
- XCLBIN_FLAGS += -DK_CONST=$(K_CONST)
-endif
-ifdef NUM_ITER
-  XCLBIN_FLAGS += -DNUM_ITER=$(NUM_ITER)
-endif
-ifdef FIXED_FLAG
-  XCLBIN_FLAGS += -DFIXED_TYPE
-endif
-
-
-XCLBIN_FLAGS += $(OCL_KERNEL_ARGS)
-
-
-# host exe
-OCL_HOST_EXE        = $(KERNEL_NAME)_host.exe
-
-# Kernel XCLBIN file
-XCLBIN        = $(KERNEL_NAME).$(OCL_TARGET).xclbin
-XO            = $(KERNEL_NAME).$(OCL_TARGET).xo
-
-# =============================================== SDSoC Platform and Target Settings ============================================== #
-
-# platform
-SDSOC_PLATFORM = zc706
-
-# executable
-SDSOC_EXE = $(KERNEL_NAME).elf
-
-# sds++ flags
-SDSFLAGS = -sds-pf $(SDSOC_PLATFORM) -sds-hw $(KERNEL_NAME) $(SDSOC_KERNEL_SRC) -sds-end -clkid 3  \
-           -poll-mode 1 -verbose
-SDSCFLAGS += -DSDSOC -Wall -O3 -c
-SDSCFLAGS += -MMD -MP -MF"$(@:%.o=%.d)"
-SDSLFLAGS = -O3 
-
-# objects
-ALL_SDSOC_SRC = $(HOST_SRC_CPP) $(SDSOC_KERNEL_SRC)
-OBJECTS := $(ALL_SDSOC_SRC:.cpp=.o)
-DEPS := $(OBJECTS:.o=.d)
-
-# =============================================== Pure Software Compilation Settings ============================================== #
-
-# compiler flags
-SW_FLAGS = -DSW -O3
-
-# sw executable
-SW_EXE = $(KERNEL_NAME)_sw.exe
-
-# ========================================================= Rules ================================================================= #
-
-# we will have 4 top-level rules: ocl, sdsoc, sw and clean
-# default to sw
-
-.PHONY: all ocl sdsoc sw clean
-
-all: sw
-
-# ocl rules
-ocl: $(OCL_HOST_EXE) $(XCLBIN)
-
-# ocl secondary rule: host executable
-$(OCL_HOST_EXE): $(HOST_SRC_CPP) $(HOST_SRC_H) $(OCL_HARNESS_SRC_CPP) $(OCL_HARNESS_SRC_H) $(DATA)
-	$(OCL_CXX) $(OCL_HOST_FLAGS) -o $@ $(HOST_SRC_CPP) $(OCL_HARNESS_SRC_CPP) 
-
-# ocl secondary rule: xclbin 
-$(XCLBIN): $(XO)
-	$(XOCC) -l $(XCLBIN_FLAGS) -o $@ $(XO)
-
-# ocl secondary rule: xo
-$(XO): $(OCL_KERNEL_SRC) $(OCL_KERNEL_H)
-	$(XOCC) -c $(XCLBIN_FLAGS) -o $@ $(OCL_KERNEL_SRC)
-
-# sdsoc rules
-sdsoc: $(SDSOC_EXE)
-
-$(SDSOC_EXE): $(OBJECTS)
-	$(SDSXX) $(SDSFLAGS) $(SDSLFLAGS) ${OBJECTS} -o $@
-
--include $(DEPS)
-
-%.o: %.cpp
-	$(SDSXX) $(SDSFLAGS) $(SDSCFLAGS) $< -o $@
-
-
-# software rules
-sw: $(HOST_SRC_CPP) $(HOST_SRC_H) $(SW_KENREL_SRC) $(SW_KERNEL_H) $(DATA)
-	$(SW_CXX) $(SW_FLAGS) -o $(SW_EXE) $(HOST_SRC_CPP) $(SW_KERNEL_SRC)
-
-# cleanup
-clean:
-	@echo "Cleaning old files"
-	rm -rf *.exe
-	rm -rf *.elf
-	rm -rf *.xclbin
-	rm -rf *.bit
-	rm -rf *.rpt
-	rm -rf system_estimate.xtxt
-	rm -rf _xocc*
-	rm -rf _sds
-	rm -rf sd_card
-	rm -rf .Xil
-	rm -rf ./src/host/*.d
-	rm -rf ./src/sdsoc/*.o
-	rm -rf ./src/sdsoc/*.d
-	rm -rf ./src/host/*.o
-	rm -rf *.dat
-	rm -rf *.html
-	rm -rf *.csv
-	rm -rf *.json
diff --git a/tvm/src/template/sdaccel/run.tcl b/tvm/src/template/sdaccel/run.tcl
deleted file mode 100644
index 0d6dca4b5..000000000
--- a/tvm/src/template/sdaccel/run.tcl
+++ /dev/null
@@ -1,14 +0,0 @@
-set hls_prj digitrec.prj
-open_project ${hls_prj} -reset
-set_top default_function
-add_files -tb main.cpp
-add_files -tb data
-
-open_solution "solution1"
-set_part {xc7z020clg484-1}
-create_clock -period 10
-
-csim_design -O
-csynth_design
-#cosim_design
-exit
diff --git a/tvm/src/template/sdaccel/run_hw.sh b/tvm/src/template/sdaccel/run_hw.sh
deleted file mode 100755
index f65d28e6d..000000000
--- a/tvm/src/template/sdaccel/run_hw.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#===============================================================#
-#                                                               #
-#                       	run_hw.sh                         	#
-#                                                               #
-#   	A bash script to synthesize and generate bitstream 			#
-#																#
-#                                                               #
-#===============================================================#
-
-
-#!/bin/bash
-make clean
-
-# the k value of KNN, default is 3
-k_value=3
-# the directory of this lab
-app_dir=`pwd`
-
-### COMPILATION
-# create some blank-line space for easy readability
-echo ""; echo ""; echo "" ; echo ""
-echo "####################################################"
-echo " Synthesize and Generate Bitstream with K_CONST=$k_value"
-echo "####################################################"
-make ocl OCL_TARGET=hw OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value
-#export XCL_EMULATION_MODE=hw_emu
-#./DigitRec_host.exe -f DigitRec.hw_emu.xclbin 
-
diff --git a/tvm/src/template/sdaccel/run_sw.sh b/tvm/src/template/sdaccel/run_sw.sh
deleted file mode 100755
index 80ba00495..000000000
--- a/tvm/src/template/sdaccel/run_sw.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#===============================================================#
-#                                                               #
-#                       	run1.sh                         	#
-#                                                               #
-#   	A bash script to run the software emulation 			#
-#																#
-#                                                               #
-#===============================================================#
-
-
-#!/bin/bash
-make clean
-
-# check env variable setup
-if [ -z "$AWS_PLATFORM" ]; then
-    echo "AWS_PLATFORM not set up; use default"
-    export AWS_PLATFORM=xilinx:adm-pcie-7v3:1ddr:3.0
-fi
-
-# set up emulation configuration
-echo "#################################################"
-echo " Setting emulation configuration..."
-echo "#################################################"
-export LC_CTYPE=en_US.UTF-8
-export LC_ALL=en_US.UTF-8
-export XCL_EMULATION_MODE=true
-emconfigutil --platform=$AWS_PLATFORM
-
-# the k value of KNN, default is 3
-k_value=3
-# the directory of this lab
-app_dir=`pwd`
-
-### COMPILATION
-# create some blank-line space for easy readability
-echo ""; echo ""; echo "" ; echo ""
-echo "####################################################"
-echo " Compiling project with K_CONST=$k_value"
-echo "####################################################"
-make ocl OCL_TARGET=sw_emu OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value
-
-
-### EXECUTION
-echo ""; echo ""; echo "" ; echo ""
-echo "####################################################"
-echo " Executing DigitRec with K_CONST=$k_value"
-echo "####################################################"
-export XCL_EMULATION_MODE=sw_emu
-#export XCL_EMULATION_MODE=hw_emu
-./App_host.exe -f App.sw_emu.xclbin 
-
diff --git a/tvm/src/template/sdaccel/utils.cpp b/tvm/src/template/sdaccel/utils.cpp
deleted file mode 100644
index 0e6dd632e..000000000
--- a/tvm/src/template/sdaccel/utils.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                          utils.cpp                            */
-/*                                                               */
-/*                       Utility functions                       */
-/*                                                               */
-/*===============================================================*/
-
-#include <string>
-#include <cstdio>
-#include <getopt.h>
-#include <stdlib.h>
-
-#include "utils.h"
-
-void print_usage(char* filename)
-{
-    printf("usage: %s <options>\n", filename);
-    printf("  -f [kernel file]\n");
-}
-
-void parse_sdaccel_command_line_args(
-    int argc,
-    char** argv,
-    std::string& kernelFile) 
-{
-
-  int c = 0;
-
-  while ((c = getopt(argc, argv, "f:")) != -1) 
-  {
-    switch (c) 
-    {
-      case 'f':
-        kernelFile = optarg;
-        break;
-      default:
-      {
-        print_usage(argv[0]);
-        exit(-1);
-      }
-    } // matching on arguments
-  } // while args present
-}
-
-
diff --git a/tvm/src/template/sdaccel/utils.h b/tvm/src/template/sdaccel/utils.h
deleted file mode 100644
index a3ab77437..000000000
--- a/tvm/src/template/sdaccel/utils.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*===============================================================*/
-/*                                                               */
-/*                           utils.h                             */
-/*                                                               */
-/*                       Utility functions                       */
-/*                                                               */
-/*===============================================================*/
-
-#include <string>
-//target device
-const std::string TARGET_DEVICE = "xilinx_aws-vu9p-f1-04261818_dynamic_5_0";
-
-void print_usage(char* filename);
-
-void parse_sdaccel_command_line_args(
-    int argc,
-    char** argv,
-    std::string& kernelFile);
-
diff --git a/tvm/src/template/vivado/Makefile b/tvm/src/template/vivado/Makefile
deleted file mode 100644
index 1d84baead..000000000
--- a/tvm/src/template/vivado/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-#==========================================================================
-# Makefile
-#==========================================================================
-# @brief: A makefile the compiles and synthesizes the program
-#
-# @desc: 1. "make" runs csim by default
-#        2. "make csim" compiles & executes the fixed-point implementation
-#        3. "make clean" cleans up the directory
-
-
-# Extract Vivado HLS include path
-VHLS_PATH := $(dir $(shell which vivado_hls))/..
-VHLS_INC ?= ${VHLS_PATH}/include
-
-CFLAGS = -g -I${VHLS_INC} 
-
-all: csim
-
-csim:  host.cpp 
-	@echo "Compiling & simulating on amdpool ..."
-	g++ ${CFLAGS} $^ -o out -lrt
-	./out
-
-vivado:
-	@echo "Run Vivado csim and HLS"
-	vivado_hls -f run.tcl
-
-clean:
-	rm -rf out *.txt *.dat *.prj *.log
-	rm -rf zedboard_project* xillydemo.bit
-
diff --git a/tvm/src/template/vivado/run.tcl b/tvm/src/template/vivado/run.tcl
deleted file mode 100644
index d80b865df..000000000
--- a/tvm/src/template/vivado/run.tcl
+++ /dev/null
@@ -1,36 +0,0 @@
-#=============================================================================
-# run_base.tcl 
-#=============================================================================
-# @brief: A Tcl script for synthesizing the design.
-
-# Project name
-set hls_prj out.prj
-
-# Open/reset the project
-open_project ${hls_prj} -reset
-
-# Top function of the design is "top"
-set_top top
-
-# Add design and testbench files
-add_files kernel.cpp
-add_files -tb host.cpp
-
-open_solution "solution1"
-# Use Zynq device
-set_part {xc7z020clg484-1}
-
-# Target clock period is 10ns
-create_clock -period 10
-
-# Directives 
-
-############################################
-
-# Simulate the C++ design
-csim_design -O
-# Synthesize the design
-csynth_design
-# Co-simulate the design
-#cosim_design
-exit
diff --git a/tvm/src/template/vivado/timer.h b/tvm/src/template/vivado/timer.h
deleted file mode 100644
index 77c461b00..000000000
--- a/tvm/src/template/vivado/timer.h
+++ /dev/null
@@ -1,94 +0,0 @@
-//---------------------------------------------------------
-// Timer.h
-//---------------------------------------------------------
-#ifndef __TIMER_H__
-#define __TIMER_H__
-#include <time.h>
-#include <sys/time.h>
-#include <string.h>
-#include <stdio.h>
-
-#define TIMER_ON
-
-//---------------------------------------------------------
-// Timer is an object which helps profile programs using
-// the clock() function.
-// - By default, a timer is stopped when you instantiate it
-//   and must be started manually
-// - Passing True to the constructor starts the timer when
-//   it is constructed
-// - When the timer is destructed it prints stats to stdout
-//---------------------------------------------------------
-class Timer {
-
-  #ifdef TIMER_ON
-
-    char binName[50];
-    unsigned nCalls;
-    timeval ts_start;
-    float totalTime;
-    
-    public:
-      //------------------------------------------------------------------
-      // constructor
-      //------------------------------------------------------------------
-      Timer (const char* Name="", bool On=false) {
-        if (On) {
-          // record the start time
-          gettimeofday(&ts_start, NULL);
-          nCalls = 1;
-        }
-        else {
-          nCalls = 0;
-        }
-        totalTime = 0;	
-        strcpy(binName, Name);
-      }
-
-      //------------------------------------------------------------------
-      // destructor
-      //------------------------------------------------------------------
-      ~Timer () {
-        // on being destroyed, print the average and total time
-        if (nCalls > 0) {
-          printf ("%-20s: ", binName);
-          printf ("%6d calls; ", nCalls);
-          printf ("%7.3f msecs total time\n", 1000*totalTime);
-          //printf ("%7.4f msecs average time;\n", 1000*totalTime/nCalls);
-        }
-      }
-      
-      //------------------------------------------------------------------
-      // start timer
-      //------------------------------------------------------------------
-      void start() {
-        // record start time
-        gettimeofday(&ts_start, NULL);
-        nCalls++;
-      }
-      
-      //------------------------------------------------------------------
-      // stop timer
-      //------------------------------------------------------------------
-      void stop() {
-        // get current time, add elapsed time to totalTime
-        timeval ts_curr;
-        gettimeofday(&ts_curr, NULL);
-        totalTime += float(ts_curr.tv_sec - ts_start.tv_sec) +
-                     float(ts_curr.tv_usec)*1e-6 - float(ts_start.tv_usec)*1e-6;
-      }
-
-  #else
-
-    //--------------------------------------------------------------------
-    // all methods do nothing if TIMER_ON is not set
-    //--------------------------------------------------------------------
-    public:
-      Timer (const char* Name, bool On=true) {}
-      void start() {}
-      void stop() {}
-
-  #endif
-};
-
-#endif