diff --git a/.circleci/config.yml b/.circleci/config.yml index ba119097d..4fa87539c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,7 @@ test: &test key: v1.03-libhcl- - run: make build-python - run: pip install --user pytest + - run: pip install --user future - run: python -m pytest tests - run: pip install --user mxnet - run: python -m pytest samples diff --git a/.gitignore b/.gitignore index a70651d15..65f3dfcf8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ tags docs/source/samples docs/source/tutorials soda_* -*.cpp -*.h out # Downloaded files diff --git a/HISTORY b/HISTORY new file mode 100644 index 000000000..e08d564bc --- /dev/null +++ b/HISTORY @@ -0,0 +1,11 @@ +### 2019-12-09 + * fixed zc706 simulation issue + * removed kernel-name variable allocation before KernelDef + * changed multi-dimension array access to row-major single-dimension access + * created a local buffer for each on-device variable + * updated the `KernelUpdater` class (using position index instead of name) + * added `stream_arg_pos` map in `CodeGenC` to facilitate codegen with streaming + * fixed test cases + * changed tvm `build` function to support legacy string type target + * fixed opencl aocl data type mismatching issue + * fixed kernel def data type conversion issue diff --git a/Makefile b/Makefile index 88c653d77..9508b9171 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,15 @@ build-tvm: build-pkgs build-hcl: build-tvm cd python; \ - python setup.py install --user; \ + python setup.py develop --user; \ cd ../hlib/python; \ - python setup.py install --user; + python setup.py develop --user; build-python: cd python; \ - python setup.py install --user; \ + python setup.py develop --user; \ cd ../hlib/python; \ - python setup.py install --user; + python setup.py develop --user; clean: rm -rf build diff --git a/Makefile.config b/Makefile.config index 2060d201c..60d1cfd3e 100644 --- a/Makefile.config +++ b/Makefile.config @@ -12,6 +12,9 @@ CMAKE_OK = no # set whether to use vivado hls runtime USE_VIVADO_HLS = 1 +# set whether to use sdaccel opencl runtime +USE_SDACCEL_HLS = 1 + # Specify current directory level with respect to CLAY_ROOT ifndef LEVEL LEVEL := .
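The HISTORY note on row-major single-dimension access refers to flattening an N-D index into one linear offset. A minimal Python sketch of that mapping (the helper name is illustrative, not part of this patch):

```python
import numpy as np

# Row-major flattening: element [n][c][h][w] of an (N, C, H, W) tensor
# lives at a single offset into the underlying 1-D buffer.
def flatten_index(n, c, h, w, C, H, W):
    return ((n * C + c) * H + h) * W + w

# Cross-check against numpy's own row-major ("C order") layout.
a = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
assert a[1, 2, 3, 4] == a.ravel()[flatten_index(1, 2, 3, 4, 3, 4, 5)]
```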
diff --git a/hlib/python/hlib/nn.py b/hlib/python/hlib/nn.py index c8fa146a8..8f1c4d0e8 100644 --- a/hlib/python/hlib/nn.py +++ b/hlib/python/hlib/nn.py @@ -32,6 +32,17 @@ def _pad(*indices): return data[tuple(index_tuple)] return hcl.compute(out_shape, _pad, name='pad') +def conv2d_nchw_imp(Input, Filter, Output, stride=[1,1], padding=[[0,0],[0,0]]): + with hcl.for_(0,Output.shape[0]) as n: + with hcl.for_(0,Output.shape[1]) as c: + with hcl.for_(0,Output.shape[2]) as h: + with hcl.for_(0,Output.shape[3]) as w: + partial = hcl.scalar(0) + with hcl.for_(0,Filter.shape[-2]) as x: + with hcl.for_(0,Filter.shape[-1]) as y: + partial.v += Input[n][c][h+x][w+y] * Filter[0][0][x][y] + Output[n,c,h,w] = partial + def conv2d_nchw(Input, Filter, name="conv2d", stride=[1,1], padding=[[0,0],[0,0]]): out_dtype = Input.dtype batch, in_channel, in_height, in_width = Input.shape diff --git a/hlib/rocc-ppac b/hlib/rocc-ppac new file mode 160000 index 000000000..40d323d0c --- /dev/null +++ b/hlib/rocc-ppac @@ -0,0 +1 @@ +Subproject commit 40d323d0c81e2f64dbfb63afb5eb5d6ccf7c5e48 diff --git a/python/heterocl/__init__.py b/python/heterocl/__init__.py index 588196177..4b90160f0 100644 --- a/python/heterocl/__init__.py +++ b/python/heterocl/__init__.py @@ -3,6 +3,7 @@ from .compute_api import * from .dsl import * from .types import * +from .devices import * from .nparray import * from .debug import hcl_excepthook from .tvm.intrin import * diff --git a/python/heterocl/api.py b/python/heterocl/api.py index 4da52786f..f3e2151c8 100644 --- a/python/heterocl/api.py +++ b/python/heterocl/api.py @@ -53,7 +53,7 @@ def app2(A, B, C): # execute f2 """ # set the configurations - config.init_dtype = init_dtype + config.init_dtype = init_dtype # initialize global variables Schedule.stage_ops = [] Schedule.last_stages = OrderedSet([]) @@ -90,7 +90,7 @@ def placeholder(shape, name=None, dtype=None): """ name = util.get_name("placeholder", name) dtype = util.get_dtype(dtype) - + if shape == (): return Scalar(tvm_api._Var(name, dtype)) tensor = Tensor(shape, dtype, name) diff --git a/python/heterocl/debug.py b/python/heterocl/debug.py index cba313e23..a885d2e0b 100644 --- a/python/heterocl/debug.py +++ b/python/heterocl/debug.py @@ -45,6 +45,11 @@ class TensorError(HCLError): def __init__(self, msg): HCLError.__init__(self, msg, "\33[1;31m[Tensor]\33[0m ") +class DeviceError(HCLError): + """A subclass for specifying device related exception""" + def __init__(self, msg): + HCLError.__init__(self, msg, "\33[1;31m[Device]\33[0m ") + def hcl_excepthook(etype, value, tb): """Customized excepthook diff --git a/python/heterocl/devices.py b/python/heterocl/devices.py new file mode 100644 index 000000000..a5d81df86 --- /dev/null +++ b/python/heterocl/devices.py @@ -0,0 +1,278 @@ +"""Define HeteroCL device types""" +#pylint: disable=too-few-public-methods, too-many-return-statements +from .debug import DeviceError +from .tools import option_table, model_table +from future.utils import with_metaclass + +class tooling(type): + def __getattr__(cls, key): + if key in option_table: + return cls(key, *option_table[key]) + else: # unsupported device + raise DeviceError("not supported") + +class tool(with_metaclass(tooling, object)): + """The base class for all device tooling + + mode (sim/impl) is decided by tool configuration + e.g. 
run sw emulation by passing gcc / vivado_hls arg + and actual impl by passing sdaccel / aocl arg + + Parameters + ---------- + name: str + The name of the tool + mode: str + The execution mode of the tool (e.g. sim or impl) + """ + def __init__(self, name, mode, kwargs): + self.name = name + self.mode = mode + self.options = kwargs + + def __getattr__(self, entry): + return self.options[entry] + + def __call__(self, mode, setting={}): + self.mode = mode + self.options = setting + return self + + def __str__(self): + return str(self.name) + "-" + \ + str(self.mode) + ":\n" + \ + str(self.options) + + def __repr__(self): + return str(self.name) + "-" + \ + str(self.mode) + ":\n" + \ + str(self.options) + +tool_table = { + "aws_f1" : tool("sdaccel", *option_table["sdaccel"]), + "zc706" : tool("vivado_hls", *option_table["vivado_hls"]), + "ppac" : tool("rocket", *option_table["rocket"]), + "stratix10_sx": tool("aocl", *option_table["aocl"]), + "llvm" : tool("llvm", *option_table["llvm"]) +} + +class Device(object): + """The base class for all device types + + The default data placement is on CPU. + + Parameters + ---------- + types: str + The type of the device for data placement + model: str + The model of the device for data placement + """ + def __init__(self, types, vendor, + model, **kwargs): + self.vendor = vendor + self.types = types + self.model = model + self.impls = {"lang": ""} + for key, value in kwargs.items(): + self.impls[key] = value + + def __getattr__(self, key): + """ device hierarchy """ + return self.impls[key] + + def set_lang(self, lang): + assert lang in \ + ["opencl", "hlsc", "c", "opengl", "merlinc", "cuda", "metal"], \ + "unsupported lang spec " + lang + self.impls["lang"] = lang + return self + +class CPU(Device): + """cpu device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["riscv", "arm", "intel", "sparc", "powerpc"]: + raise DeviceError(vendor + " not supported yet") + assert "cpu_" + model in model_table[vendor], \ + model + " not supported yet" + super(CPU, self).__init__("CPU", vendor, model, **kwargs) + def __repr__(self): + return "cpu-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class FPGA(Device): + """fpga device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["xilinx", "intel"]: + raise DeviceError(vendor + " not supported yet") + assert "fpga_" + model in model_table[vendor], \ + model + " not supported yet" + super(FPGA, self).__init__("FPGA", vendor, model, **kwargs) + def __repr__(self): + return "fpga-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class GPU(Device): + """gpu device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["nvidia", "amd"]: + raise DeviceError(vendor + " not supported yet") + assert "gpu_" + model in model_table[vendor], \ + model + " not supported yet" + super(GPU, self).__init__("GPU", vendor, model, **kwargs) + def __repr__(self): + return "gpu-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class PIM(Device): + """pim device with different models""" + def __init__(self, vendor, model, **kwargs): + if model not in ["ppac"]: + raise DeviceError(model + " not supported yet") + super(PIM, self).__init__("PIM", vendor, model, **kwargs) + def __repr__(self): + return "pim-" + str(self.model) + +dev_table = { + "aws_f1" : [CPU("intel", "e5"), FPGA("xilinx", "xcvu19p")], + "zc706" : [CPU("arm", "a9"), FPGA("xilinx", "xc7z045")], + "rocc-ppac" :
[CPU("riscv", "riscv"), PIM("ppac", "ppac")], + "stratix10_sx": [CPU("arm", "a53"), FPGA("intel", "stratix10_gx")] +} + +class env(type): + """The platform class for compute environment setups + + serves as meta-class for attr getting + default platform: aws_f1, zynq, ppac + + Parameters + ---------- + host: str + Device of device to place data + model: str + Model of device to place date + """ + def __getattr__(cls, key): + if key == "aws_f1": + devs = dev_table[key] + host = devs[0].set_lang("opencl") + xcel = devs[1].set_lang("hlsc") + elif key == "zc706": + devs = dev_table[key] + host = devs[0].set_lang("hlsc") + xcel = devs[1].set_lang("hlsc") + elif key == "llvm": + devs = None + host = None + xcel = None + elif key == "ppac": + devs = dev_table["rocc-ppac"] + host = devs[0].set_lang("c") + xcel = None + else: # unsupported device + raise DeviceError("not supported") + tool = tool_table[key] + return cls(key, devs, host, xcel, tool) + +class platform(with_metaclass(env, object)): + def __init__(self, name, devs, host, xcel, tool): + self.name = name + self.devs = devs + self.host = host + self.xcel = xcel + self.tool = tool + + if isinstance(host, CPU): + self.cpu = host + if isinstance(xcel, FPGA): + self.fpga = xcel + elif isinstance(xcel, PIM) and \ + xcel.model == "ppac": + self.ppac = xcel + + def __getattr__(self, key): + """ return tool options """ + return self.tool.__getattr__(key) + + def __call__(self, tooling=None): + if tooling: # check and update + assert isinstance(tooling, tool) + self.tool = tooling + return self + + def __str__(self): + return str(self.name) + "(" + \ + str(self.host) + " : " + \ + str(self.xcel) + ")" + + def __repr__(self): + return str(self.name) + "(" + \ + str(self.host) + " : " + \ + str(self.xcel) + ")" + +def device_to_str(dtype): + """Convert a device type to string format. + + Parameters + ---------- + dtype : Device or str + The device type to be converted + + Returns + ------- + str + The converted device type in string format. + """ + if isinstance(dtype, Device): + if isinstance(dtype, CPU): + return "cpu_" + str(dtype.model) + elif isinstance(dtype, FPGA): + return "fpga_" + str(dtype.model) + else: + if not isinstance(dtype, str): + raise DeviceError("Unsupported device type format") + return dtype + +def device_to_hcl(dtype): + """Convert a device type to Heterocl type. + + Parameters + ---------- + dtype : Device or str + The device type to be converted + + Returns + ------- + Device + """ + if isinstance(dtype, Device): + return dtype + elif isinstance(dtype, str): + device, model = dtype.split("_") + if device == "cpu": + return CPU(model) + elif device == "gpu": + return GPU(model) + elif device == "fpga": + return FPGA(model) + else: + raise DeviceError("Unrecognized device type") + else: + raise DeviceError("Unrecognized device type format") + +def get_model(dtype): + """Get the model of a given device type. 
diff --git a/python/heterocl/dsl.py b/python/heterocl/dsl.py index 6d42031f1..b226cb0ab 100644 --- a/python/heterocl/dsl.py +++ b/python/heterocl/dsl.py @@ -405,6 +405,7 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n raise APIError("The number of data types does not match the of arguments") for (name_, dtype_) in zip(new_names, dtypes): dtypes.append(util.get_dtype(dtype_, name_)) + dtypes = dtypes[int(len(dtypes)/2):] else: dtype = util.get_dtype(dtypes) dtypes = [] @@ -414,15 +415,20 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n # prepare inputs for IR generation inputs = [] inputs_tvm = [] + arg_shapes, arg_dtypes = [], [] for shape, name_, dtype in zip(shapes, new_names, dtypes): if shape == (): var_ = placeholder((), name_, dtype) inputs.append(var_) inputs_tvm.append(var_.var) - else: + arg_shapes.append([1]) + arg_dtypes.append(dtype) + else: # tensor inputs (new bufs) placeholder_ = placeholder(shape, name_, dtype) inputs.append(placeholder_) inputs_tvm.append(placeholder_.buf.data) + arg_shapes.append(list(shape)) + arg_dtypes.append(dtype) s.ret_dtype = ret_dtype fmodule(*inputs) @@ -435,7 +441,8 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n ret_void = _make.UIntImm("uint1", 0) if s.has_return else _make.UIntImm("uint1", 1) body = s.pop_stmt() s.stmt_stack.append([]) - s.emit(_make.KernelDef(inputs_tvm, body, ret_void, ret_dtype, name)) + s.emit(_make.KernelDef(inputs_tvm, arg_shapes, arg_dtypes, + body, ret_void, ret_dtype, name, [])) for name_, i in zip(names, inputs): s.var_dict[name_] = i s.input_stages.clear() diff --git a/python/heterocl/mutator.py b/python/heterocl/mutator.py index 88ca42788..7d49f1e76 100644 --- a/python/heterocl/mutator.py +++ b/python/heterocl/mutator.py @@ -77,6 +77,8 @@ def mutate(self, node): return self.mutate_SetSlice(node) elif isinstance(node, _expr.KernelExpr): return self.mutate_KernelExpr(node) + elif isinstance(node, _expr.StreamExpr): + return self.mutate_StreamExpr(node) else: return node elif isinstance(node, _stmt.Stmt): @@ -112,6 +114,8 @@ def mutate(self, node): return self.mutate_Break(node) elif isinstance(node, _stmt.While): return self.mutate_While(node) + elif isinstance(node, _stmt.StreamStmt): + return self.mutate_StreamStmt(node) else: return node elif isinstance(node, tuple): @@ -248,6 +252,10 @@ def mutate_KernelExpr(self, node): args = self.mutate(node.args) return _make.KernelExpr(node.dtype, args, node.name) + def mutate_StreamExpr(self, node): + args = self.mutate(node.args) + return _make.StreamExpr(node.dtype, args, node.name) + # statements def mutate_LetStmt(self, node): var = self.mutate(node.var) @@ -320,6 +328,10 @@ def mutate_KernelStmt(self, node): args = self.mutate(node.args) return _make.KernelStmt(args, node.name) + def mutate_StreamStmt(self, node): + args = self.mutate(node.args) + return _make.StreamStmt(node.dtype, args, node.name) + def mutate_Return(self, node): value = self.mutate(node.value) return _make.Return(value)
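The dsl.py change threads per-argument shapes and dtypes into the generated KernelDef. On the user side this corresponds to the usual hcl.def_ declaration; a minimal sketch (shapes and names are illustrative, and the module body is just an example):

```python
import heterocl as hcl

hcl.init()
A = hcl.placeholder((10,), "A")
B = hcl.placeholder((10,), "B")

def kernel(A, B):
    # Each declared shape below yields one (arg_shape, arg_dtype)
    # pair carried by the KernelDef node built in the decorator.
    @hcl.def_([A.shape, B.shape], name="vadd")
    def vadd(a, b):
        with hcl.for_(0, 10) as i:
            b[i] = a[i] + 1

    vadd(A, B)

s = hcl.create_schedule([A, B], kernel)
```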
diff --git a/python/heterocl/schedule.py b/python/heterocl/schedule.py index abd74acdc..03af1cf3e 100644 --- a/python/heterocl/schedule.py +++ b/python/heterocl/schedule.py @@ -5,6 +5,7 @@ from ordered_set import OrderedSet from .tvm import make as _make from .tvm import stmt as _stmt +from .tvm import expr as _expr from .tvm import api as tvm_api from .tvm import _api_internal from .tvm._api_internal import _ExternOp @@ -134,6 +135,42 @@ def reuse_at(self, target, parent, axis, name=None): name = target.name + ".reuse" return self.sch.reuse_at(target, parent, axis, name) + def to(self, tensors, dst, src=None, + stream_type=_expr.StreamExpr.Channel, depth=10, name=None): + """Stream a list of Tensors to dst devices + + Parameters + ---------- + tensors : list of Tensor + The tensors to be moved + + dst : device or module + The destination device or module + + stream_type : {Channel, Pipe, FIFO}, optional + The stream type + """ + if stream_type > 2: + raise APIError("Invalid channel type") + rets = [] + if not isinstance(tensors, list): + tensors = [tensors] + for tensor in tensors: + try: + target = tensor.tensor + except (AttributeError, ValueError): + try: + target = tensor._op + except AttributeError: + target = tensor + if name is None: + name = target.name + ".stream" + ret = self.sch.to(target, dst, src, + stream_type, depth, name) + name = None + rets.append(ret) + return rets + def partition(self, target, partition_type=_stmt.Partition.Complete, dim=0, factor=0): """Partition a Tensor into smaller Tensors or even registers @@ -302,7 +339,7 @@ def __exit__(self, ptype, value, trace): # create the output operation input_ops = [i._op for i in self.input_stages] input_bufs = [i._buf for i in self.input_stages] - output_bufs = [self._buf] + output_bufs = [self._buf] body = self.pop_stmt() Stage._current.pop() op = _ExternOp(self.name, "", self.axis_list, input_ops, @@ -331,8 +368,7 @@ def __exit__(self, ptype, value, trace): superstage.var_dict[self.name] = self # update prefix self.name_with_prefix = superstage.name_with_prefix + "." + self.name - # Otherwise update the list of stages globally - else: + else: # otherwise update the list of stages globally Schedule.stage_ops.append(self) Schedule.last_stages.add(self) Schedule.last_stages -= self.input_stages
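A sketch of the new Schedule.to API above, in the style of the samples later in this patch (tensor and platform names are illustrative):

```python
import heterocl as hcl
from heterocl.tvm import expr as _expr

hcl.init()
target = hcl.platform.aws_f1
A = hcl.placeholder((32, 32), "A")

def kernel(A):
    return hcl.compute(A.shape, lambda x, y: A[x, y] + 1, "B")

s = hcl.create_schedule([A], kernel)
# Move the input to the accelerator and the result back to the host;
# stream_type is one of StreamExpr.Channel / Pipe / FIFO.
s.to(A, target.xcel, stream_type=_expr.StreamExpr.Channel, depth=10)
s.to(kernel.B, target.host)
```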
"disable_register_merging_across_hierarchies": ['On', 'Off', 'Auto'], + "mux_restructure" : ['On', 'Off', 'Auto'], + "optimization_technique" : ['Area', 'Speed', 'Balanced'], + "synthesis_effort" : ['Auto', 'Fast'], + "synth_timing_driven_synthesis" : ['On', 'Off'], + "fitter_aggressive_routability_optimization" : ['Always', 'Automatically', 'Never'], + "fitter_effort" : ['Standard Fit', 'Auto Fit'], + "remove_duplicate_registers" : ['On', 'Off'], + "physical_synthesis" : ['On', 'Off'], + "adv_netlist_opt_synth_wysiwyg_remap" : ['On', 'Off'], + "allow_any_ram_size_for_recognition" : ['On', 'Off'], + "allow_any_rom_size_for_recognition" : ['On', 'Off'], + "allow_any_shift_register_size_for_recognition" : ['On', 'Off'], + "allow_power_up_dont_care" : ['On', 'Off'], + "allow_shift_register_merging_across_hierarchies" : ["Always", "Auto", "Off"], + "allow_synch_ctrl_usage" : ['On', 'Off'], + "auto_carry_chains" : ['On', 'Off'], + "auto_clock_enable_recognition" : ['On', 'Off'], + "auto_dsp_recognition" : ['On', 'Off'], + "auto_enable_smart_compile" : ['On', 'Off'], + "auto_open_drain_pins" : ['On', 'Off'], + "auto_ram_recognition" : ['On', 'Off'], + "auto_resource_sharing" : ['On', 'Off'], + "auto_rom_recognition" : ['On', 'Off'], + "auto_shift_register_recognition" : ["Always", "Auto", "Off"], + "disable_register_merging_across_hierarchies" : ["Auto", "On", "Off"], + "enable_state_machine_inference" : ['On', 'Off'], + "force_synch_clear" : ['On', 'Off'], + "ignore_carry_buffers" : ['On', 'Off'], + "ignore_cascade_buffers" : ['On', 'Off'], + "ignore_max_fanout_assignments" : ['On', 'Off'], + "infer_rams_from_raw_logic" : ['On', 'Off'], + "mux_restructure" : ["Auto", "On", "Off"], + "optimization_technique" : ["Area", "Balanced", "Speed"], + "optimize_power_during_synthesis" : ["Extra effort", "Normal compilation", "Off"], + "remove_duplicate_registers" : ['On', 'Off'], + "shift_register_recognition_aclr_signal" : ['On', 'Off'], + "state_machine_processing" : + ["Auto", "Gray", "Johnson, Minimal Bits", "One-Hot", "Sequential", "User-Encoded"], + "strict_ram_recognition" : ['On', 'Off'], + "synthesis_effort" : ["Auto", "Fast"], + "synthesis_keep_synch_clear_preset_behavior_in_unmapper" : ['On', 'Off'], + "synth_resource_aware_inference_for_block_ram" : ['On', 'Off'], + "synth_timing_driven_synthesis" : ['On', 'Off'], + "alm_register_packing_effort" : ["High", "Low", "Medium"], + "auto_delay_chains" : ['On', 'Off'], + "auto_delay_chains_for_high_fanout_input_pins" : ["On", "Off"], + "eco_optimize_timing" : ["On", "Off"], + "final_placement_optimization" : ["Always", "Automatically", "Never"], + "fitter_aggressive_routability_optimization" : ["Always", "Automatically", "Never"], + "fitter_effort" : ["Standard Fit", "Auto Fit"], + "optimize_for_metastability" : ["On", "Off"], + "optimize_hold_timing" : ["All Paths", "IO Paths and Minimum TPD Paths", "Off"], + "optimize_ioc_register_placement_for_timing" : + ["Normal", "Off", "Pack All IO Registers"], + "optimize_multi_corner_timing" : ['On', 'Off'], + "optimize_power_during_fitting" : ["Extra effort", "Normal compilation", "Off"], + "physical_synthesis" : ['On', 'Off'], + "placement_effort_multiplier" : [0.2, 0.5, 1.0, 2.0, 3.0, 4.0], + "programmable_power_technology_setting" : ["Automatic", "Force All Tiles with Failing Timing Paths to High Speed", "Force All Used Tiles to High Speed", "Minimize Power Only"], + "qii_auto_packed_registers" : ["Auto", "Minimize Area", "Minimize Area with Chains", "Normal", "Off", "Sparse", "Sparse Auto"], + 
"router_clocking_topology_analysis" : ['On', 'Off'], + "router_lcell_insertion_and_logic_duplication" : ["Auto", "On", "Off"], + "router_register_duplication" : ["Auto", "On", "Off"], + "router_timing_optimization_level" : ["MINIMUM", "Normal", "MAXIMUM"], + "seed" : (1, 5), + "tdc_aggressive_hold_closure_effort" : ['On', 'Off'], + "allow_register_retiming" : ['On', 'Off']}), + + "aocl" : ("emu", {"version" : "17.0", + "clokc" : 1.5, + }) +} + diff --git a/python/heterocl/tvm/build_module.py b/python/heterocl/tvm/build_module.py index c8dcc91f2..47b4e31ae 100755 --- a/python/heterocl/tvm/build_module.py +++ b/python/heterocl/tvm/build_module.py @@ -6,8 +6,10 @@ from __future__ import absolute_import as _abs import warnings import types +import os from ._ffi.node import NodeBase, register_node +from ._ffi.function import register_func from ._ffi.base import _RUNTIME_ONLY from . import api from . import tensor @@ -21,6 +23,48 @@ from . import ndarray from . import target as _target from . import make +from ..devices import platform + +# test build sim +@register_func +def tvm_callback_syn_postproc(code): + return "test" + +@register_func +def get_util_path(platform): + if platform == "aws_f1": + return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/sdaccel/" + elif platform == "rocket": + ppac = "/work/zhang-x1/users/sx233/heterocl/hlib/rocc-ppac" + emulator = os.path.join(ppac, "rocket/emulator/emulator-freechips." + \ + "rocketchip.system-RoccExampleConfig-debug") + # build emulator if not exist + if not os.path.isfile(emulator): + cmd = "cd " + ppac + ";" + cmd += "cp src/Ppac.v rocket/src/main/resources/vsrc;" + \ + "cp src/PpacRoCC.scala rocket/src/main/scala/tile;" + \ + "cd rocket && git apply ../src/rocc-ppac.patch;" + \ + "cd emulator && make CONFIG=RoccExampleConfig debug" + # create subprocess to check + subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait() + + # re-build proxy kernel + if not os.path.isfile(ppac + "/rocket/riscv-pk/build/pk"): + cmd = "cd " + ppac + "/rocket/riscv-pk;" + cmd += "git apply ../../tests/patches/riscv-pk.patch;" + cmd += "mkdir build; cd build;" + cmd += " ../configure --prefix=$RISCV/riscv64-unknown-elf --host=riscv64-unknown-elf;" + cmd += "make -j8; make install" + subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait() + # return util folder needed to compile generated test files + return "/work/zhang-x1/users/sx233/heterocl/rocc-ppac/tests" + + # copy tcl and testbench + elif platform == "vivado_hls": + return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/vivado" + + else: # unrecognized platform + assert False, "unsupported platform" class DumpIR(object): """ @@ -340,6 +384,7 @@ def lower(sch, stmt = f(stmt) # Phase 1 stmt = ir_pass.StorageFlatten(stmt, binds, 64) + stmt = ir_pass.InferStream(stmt, 32) #stmt = ir_pass.CanonicalSimplify(stmt) #TODO: SOLVE THIS!! stmt = ir_pass.LiftAllocateAttrs(stmt) if cfg.generate_reuse_buffer: @@ -378,7 +423,7 @@ def lower(sch, else: return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) -def build_fpga_kernel(sch, args, target_name, name="default_function"): +def build_fpga_kernel(sch, args, target, name="default_function"): """Build an FPGA kernel. 
Parameters @@ -407,20 +452,66 @@ def build_fpga_kernel(sch, args, target_name, name="default_function"): if args is None: raise ValueError("args must be given for build from schedule") - if target_name == "merlinc": + # generate host (device) code / function + if target == "merlinc": BuildConfig.current = build_config(generate_reuse_buffer=False) else: BuildConfig.current = build_config() + flist = lower(sch, args, kernel_only=True, name=name) if isinstance(flist, container.LoweredFunc): flist = [flist] - fdevice = [ir_pass.LowerIntrin(x, target_name) for x in flist] + fdevice = [ir_pass.LowerIntrin(x, str(target)) for x in flist] + + if isinstance(target, str): # string type + builder = getattr(codegen, "build_{0}".format(target)) + ret = builder(fdevice) + if isinstance(ret, str): + decl = ret[:ret.find("{device}")] + start = ret.find("{host}") + end = ret.rfind("{host}") + ret = decl + "\n" + ret[start+6:end] + ret = ret.strip("\n").lstrip("\n") + "\n\n" + return ret + + try: # generate and split code + host, xcel = None, None + if target.tool.name == "sdaccel": + host = target.host.lang.replace("opencl", "aocl") + xcel = target.xcel.lang.replace("hlsc", "vhls") + elif target.tool.name == "vivado_hls": + host = target.host.lang.replace("hlsc", "vhls") + xcel = target.xcel.lang.replace("hlsc", "vhls") + elif target.tool.name == "rocket": + host = target.host.lang.replace("c", "rv64_ppac") + + # return simulation built function + mode = str(target.tool.mode) + if "emu" in mode or "sim" in mode: + builder = getattr(codegen, "build_{0}".format("sim")) + keys = [k for k in target.tool.options.keys()] + vals = [v for v in target.tool.options.values()] + keys.insert(0, "name") + vals.insert(0, target.tool.name) + return builder(fdevice, keys, vals) + elif mode != "debug": # impl mode + pass + else: # return source code only + host_code, xcel_code = "", "" + if host: # src mode generate host code + builder = getattr(codegen, "build_{0}".format(host)) + host_code = builder(fdevice) + findex, rindex = host_code.find("{host}"), host_code.rfind("{host}") + host_code = host_code[findex + 6 : rindex] + if xcel: # src mode generate xcel code + builder = getattr(codegen, "build_{0}".format(xcel)) + xcel_code = builder(fdevice) + findex, rindex = xcel_code.find("{device}"), xcel_code.rfind("{device}") + xcel_code = xcel_code[findex + 8 : rindex] + return xcel_code + host_code - try: - builder = getattr(codegen, "build_{0}".format(target_name)) - return builder(fdevice) except AttributeError: - raise AttributeError("Cannot find the target builder %s" % target_name) + raise AttributeError("Cannot find the target builder %s" % target) return None def build(sch, @@ -468,11 +559,13 @@ def build(sch, ---- See the note on :any:`tvm.target` on target string format. 
""" - target = _target.current_target() if target is None else target - target = _target.create(target) if target else _target.create("llvm") - - if "fpga" in target.keys: - return build_fpga_kernel(sch, args, target.target_name, name=name) + if isinstance(target, platform): + return build_fpga_kernel(sch, args, target, name=name) + else: # default string type target + target = _target.current_target() if target is None else target + target = _target.create(target) if target else _target.create("llvm") + if "fpga" in target.keys: + return build_fpga_kernel(sch, args, target.target_name, name=name) BuildConfig.current = build_config() if isinstance(sch, schedule._Schedule): diff --git a/python/heterocl/tvm/expr.py b/python/heterocl/tvm/expr.py index d71307e8f..d1ea4ae75 100644 --- a/python/heterocl/tvm/expr.py +++ b/python/heterocl/tvm/expr.py @@ -382,3 +382,9 @@ class Quantize(Expr): @register_node class KernelExpr(Expr): pass + +@register_node +class StreamExpr(Expr): + Channel = 0 + Pipe = 1 + FIFO = 2 diff --git a/python/heterocl/tvm/schedule.py b/python/heterocl/tvm/schedule.py index 21905b443..36ead39de 100644 --- a/python/heterocl/tvm/schedule.py +++ b/python/heterocl/tvm/schedule.py @@ -3,6 +3,7 @@ from ._ffi.base import string_types from ._ffi.node import NodeBase, register_node from ._ffi.function import _init_api +from ..devices import Device from . import _api_internal from . import tensor as _tensor from . import expr as _expr @@ -332,6 +333,53 @@ def reuse_at(self, target, parent, axis, name): def partition(self, target, partition_type, dim, factor): return _api_internal._SchedulePartition(self, target, dim, factor, partition_type) + def to(self, tensor, dst, src, + types=_expr.StreamExpr.Channel, + depth=1, name=None): + """ Stream data to devices or on-chip module + + Parameters + ---------- + tensor : list of Tensors + Tensor to be streamed. + dst : hcl device or dst stage + The device or module for streaming + type : channel type + The streaming type (e.g. fifo or pipe) + + Returns + ------- + outer : IterVar + The outer variable of iteration. + """ + # create producer and consumer for stream + if isinstance(dst, Device): + dst = 1 if 'fpga' in str(dst) else 0 + return _api_internal._ScheduleMove(self, tensor, dst, + types, depth, name) + else: # connect kernel + assert isinstance(dst, _Stage), "dst not a stage " + if src: # remove buffer between kernels + assert isinstance(src, _Stage), \ + "destination should be a stage but " + str(type(src)) + try: + self.remove_args.append(tensor.op.output(0)) + except: + self.remove_args = [] + self.remove_args.append(tensor.op.output(0)) + _api_internal._ScheduleStream(self, tensor, dst, src, + types, depth, name) + else: # from externop buffer to kernel + shape = [_.value for _ in tensor.shape] + index, match = 0, [] + for s in dst.op.body.api_args: + arg_shape = [_.value for _ in s] + if shape == arg_shape: match.append(index) + index = index + 1 + assert len(match) > 0, "wrong kernel or tensor (shape not matching)" + _api_internal._ScheduleMoveToStage(self, tensor, dst, match[0], + types, depth, name) + @register_node("Stage") class _Stage(NodeBase): """A Stage represents schedule for one operation. @@ -654,7 +702,7 @@ def pragma(self, var, pragma_type): - **parallel_stride_pattern** Hint parallel loop to execute in strided pattern. 
- :code:`for (int i = task_id; i < end; i += num_task)` + :code:`for (int i = task_id; i < end; i += num_task)` """ _api_internal._StagePragma(self, var, pragma_type) diff --git a/python/heterocl/tvm/stmt.py b/python/heterocl/tvm/stmt.py index 4db84970f..d5c2d0a18 100644 --- a/python/heterocl/tvm/stmt.py +++ b/python/heterocl/tvm/stmt.py @@ -112,3 +112,7 @@ class Partition(Stmt): @register_node class Stencil(Stmt): pass + +@register_node +class StreamStmt(Stmt): + pass diff --git a/python/heterocl/tvm/target.py b/python/heterocl/tvm/target.py index 12235d95d..5687953ca 100644 --- a/python/heterocl/tvm/target.py +++ b/python/heterocl/tvm/target.py @@ -1,43 +1,3 @@ -"""Target management API of TVM. - -TVM's target string is in fomat `` [-option=value]...``. - -Note ----- -The list of options include: - -- **-device=** - - The device name. - -- **-mtriple=** or **-target** - - Specify the target triple, which is useful for cross - compilation. - -- **-mcpu=** - - Specify a specific chip in the current architecture to - generate code for. By default this is infered from the - target triple and autodetected to the current architecture. - -- **-mattr=a1,+a2,-a3,...** - - Override or control specific attributes of the target, - such as whether SIMD operations are enabled or not. The - default set of attributes is set by the current CPU. - -- **-system-lib** - - Build TVM system library module. System lib is a global module that contains - self registered functions in program startup. User can get the module using - :any:`tvm.module.system_lib`. - It is useful in environments where dynamic loading api like dlopen is banned. - The system lib will be available as long as the result code is linked by the program. - -We can use :any:`tvm.target.create` to create a tvm.target.Target from the target string. -We can also use other specific function in this module to create specific targets. -""" from __future__ import absolute_import import warnings @@ -50,7 +10,8 @@ if _LIB_NAME != "libhcl_runtime.so": raise err_msg -FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim'] +FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim', + 'opencl', 'sdaccel', 'sdaccel_csim', 'aocl', 'aocl_csim', 'rv64_ppac'] def _merge_opts(opts, new_opts): """Helper function to merge options""" @@ -68,7 +29,7 @@ class Target(object): Parameters ---------- - target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} + target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev", "rv64_ppac"} The major target name. {"merlinc", "soda", "soda_xhls", "vhls"} diff --git a/python/heterocl/util.py b/python/heterocl/util.py index 996201105..704b774cb 100644 --- a/python/heterocl/util.py +++ b/python/heterocl/util.py @@ -4,6 +4,7 @@ from .tvm.expr import Var, Call from .tvm.api import _IterVar, decl_buffer from . import types +from . import devices from . 
import config from .scheme import Scheme from .debug import DTypeError diff --git a/samples/conv/conv.py b/samples/conv/conv.py new file mode 100644 index 000000000..ca41a50a1 --- /dev/null +++ b/samples/conv/conv.py @@ -0,0 +1,70 @@ +import heterocl as hcl +import hlib +import numpy as np +from PIL import Image +from urllib.request import urlopen + +batch_size = 1 +hcl.init(hcl.UInt(32)) +dtype = hcl.UInt(32) +image_size = () +kernel_size = 3 + +# setup target using vivado +tool = hcl.tool.vivado("csim") +target = hcl.platform.zc706 + +def conv(): + image = hcl.placeholder((batch_size, 1, 256, 256), "input_image") + k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1") + k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2") + + def kernel(input_image, kernel_1, kernel_2): + + # return tensor required (cannot do def_()) + interm_shape = (1,1,254,254) + output_shape = (1,1,252,252) + + # make compute wrapped in hcl def + module1 = hcl.def_([input_image.shape, kernel_1.shape, interm_shape], name="conv1")(hlib.nn.conv2d_nchw_imp) + module2 = hcl.def_([interm_shape, kernel_2.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp) + conv1 = hcl.compute(interm_shape, lambda *args: 0) + conv2 = hcl.compute(output_shape, lambda *args: 0) + module1(input_image, kernel_1, conv1) + module2(conv1, kernel_2, conv2) + + # derivative module for normalization + return hcl.compute(output_shape, lambda *args: conv2[args], name="derv") + + s = hcl.create_schedule([image, k1, k2], kernel) + + # data moved to local + i0, k10, k20 = s.to([image, k1, k2], target.fpga) + # s.to([i0, k10], s[kernel.conv1]) + # s.to([k20], s[kernel.conv2]) + s.to(kernel.derv, target.cpu) + + # create stream channel between modules + print(type(target.fpga), hcl.lower(s)) + return hcl.build(s, target) + +# Load sample data +img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif')) +kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) +kernel_y = np.flip(kernel_x.T.T, axis=0) +img = np.array(img) + +img = img[np.newaxis, ...] +img = img[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] 
+ +hcl_input = hcl.asarray(img, dtype) +kernel_x = hcl.asarray(kernel_x, dtype) +kernel_y = hcl.asarray(kernel_y, dtype) +hcl_output = hcl.asarray(np.zeros((1,1,252,252)), dtype) + +f = conv() +f(hcl_input, kernel_x, kernel_y, hcl_output) diff --git a/samples/digitrec/digitrec_stream.py b/samples/digitrec/digitrec_stream.py new file mode 100644 index 000000000..4c0da096a --- /dev/null +++ b/samples/digitrec/digitrec_stream.py @@ -0,0 +1,150 @@ +import heterocl as hcl +import time +import numpy as np +import math +from digitrec_data import read_digitrec_data + +N = 8 * 8 +max_bit = int(math.ceil(math.log(N, 2))) +test_size = (180, ) +data_size = (10, 1800) + +dtype_image = hcl.UInt(N) +dtype_knnmat = hcl.UInt(max_bit) + +setting = { + "version" : "2019.1", + "clock" : "10" +} +tool = hcl.tool.vivado("csim", setting) +target = hcl.platform.aws_f1 + +def knn(test_images, train_images): + + def popcount(num): + out = hcl.local(0, "out") + with hcl.for_(0, train_images.type.bits) as i: + out.v += num[i] + return out.v + + def update_knn(dist, knn_mat, i, j): + max_id = hcl.local(0, "max_id") + with hcl.for_(0, 3) as k: + with hcl.if_(knn_mat[i][k] > knn_mat[i][max_id.v]): + max_id.v = k + with hcl.if_(dist[i][j] < knn_mat[i][max_id.v]): + knn_mat[i][max_id.v] = dist[i][j] + + def sort_knn(knn_mat, i, j): + val = hcl.local(0, "val") + with hcl.if_( j == 1 ): + with hcl.if_( knn_mat[i][1] > knn_mat[i][2] ): + val.v = knn_mat[i][1] + knn_mat[i][1] = knn_mat[i][2] + knn_mat[i][2] = val.v + with hcl.else_(): + with hcl.if_( knn_mat[i][0] > knn_mat[i][1] ): + val.v = knn_mat[i][0] + knn_mat[i][0] = knn_mat[i][1] + knn_mat[i][1] = val.v + + def knn_vote(knn_mat, j): + id0 = hcl.local(0, "id0") + id1 = hcl.local(0, "id1") + id2 = hcl.local(0, "id2") + count = hcl.local(0, "count") + with hcl.for_(0, 10) as n: + with hcl.if_(knn_mat[n][0] < knn_mat[id0.v][0]): + id0.v = n + with hcl.for_(0, 10) as m: + with hcl.if_(knn_mat[m][0] < knn_mat[id1.v][0]): + id1.v = m + with hcl.for_(0, 10) as k: + with hcl.if_(knn_mat[k][0] < knn_mat[id2.v][0]): + id2.v = k + with hcl.if_(j == id0.v): + count.v += 1 + with hcl.elif_(j == id1.v): + count.v += 1 + with hcl.elif_(j == id2.v): + count.v += 1 + with hcl.else_(): + count.v += 0 + return count.v + + # support hcl.compute in hcl def + @hcl.def_([(), data_size, (10,3)]) + def knn_dist(test_image, train_images, pred_matrix): + pass + + with hcl.for_(0, 180) as index: + test_image = test_images[index] + diff = hcl.compute(train_images.shape, + lambda x, y: train_images[x][y] ^ test_image, + "diff") + dist = hcl.compute(diff.shape, + lambda x, y: popcount(diff[x][y]), + "dist") + knn_mat = hcl.compute((10, 3), lambda x, y: 50, "knn_mat") + hcl.mutate(dist.shape, + lambda x, y: update_knn(dist, knn_mat, x, y), + "knn_update") + hcl.mutate((10, 3), lambda x, y: sort_knn(knn_mat, x, y), "sort") + knn_new = hcl.compute(knn_mat.shape, + lambda x, y: knn_mat[x][y], "copy") + knn_pred = hcl.compute((10,), + lambda x: knn_vote(knn_mat, x), "vote") + return knn_pred + +test_image = hcl.placeholder(test_size, "test_image", dtype_image) +train_images = hcl.placeholder(data_size, "train_images", dtype_image) + +scheme = hcl.create_scheme([test_image, train_images], knn) +scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat], dtype_knnmat) + +s = hcl.create_schedule_from_scheme(scheme) + +diff = knn.diff +dist = knn.dist +vote = knn.copy +knn_update = knn.knn_update + +s.to([test_image, train_images], target.xcel) +s.to(vote, target.host) + +# merge loop nests +s[diff].compute_at(s[dist], dist.axis[1]) +s[dist].compute_at(s[knn_update], knn_update.axis[1]) + +# reorder loop to expose more parallelism +s[knn_update].reorder(knn_update.axis[1], knn_update.axis[0]) + +# parallel outer loop and pipeline inner loop +s[knn_update].parallel(knn_update.axis[1]) +s[knn_update].pipeline(knn_update.axis[0]) + +# at the end, we build the whole offloaded function. +# print(hcl.lower(s)) +f = hcl.build(s, target) + +train_images, _, test_images, test_labels = read_digitrec_data() +total = len(test_images) +total_time = 0 + +# read returned prediction from streaming pipe +hcl_train_images = hcl.asarray(train_images, dtype_image) +hcl_knn_pred = hcl.asarray(np.zeros((total, 10)), dtype_knnmat) + +start = time.time() +f(test_images, hcl_train_images, hcl_knn_pred) +total_time = total_time + (time.time() - start) + +knn_result = hcl_knn_pred.asnumpy() + +correct = 0.0 +for i in range(total): + if np.argmax(knn_result[i]) == test_labels[i]: + correct += 1 + +print("Average kernel time (s): {:.2f}".format(total_time/total)) +print("Accuracy (%): {:.2f}".format(100*correct/total))
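For cross-checking the streamed results, the distance stage of the kernel above (XOR plus popcount, keeping the three nearest neighbors per digit) has a direct numpy analogue; a sketch (the helper name is hypothetical):

```python
import numpy as np

def knn_reference(test_image, train_images):
    # XOR each candidate against the test image, like the `diff` stage.
    diff = train_images ^ test_image
    # Popcount of each N-bit value, like the `popcount` helper.
    dist = np.vectorize(lambda v: bin(int(v)).count("1"))(diff)
    # Keep the three smallest distances per digit, like `knn_mat`.
    return np.sort(dist, axis=1)[:, :3]
```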
diff --git a/samples/digitrec/kernel.cpp b/samples/digitrec/kernel.cpp deleted file mode 100644 index 21b550c8b..000000000 --- a/samples/digitrec/kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include -#pragma ACCEL kernel -void default_function(unsigned long test_image, unsigned long* train_images, unsigned char* knn_mat) { - for (int x = 0; x < 10; ++x) { - for (int y = 0; y < 3; ++y) { - knn_mat[(y + (x * 3))] = (unsigned char)50; - } - } - unsigned long knn_update; -#pragma ACCEL parallel - for (int y1 = 0; y1 < 1800; ++y1) { -#pragma ACCEL pipeline - for (int x1 = 0; x1 < 10; ++x1) { - unsigned char dist; - unsigned long diff; - diff = (train_images[(y1 + (x1 * 1800))] ^ test_image); - unsigned char out; - out = (unsigned char)0; - for (int i = 0; i < 49; ++i) { - out = ((unsigned char)(((unsigned long)out) + ((unsigned long)((diff & (1L << i)) >> i)))); - } - dist = out; - unsigned long max_id; - max_id = (unsigned long)0; - for (int i1 = 0; i1 < 3; ++i1) { - if (knn_mat[(((long)max_id) + ((long)(x1 * 3)))] < knn_mat[(i1 + (x1 * 3))]) { - max_id = ((unsigned long)i1); - } - } - if (dist < knn_mat[(((long)max_id) + ((long)(x1 * 3)))]) { - knn_mat[(((long)max_id) + ((long)(x1 * 3)))] = dist; - } - } - } -} - diff --git a/samples/gemm/common/common.mk b/samples/gemm/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/gemm/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int
-f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . +${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/gemm/gemm_aocl.cl b/samples/gemm/gemm_aocl.cl new file mode 100644 index 000000000..198757823 --- /dev/null +++ b/samples/gemm/gemm_aocl.cl @@ -0,0 +1,14 @@ +#include "ihc_apint.h" +__kernel void default_function(__global int* restrict placeholder0, __global int* restrict placeholder1, __global int* restrict matrix_3) { + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 10; ++y) { + int sum; + sum = 0; + for (int k = 0; k < 10; ++k) { + sum = ((int)(((int64_t)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((int64_t)sum))); + } + matrix_3[(y + (x * 10))] = sum; + } + } +} + diff --git a/samples/gemm/gemm_main.py b/samples/gemm/gemm_main.py index fb05a094d..4796bf2fb 100644 --- a/samples/gemm/gemm_main.py +++ b/samples/gemm/gemm_main.py @@ -52,5 +52,6 @@ def time_gemm(dtype, m=1024, n=1024, k=1024, target=None): ############################################################################### # Test the algorithm with different data types dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] -for dtype in dtypes: - time_gemm(dtype) + +# for dtype in dtypes: +# time_gemm(hcl.Float(), 10, 10, 10, 'sdaccel') diff --git a/samples/gemm/gemm_runtime.py b/samples/gemm/gemm_runtime.py new file mode 100644 index 000000000..49947fa4c --- /dev/null +++ b/samples/gemm/gemm_runtime.py @@ -0,0 +1,86 @@ +# Yang.Bai +# yb269@cornell.edu + +import heterocl as hcl +import numpy as np + +hcl.init() + +# matrix_size = (16, 16) +# def add_compute(A, B): +# C = hcl.compute(A.shape, lambda x, y: A[x, y] + B[x, y], "C") +# return C + +# def add_compute_2(A, B): +# C = hcl.compute(A.shape, lambda x: A[x] + B[x], "C") +# return C + +# A = hcl.placeholder(matrix_size, "A") +# B = hcl.placeholder(matrix_size, "B") + +# s = hcl.create_schedule([A, B], add_compute) +# # f2 = hcl.build(s, target='sdaccel') +# f2 = hcl.build(s, target='aocl') +# print (f2) + +# hcl_A = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float()) +# hcl_B = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float()) +# hcl_C = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float()) +# hcl_C2 = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float()) +# f3 = hcl.build(s) + +# A = hcl.placeholder((10, ), "A") +# B = hcl.placeholder((10, ), "B") +# s = hcl.create_schedule([A, B], add_compute_2) +# f4 = hcl.build(s, target='sdaccel') +# print (f4) +# print (hcl_A, hcl_B, hcl_C) + +matrix_1_size = (10, 10) +matrix_2_size = (10, 10) +matrix_3_size = (matrix_1_size[0], matrix_2_size[1]) + +def gemm_compute(matrix_1, matrix_2): + m = matrix_1.shape[0]; + k = matrix_1.shape[1]; + n = matrix_2.shape[1]; + r = hcl.reduce_axis(0, k, 'k') + temp = hcl.compute((m, n), + lambda x, y: hcl.sum(matrix_1[x, r] * matrix_2[r, y], 
+ axis = r), name='matrix_3') + return temp + +matrix_1 = hcl.placeholder(matrix_1_size) +matrix_2 = hcl.placeholder(matrix_2_size) + +s = hcl.create_schedule([matrix_1, matrix_2], gemm_compute) +f = hcl.build(s, target='sdaccel_csim') +code = hcl.build(s, target='aocl') +with open('gemm_aocl.cl', 'w') as fin: + fin.write(code) + +code2 = hcl.build(s, target='sdaccel') +with open('gemm_sdaccel.cl', 'w') as fin2: + fin2.write(code2) + + +matrix_1_np = np.random.randint(10, size=matrix_1_size) +matrix_2_np = np.random.randint(10, size=matrix_2_size) +matrix_3_np = np.random.randint(10, size=matrix_3_size) + +hcl_matrix_1 = hcl.asarray(matrix_1_np) +hcl_matrix_2 = hcl.asarray(matrix_2_np) +hcl_matrix_3 = hcl.asarray(matrix_3_np) + +# f(hcl_matrix_1, hcl_matrix_2, hcl_matrix_3) + + + + + +# with open('sdaccel.cl', 'w') as f: +# f.write(code) + + + + diff --git a/samples/gemm/gemm_sdaccel.cl b/samples/gemm/gemm_sdaccel.cl new file mode 100644 index 000000000..f46a88426 --- /dev/null +++ b/samples/gemm/gemm_sdaccel.cl @@ -0,0 +1,13 @@ +__kernel void default_function(__global int* placeholder0, __global int* placeholder1, __global int* matrix_3) { + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 10; ++y) { + __local int sum; + sum = 0; + for (int k = 0; k < 10; ++k) { + sum = ((int)(((long)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((long)sum))); + } + matrix_3[(y + (x * 10))] = sum; + } + } +} + diff --git a/samples/gemm/gemm_sdaccel.py b/samples/gemm/gemm_sdaccel.py new file mode 100644 index 000000000..85c318120 --- /dev/null +++ b/samples/gemm/gemm_sdaccel.py @@ -0,0 +1,8 @@ +import heterocl as hcl +import numpy as np +from gemm_main import * + +#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] +#for dtype in dtypes: +time_gemm(hcl.Int(32), 15, 15, 15, 'sdaccel_sw_emu') +# time_gemm(hcl.Float(), 100, 100, 100, 'sdaccel_sw_emu') diff --git a/samples/gemm/gemm_vhls.py b/samples/gemm/gemm_vhls.py index e27fa155e..8edd84bdd 100644 --- a/samples/gemm/gemm_vhls.py +++ b/samples/gemm/gemm_vhls.py @@ -2,6 +2,6 @@ import numpy as np from gemm_main import * -dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] -for dtype in dtypes: - time_gemm(dtype, 10, 10, 10, 'vhls_csim') +#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] +#for dtype in dtypes: +time_gemm(hcl.Int(32), 10, 10, 10, 'vhls_csim') diff --git a/samples/gemm/host.cpp b/samples/gemm/host.cpp new file mode 100644 index 000000000..914b2aa26 --- /dev/null +++ b/samples/gemm/host.cpp @@ -0,0 +1,118 @@ +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#pragma once + + + + +int main(int argc, char* argv[]) { +#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE) + #define STR_VALUE(arg) #arg + #define GET_STRING(name) STR_VALUE(name) + #define TARGET_DEVICE GET_STRING(SDX_PLATFORM) +#endif + char* xclbinFilename = argv[1]; + + std::vector<int> source_0(6 * 2); + std::vector<int> source_1(2 * 7); + std::vector<int> source_2(6 * 7); + + size_t vector_size_bytes_0 = sizeof(int) * 6 * 2; + size_t vector_size_bytes_1 = sizeof(int) * 2 * 7; + size_t vector_size_bytes_2 = sizeof(int) * 6 * 7; + + int* arg_0 = (int*)shmat(4849666, nullptr, 0); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 2; i1++) { + source_0[i1 + i0*2] = arg_0[i1 +
i0*2]; + } + } + int* arg_1 = (int*)shmat(7667712, nullptr, 0); + for (size_t i0 = 0; i0 < 2; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + source_1[i1 + i0*7] = arg_1[i1 + i0*7]; + } + } + int* arg_2 = (int*)shmat(7667713, nullptr, 0); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + source_2[i1 + i0*7] = arg_2[i1 + i0*7]; + } + } + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + cl::Platform platform = platforms[0]; + + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices); + cl::Device device = devices[0]; + + cl::Context context(device); + cl::CommandQueue q(context, device); + + std::ifstream bin_file(xclbinFilename, std::ifstream::binary); + bin_file.seekg (0, bin_file.end); + unsigned nb = bin_file.tellg(); + bin_file.seekg (0, bin_file.beg); + char *buf = new char [nb]; + bin_file.read(buf, nb); + + cl::Program::Binaries bins; + bins.push_back({buf,nb}); + devices.resize(1); + cl::Program program(context, devices, bins); + + int err1; + cl::Kernel kernel(program, "default_function", &err1); + auto default_function = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(kernel); + + cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0); + cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1); + cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2); + + q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + + default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2); + q.finish(); + + q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 2; i1++) { + arg_0[i1 + i0*2] = source_0[i1 + i0*2]; + } + } + shmdt(arg_0); + for (size_t i0 = 0; i0 < 2; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + arg_1[i1 + i0*7] = source_1[i1 + i0*7]; + } + } + shmdt(arg_1); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + arg_2[i1 + i0*7] = source_2[i1 + i0*7]; + } + } + shmdt(arg_2); +} diff --git a/samples/gemm/sdaccel.mk b/samples/gemm/sdaccel.mk new file mode 100644 index 000000000..9cf0dafd7 --- /dev/null +++ b/samples/gemm/sdaccel.mk @@ -0,0 +1,33 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=.
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/kmeans/kmeans_aocl.cl b/samples/kmeans/kmeans_aocl.cl new file mode 100644 index 000000000..e64b116f4 --- /dev/null +++ b/samples/kmeans/kmeans_aocl.cl @@ -0,0 +1,49 @@ +#include "ihc_apint.h" +__kernel void default_function(__global int* restrict placeholder2, __global int* restrict placeholder3, __global int* restrict compute3) { + for (int x = 0; x < 32; ++x) { + compute3[x] = 0; + } + int main_loop; + for (int _1 = 0; _1 < 10; ++_1) { + #pragma ii 1 + for (int N = 0; N < 32; ++N) { + int local2; + local2 = 100000; + for (int i = 0; i < 6; ++i) { + int local3; + local3 = 0; + for (int i1 = 0; i1 < 3; ++i1) { + local3 = ((int)(((int64_t)local3) + ((int64_t)(((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))]))) * ((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))]))))))); + } + if (local3 < local2) { + local2 = local3; + compute3[N] = i; + } + } + } + int compute4[6]; + for (int x1 = 0; x1 < 6; ++x1) { + compute4[x1] = 0; + } + int compute5[18]; + for (int x2 = 0; x2 < 6; ++x2) { + for (int y = 0; y < 3; ++y) { + compute5[(y + (x2 * 3))] = 0; + } + } + int calc_sum; + #pragma unroll + for (int n = 0; n < 32; ++n) { + compute4[compute3[n]] = (compute4[compute3[n]] + 1); + for (int i2 = 0; i2 < 3; ++i2) { + compute5[(i2 + (compute3[n] * 3))] = ((int)(((int33_t)compute5[(i2 + (compute3[n] * 3))]) + ((int33_t)placeholder2[(i2 + (n * 3))]))); + } + } + int update_mean; + #pragma unroll + for (int k_d_fused = 0; k_d_fused < 18; ++k_d_fused) { + placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 3)]); + } + } +} + diff --git a/samples/kmeans/kmeans_sdaccel.py b/samples/kmeans/kmeans_sdaccel.py new file mode 100644 index 000000000..c204c592e --- /dev/null +++ b/samples/kmeans/kmeans_sdaccel.py @@ -0,0 +1,27 @@ +import numpy as np +import random +import heterocl as hcl +from kmeans_main import top + +K = 16 +N = 320 +dim = 32 + +f1 = top('sdaccel_sw_emu') +#f2 = top() +points_np = np.random.randint(100, size=(N, dim)) +labels_np = np.zeros(N) +means_np = points_np[random.sample(range(N), K),:] + +hcl_points1 = hcl.asarray(points_np) +hcl_means1 = hcl.asarray(means_np) +hcl_labels1 = hcl.asarray(labels_np) + +hcl_points2 = hcl.asarray(points_np) +hcl_means2 = hcl.asarray(means_np) +hcl_labels2 = hcl.asarray(labels_np) + +f1(hcl_points1, hcl_means1, hcl_labels1) +#f2(hcl_points2, hcl_means2, hcl_labels2) + +#assert np.array_equal(hcl_labels1.asnumpy(), hcl_labels2.asnumpy()) diff --git a/samples/kmeans/merlinc_code.cl b/samples/kmeans/merlinc_code.cl new file mode 100644 index 000000000..ea672313d --- /dev/null +++ b/samples/kmeans/merlinc_code.cl @@ -0,0 +1,52 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(int* placeholder2, int* placeholder3, int* compute3) { + for 
(int x = 0; x < 320; ++x) { + compute3[x] = 0; + } + int main_loop; + for (int _1 = 0; _1 < 200; ++_1) { +#pragma ACCEL pipeline + for (int N = 0; N < 320; ++N) { + int local2; + local2 = 100000; + for (int i = 0; i < 16; ++i) { + int local3; + local3 = 0; + for (int i1 = 0; i1 < 32; ++i1) { + local3 = ((int)(((long)local3) + ((long)(((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))]))) * ((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))]))))))); + } + if (local3 < local2) { + local2 = local3; + compute3[N] = i; + } + } + } + int compute4[16]; + for (int x1 = 0; x1 < 16; ++x1) { + compute4[x1] = 0; + } + int compute5[512]; + for (int x2 = 0; x2 < 16; ++x2) { + for (int y = 0; y < 32; ++y) { + compute5[(y + (x2 * 32))] = 0; + } + } + int calc_sum; +#pragma ACCEL parallel flatten + for (int n = 0; n < 320; ++n) { + compute4[compute3[n]] = (compute4[compute3[n]] + 1); + for (int i2 = 0; i2 < 32; ++i2) { + compute5[(i2 + (compute3[n] * 32))] = ((int)(((long)compute5[(i2 + (compute3[n] * 32))]) + ((long)placeholder2[(i2 + (n * 32))]))); + } + } + int update_mean; +#pragma ACCEL parallel flatten + for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/kmeans/sdaccel_code.cl b/samples/kmeans/sdaccel_code.cl new file mode 100644 index 000000000..196f96257 --- /dev/null +++ b/samples/kmeans/sdaccel_code.cl @@ -0,0 +1,48 @@ +__kernel void default_function(__global int* placeholder4, __global int* placeholder5, __global int* compute6) { + for (int x = 0; x < 320; ++x) { + compute6[x] = 0; + } + __local int main_loop; + for (int _1 = 0; _1 < 200; ++_1) { + __attribute__((xcl_pipeline_loop(1))) + for (int N = 0; N < 320; ++N) { + __local int local4; + local4 = 100000; + for (int i = 0; i < 16; ++i) { + __local int local5; + local5 = 0; + for (int i1 = 0; i1 < 32; ++i1) { + local5 = ((int)(((long)local5) + ((long)(((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))]))) * ((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))]))))))); + } + if (local5 < local4) { + local4 = local5; + compute6[N] = i; + } + } + } + __local int compute7[16]; + for (int x1 = 0; x1 < 16; ++x1) { + compute7[x1] = 0; + } + __local int compute8[512]; + for (int x2 = 0; x2 < 16; ++x2) { + for (int y = 0; y < 32; ++y) { + compute8[(y + (x2 * 32))] = 0; + } + } + __local int calc_sum; + + for (int n = 0; n < 320; ++n) { + compute7[compute6[n]] = (compute7[compute6[n]] + 1); + for (int i2 = 0; i2 < 32; ++i2) { + compute8[(i2 + (compute6[n] * 32))] = ((int)(((long)compute8[(i2 + (compute6[n] * 32))]) + ((long)placeholder4[(i2 + (n * 32))]))); + } + } + __local int update_mean; + + for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + placeholder5[k_d_fused] = (compute8[k_d_fused] / compute7[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/kmeans/submit.sh b/samples/kmeans/submit.sh new file mode 100644 index 000000000..a4345a542 --- /dev/null +++ b/samples/kmeans/submit.sh @@ -0,0 +1,3 @@ +unset DISPLAY +aoc -board=a10gx -time time.out -time-passes -regtest_mode -v -fpc -fp-relaxed --opt-arg -nocaching -report -I $INTELFPGAOCLSDKROOT/include/kernel_headers kmeans_aocl.cl + diff --git a/samples/kmeans/vhls_code.cl b/samples/kmeans/vhls_code.cl new file mode 100644 index 000000000..b651dd8bf --- /dev/null +++ b/samples/kmeans/vhls_code.cl @@ -0,0 +1,52 @@ +#include <ap_int.h> +#include <ap_fixed.h> +#include <math.h> + +void 
default_function(ap_int<32> placeholder6[320][32], ap_int<32> placeholder7[16][32], ap_int<32> compute9[320]) { + for (ap_int<32> x = 0; x < 320; ++x) { + compute9[x] = 0; + } + ap_int<32> main_loop; + for (ap_int<32> _ = 0; _ < 200; ++_) { + for (ap_int<32> N = 0; N < 320; ++N) { + #pragma HLS pipeline + ap_int<32> local6; + local6 = 100000; + for (ap_int<32> i = 0; i < 16; ++i) { + ap_int<32> local7; + local7 = 0; + for (ap_int<32> i1 = 0; i1 < 32; ++i1) { + local7 = ((ap_int<32>)(((ap_int<67>)local7) + ((ap_int<67>)(((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1]))) * ((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1]))))))); + } + if (local7 < local6) { + local6 = local7; + compute9[N] = i; + } + } + } + ap_int<32> compute10[16]; + for (ap_int<32> x1 = 0; x1 < 16; ++x1) { + compute10[x1] = 0; + } + ap_int<32> compute11[16][32]; + for (ap_int<32> x2 = 0; x2 < 16; ++x2) { + for (ap_int<32> y = 0; y < 32; ++y) { + compute11[x2][y] = 0; + } + } + ap_int<32> calc_sum; + for (ap_int<32> n = 0; n < 320; ++n) { + #pragma HLS unroll + compute10[compute9[n]] = (compute10[compute9[n]] + 1); + for (ap_int<32> i2 = 0; i2 < 32; ++i2) { + compute11[compute9[n]][i2] = ((ap_int<32>)(((ap_int<33>)compute11[compute9[n]][i2]) + ((ap_int<33>)placeholder6[n][i2]))); + } + } + ap_int<32> update_mean; + for (ap_int<32> k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + #pragma HLS unroll + placeholder7[(k_d_fused / 32)][(k_d_fused % 32)] = (compute11[(k_d_fused / 32)][(k_d_fused % 32)] / compute10[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/lenet/common/common.mk b/samples/lenet/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/lenet/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int -f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . 
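+# Example invocations, assuming XILINX_SDX is set and this file is included
+# from the sample's sdaccel.mk (a usage sketch, not generated output):
+#   make -f sdaccel.mk run_cpu_em   # software emulation
+#   make -f sdaccel.mk run_hw_em    # hardware emulation
+#   make -f sdaccel.mk run_hw       # build the bitstream and run on board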
+${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/lenet/lenet_aocl.cl b/samples/lenet/lenet_aocl.cl new file mode 100644 index 000000000..9b2a200f8 --- /dev/null +++ b/samples/lenet/lenet_aocl.cl @@ -0,0 +1,138 @@ +#include "ihc_apint.h" +__kernel void default_function(__global float* restrict input_image, __global float* restrict weight_conv1, __global float* restrict weight_conv2, __global float* restrict weight_fc1, __global float* restrict weight_fc2, __global float* restrict lenet) { + float conv2d; + for (int nn = 0; nn < 1; ++nn) { + for (int yy = 0; yy < -1; ++yy) { + for (int xx = 0; xx < -1; ++xx) { + float reducer0; + reducer0 = 0.000000e+00f; + for (int ra1 = 0; ra1 < 5; ++ra1) { + for (int ra2 = 0; ra2 < 5; ++ra2) { + reducer0 = ((input_image[(((xx + ra2) + ((yy + ra1) * 3)) + (nn * 9))] * weight_conv1[(ra2 + (ra1 * 5))]) + reducer0); + } + } + conv2d = reducer0; + } + } + } + float tanh1; + for (int args = 0; args < 1; ++args) { + for (int args1 = 0; args1 < -1; ++args1) { + for (int args2 = 0; args2 < -1; ++args2) { + tanh1 = ((float)tanh(((float)conv2d))); + } + } + } + float max_pool; + for (int i = 0; i < 1; ++i) { + for (int h = 0; h < -1; ++h) { + for (int w = 0; w < -1; ++w) { + float reducer1; + reducer1 = -1.000000e+00f; + for (int ra3 = 0; ra3 < 2; ++ra3) { + for (int ra4 = 0; ra4 < 2; ++ra4) { + reducer1 = max(tanh1, reducer1); + } + } + max_pool = reducer1; + } + } + } + float conv2d1[250]; + for (int nn1 = 0; nn1 < 1; ++nn1) { + for (int ff = 0; ff < 10; ++ff) { + for (int yy1 = 0; yy1 < -5; ++yy1) { + for (int xx1 = 0; xx1 < -5; ++xx1) { + float reducer2; + reducer2 = 0.000000e+00f; + for (int ra6 = 0; ra6 < 5; ++ra6) { + for (int ra7 = 0; ra7 < 5; ++ra7) { + reducer2 = ((max_pool * weight_conv2[((ra7 + (ra6 * 5)) + (ff * 25))]) + reducer2); + } + } + conv2d1[(((xx1 - (yy1 * 5)) + (ff * 25)) + (nn1 * 250))] = reducer2; + } + } + } + } + float tanh2[250]; + for (int args3 = 0; args3 < 1; ++args3) { + for (int args0 = 0; args0 < 10; ++args0) { + for (int args11 = 0; args11 < -5; ++args11) { + for (int args21 = 0; args21 < -5; ++args21) { + tanh2[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))] = ((float)tanh(((float)conv2d1[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))]))); + } + } + } + } + float max_pool1[90]; + for (int i1 = 0; i1 < 1; ++i1) { + for (int c = 0; c < 10; ++c) { + for (int h1 = 0; h1 < -3; ++h1) { + for (int w1 = 0; w1 < -3; ++w1) { + float reducer3; + reducer3 = -1.000000e+00f; + for (int ra8 = 0; ra8 < 2; ++ra8) { + for (int ra9 = 0; ra9 < 2; ++ra9) { + reducer3 = max(tanh2[(((((w1 * 2) - (((h1 * 2) + ra8) * 5)) + ra9) + (c * 25)) + (i1 * 250))], reducer3); + } + } + max_pool1[(((w1 - (h1 * 3)) + (c * 9)) + (i1 * 90))] = reducer3; + } + } + } + } + float compute0[90]; + for (int i2 = 0; i2 < 1; ++i2) { + for (int j = 0; j < 90; ++j) { + compute0[(j + (i2 * 90))] = max_pool1[((((j % -3) - (((j / -3) % -3) * 3)) + ((((j / -3) / -3) % 10) * 9)) + (i2 * 90))]; + } + } + float dense[25]; + for (int i3 = 0; i3 < 1; ++i3) { + for (int j1 = 0; j1 < 25; ++j1) { + float reducer4; + reducer4 = 0.000000e+00f; + for (int ra10 = 0; ra10 < 90; ++ra10) { + reducer4 = ((compute0[(ra10 + (i3 * 90))] * 
weight_fc1[(ra10 + (j1 * 40))]) + reducer4); + } + dense[(j1 + (i3 * 25))] = reducer4; + } + } + float tanh3[25]; + for (int args4 = 0; args4 < 1; ++args4) { + for (int args01 = 0; args01 < 25; ++args01) { + tanh3[(args01 + (args4 * 25))] = ((float)tanh(((float)dense[(args01 + (args4 * 25))]))); + } + } + float dense1[10]; + for (int i4 = 0; i4 < 1; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + float reducer5; + reducer5 = 0.000000e+00f; + for (int ra11 = 0; ra11 < 25; ++ra11) { + reducer5 = ((tanh3[(ra11 + (i4 * 25))] * weight_fc2[(ra11 + (j2 * 25))]) + reducer5); + } + dense1[(j2 + (i4 * 10))] = reducer5; + } + } + float compute1; + int max1; + max1 = 0; + for (int ra12 = 0; ra12 < 10; ++ra12) { + max1 = ((int)max(dense1[ra12], ((float)max1))); + } + compute1 = ((float)max1); + float compute2; + int sum; + sum = 0; + for (int ra13 = 0; ra13 < 10; ++ra13) { + sum = ((int)(exp(((float)(dense1[ra13] - compute1))) + ((float)sum))); + } + compute2 = ((float)sum); + float update0; + for (int j3 = 0; j3 < 10; ++j3) { + lenet[j3] = ((float)(exp(((float)(dense1[j3] - compute1))) / ((float)compute2))); + } +} + diff --git a/samples/lenet/lenet_main_withoutq.py b/samples/lenet/lenet_main_withoutq.py new file mode 100644 index 000000000..b16bdd6c3 --- /dev/null +++ b/samples/lenet/lenet_main_withoutq.py @@ -0,0 +1,125 @@ +import heterocl as hcl +import hlib +import numpy as np + +hcl.init() + +def softmax(out, x): + assert len(x.shape) == 2, "only support 2-dim softmax" + m, n = x.shape + k = hcl.reduce_axis(0, n) + max_elem = hcl.compute((m,), lambda i: hcl.max(x[i, k], axis=k)) + k = hcl.reduce_axis(0, n) + expsum = hcl.compute((m,), + lambda i: hcl.sum(hcl.exp(x[i, k] - max_elem[i]), axis=k)) + return hcl.update(out, + lambda i, j: hcl.exp(x[i, j] - max_elem[i]) / expsum[i]) + +def build_lenet(input_image, weight_conv1, weight_conv2, + weight_fc1, weight_fc2, lenet): + # first conv + conv1 = hlib.nn.conv2d_nchw(input_image, weight_conv1) + tanh1 = hlib.nn.tanh(conv1, "tanh1") + pool1 = hlib.nn.max_pool(tanh1, kernel=(2,2), stride=(2,2)) + # second conv + conv2 = hlib.nn.conv2d_nchw(pool1, weight_conv2) + tanh2 = hlib.nn.tanh(conv2, "tanh2") + pool2 = hlib.nn.max_pool(tanh2, kernel=(2,2), stride=(2,2)) + # first fc + flat = hlib.nn.flatten(pool2) + fc1 = hlib.nn.dense(flat, weight_fc1) + tanh3 = hlib.nn.tanh(fc1, "tanh3") + # second fc + fc2 = hlib.nn.dense(tanh3, weight_fc2) + # loss + return softmax(lenet, fc2) + + +import mxnet as mx +# download pretrained lenet model +mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-0010.params') +mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-symbol.json') +sym, arg_params, aux_params = mx.model.load_checkpoint('lenet', 10) +# get weights +weight_conv1_np = arg_params['convolution0_weight'].asnumpy() +weight_conv2_np = arg_params['convolution1_weight'].asnumpy() +weight_fc1_np = arg_params['fullyconnected0_weight'].asnumpy() +weight_fc2_np = arg_params['fullyconnected1_weight'].asnumpy() + + +# qtype1 = hcl.Fixed(16, 14) +# qtype2 = hcl.Fixed(16, 14) + +# qtype1 = hcl.Fixed(16, 12) +# qtype2 = hcl.Fixed(16, 12) + + + +correct_sum = 0 +batch_size = 1000 +mnist = mx.test_utils.get_mnist() + + +def build_lenet_inf(batch_size=batch_size, target=None): + # set up input/output placeholders + input_image = hcl.placeholder((batch_size, 1, 28, 28), "input_image") 
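+    # The commented-out placeholders below are the fixed-point variants that
+    # use qtype1 (see the commented hcl.Fixed definitions above); this
+    # "withoutq" script keeps all tensors in the default data type.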
+ # weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1", qtype1) + # weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2", qtype1) + # weight_fc1 = hcl.placeholder((500, 800), "weight_fc1", qtype1) + # weight_fc2 = hcl.placeholder((10, 500), "weight_fc2", qtype1) + weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1") + weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2") + weight_fc1 = hcl.placeholder((500, 800), "weight_fc1") + weight_fc2 = hcl.placeholder((10, 500), "weight_fc2") + lenet = hcl.placeholder((batch_size, 10), "lenet") + # create a quantization scheme + # scheme = hcl.create_scheme( + # [input_image, weight_conv1, weight_conv2, + # weight_fc1, weight_fc2, lenet], build_lenet) + # # quantize the three activation layers + # scheme.quantize( + # [build_lenet.tanh1, build_lenet.tanh2, build_lenet.tanh3], qtype2) + # s = hcl.create_schedule_from_scheme(scheme) + s = hcl.create_schedule([input_image, weight_conv1, weight_conv2, weight_fc1, weight_fc2, lenet], build_lenet) + return hcl.build(s, target=target) + +code1 = build_lenet_inf(batch_size, 'merlinc') +# print (code1) +with open('merlinc_code.cl', 'w') as f: + f.write(code1) + +code2 = build_lenet_inf(batch_size, 'sdaccel') + +with open('sdaccel_code.cl', 'w') as f: + f.write(code2) + +code3 = build_lenet_inf(batch_size, 'vhls') +with open('vhls_code.cl', 'w') as f: + f.write(code3) + +f = build_lenet_inf(batch_size, 'sdaccel_sw_emu') + +# weight_conv1_hcl = hcl.asarray(weight_conv1_np, dtype=qtype1) +# weight_conv2_hcl = hcl.asarray(weight_conv2_np, dtype=qtype1) +# weight_fc1_hcl = hcl.asarray(weight_fc1_np, dtype=qtype1) +# weight_fc2_hcl = hcl.asarray(weight_fc2_np, dtype=qtype1) + +weight_conv1_hcl = hcl.asarray(weight_conv1_np) +weight_conv2_hcl = hcl.asarray(weight_conv2_np) +weight_fc1_hcl = hcl.asarray(weight_fc1_np) +weight_fc2_hcl = hcl.asarray(weight_fc2_np) + + +for i in range(10000 // batch_size): + label = mnist['test_label'][i*batch_size:(i+1)*batch_size] + input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size] + input_image_hcl = hcl.asarray(input_image_np) + output_hcl = hcl.asarray(np.zeros((batch_size,10))) + f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl, + weight_fc1_hcl, weight_fc2_hcl, output_hcl) + print (output_hcl.asnumpy()) + prediction = np.argmax(output_hcl.asnumpy(), axis=1) + correct_sum += np.sum(np.equal(prediction, label)) + +print("Testing accuracy: {}".format(correct_sum / 10000.)) + diff --git a/samples/lenet/lenet_sdaccel.py b/samples/lenet/lenet_sdaccel.py new file mode 100644 index 000000000..917b2b625 --- /dev/null +++ b/samples/lenet/lenet_sdaccel.py @@ -0,0 +1,23 @@ +import heterocl as hcl +import numpy as np +from lenet_main import * + +batch_size = 50 + +# f = build_lenet_inf(batch_size, 'vhls_csim') +f = build_lenet_inf(batch_size, 'sdaccel_sw_emu') + +mnist = mx.test_utils.get_mnist() +correct_sum = 0 + +for i in range(50 // batch_size): + label = mnist['test_label'][i*batch_size:(i+1)*batch_size] + input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size] + input_image_hcl = hcl.asarray(input_image_np) + output_hcl = hcl.asarray(np.zeros((batch_size,10))) + f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl, weight_fc1_hcl, weight_fc2_hcl, output_hcl) + prediction = np.argmax(output_hcl.asnumpy(), axis=1) + correct_sum += np.sum(np.equal(prediction, label)) + +print(str(qtype1) + ", " + str(qtype2) + ": Accuracy over 10000 test images is: {}".format(correct_sum / 10000.)) +assert correct_sum == 9882 diff --git 
a/samples/lenet/merlinc_code.cl b/samples/lenet/merlinc_code.cl new file mode 100644 index 000000000..1c5118707 --- /dev/null +++ b/samples/lenet/merlinc_code.cl @@ -0,0 +1,155 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(int* input_image, int* weight_conv1, int* weight_conv2, int* weight_fc1, int* weight_fc2, int* lenet) { + int conv2d[11520000]; + for (int nn = 0; nn < 1000; ++nn) { + for (int ff = 0; ff < 20; ++ff) { + for (int yy = 0; yy < 24; ++yy) { + for (int xx = 0; xx < 24; ++xx) { + float reducer0; + reducer0 = 0.000000e+00f; + for (int ra1 = 0; ra1 < 5; ++ra1) { + for (int ra2 = 0; ra2 < 5; ++ra2) { + reducer0 = (((float)(((long)input_image[(((xx + ra2) + ((yy + ra1) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra2 + (ra1 * 5)) + (ff * 25))]))) + reducer0); + } + } + conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer0); + } + } + } + } + int tanh1[11520000]; + for (int args = 0; args < 1000; ++args) { + for (int args0 = 0; args0 < 20; ++args0) { + for (int args1 = 0; args1 < 24; ++args1) { + for (int args2 = 0; args2 < 24; ++args2) { + tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))]))); + } + } + } + } + int max_pool[2880000]; + for (int i = 0; i < 1000; ++i) { + for (int c = 0; c < 20; ++c) { + for (int h = 0; h < 12; ++h) { + for (int w = 0; w < 12; ++w) { + float reducer1; + reducer1 = -1.000000e+00f; + for (int ra3 = 0; ra3 < 2; ++ra3) { + for (int ra4 = 0; ra4 < 2; ++ra4) { + reducer1 = max(((float)tanh1[(((((w * 2) + ra4) + (((h * 2) + ra3) * 24)) + (c * 576)) + (i * 11520))]), reducer1); + } + } + max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer1); + } + } + } + } + int conv2d1[3200000]; + for (int nn1 = 0; nn1 < 1000; ++nn1) { + for (int ff1 = 0; ff1 < 50; ++ff1) { + for (int yy1 = 0; yy1 < 8; ++yy1) { + for (int xx1 = 0; xx1 < 8; ++xx1) { + float reducer2; + reducer2 = 0.000000e+00f; + for (int ra5 = 0; ra5 < 20; ++ra5) { + for (int ra6 = 0; ra6 < 5; ++ra6) { + for (int ra7 = 0; ra7 < 5; ++ra7) { + reducer2 = (((float)(((long)max_pool[((((xx1 + ra7) + ((yy1 + ra6) * 12)) + (ra5 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra7 + (ra6 * 5)) + (ra5 * 25)) + (ff1 * 500))]))) + reducer2); + } + } + } + conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer2); + } + } + } + } + int tanh2[3200000]; + for (int args3 = 0; args3 < 1000; ++args3) { + for (int args01 = 0; args01 < 50; ++args01) { + for (int args11 = 0; args11 < 8; ++args11) { + for (int args21 = 0; args21 < 8; ++args21) { + tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))]))); + } + } + } + } + int max_pool1[800000]; + for (int i1 = 0; i1 < 1000; ++i1) { + for (int c1 = 0; c1 < 50; ++c1) { + for (int h1 = 0; h1 < 4; ++h1) { + for (int w1 = 0; w1 < 4; ++w1) { + float reducer3; + reducer3 = -1.000000e+00f; + for (int ra8 = 0; ra8 < 2; ++ra8) { + for (int ra9 = 0; ra9 < 2; ++ra9) { + reducer3 = max(((float)tanh2[(((((w1 * 2) + ra9) + (((h1 * 2) + ra8) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer3); + } + } + max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer3); + } + } + } + } + int compute0[800000]; + for (int i2 = 0; i2 < 1000; ++i2) { + for (int j = 0; j < 800; ++j) { + compute0[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))]; + 
} + } + int dense[500000]; + for (int i3 = 0; i3 < 1000; ++i3) { + for (int j1 = 0; j1 < 500; ++j1) { + float reducer4; + reducer4 = 0.000000e+00f; + for (int ra10 = 0; ra10 < 800; ++ra10) { + reducer4 = (((float)(((long)compute0[(ra10 + (i3 * 800))]) * ((long)weight_fc1[(ra10 + (j1 * 800))]))) + reducer4); + } + dense[(j1 + (i3 * 500))] = ((int)reducer4); + } + } + int tanh3[500000]; + for (int args4 = 0; args4 < 1000; ++args4) { + for (int args02 = 0; args02 < 500; ++args02) { + tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))]))); + } + } + int dense1[10000]; + for (int i4 = 0; i4 < 1000; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + float reducer5; + reducer5 = 0.000000e+00f; + for (int ra11 = 0; ra11 < 500; ++ra11) { + reducer5 = (((float)(((long)tanh3[(ra11 + (i4 * 500))]) * ((long)weight_fc2[(ra11 + (j2 * 500))]))) + reducer5); + } + dense1[(j2 + (i4 * 10))] = ((int)reducer5); + } + } + int compute1[1000]; + for (int i5 = 0; i5 < 1000; ++i5) { + int max; + max = 0; + for (int ra12 = 0; ra12 < 10; ++ra12) { + max = max(dense1[(ra12 + (i5 * 10))], max); + } + compute1[i5] = max; + } + int compute2[1000]; + for (int i6 = 0; i6 < 1000; ++i6) { + int sum; + sum = 0; + for (int ra13 = 0; ra13 < 10; ++ra13) { + sum = ((int)(exp(((double)((long)(dense1[(ra13 + (i6 * 10))] - compute1[i6])))) + ((double)sum))); + } + compute2[i6] = sum; + } + int update0; + for (int i7 = 0; i7 < 1000; ++i7) { + for (int j3 = 0; j3 < 10; ++j3) { + lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute1[i7])))) / ((double)compute2[i7]))); + } + } +} + diff --git a/samples/lenet/sdaccel.mk b/samples/lenet/sdaccel.mk new file mode 100644 index 000000000..ce266d89e --- /dev/null +++ b/samples/lenet/sdaccel.mk @@ -0,0 +1,32 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=. 
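+# SDA_FLOW and XDEVICE can be overridden on the make command line, e.g.
+# "make -f sdaccel.mk SDA_FLOW=hw_emu xbin" builds the hw_emu xclbin.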
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/lenet/sdaccel_code.cl b/samples/lenet/sdaccel_code.cl new file mode 100644 index 000000000..114880df0 --- /dev/null +++ b/samples/lenet/sdaccel_code.cl @@ -0,0 +1,151 @@ +__kernel void default_function(__global int* input_image, __global int* weight_conv1, __global int* weight_conv2, __global int* weight_fc1, __global int* weight_fc2, __global int* lenet) { + __local int conv2d[11520000]; + for (int nn = 0; nn < 1000; ++nn) { + for (int ff = 0; ff < 20; ++ff) { + for (int yy = 0; yy < 24; ++yy) { + for (int xx = 0; xx < 24; ++xx) { + __local float reducer6; + reducer6 = 0.000000e+00f; + for (int ra15 = 0; ra15 < 5; ++ra15) { + for (int ra16 = 0; ra16 < 5; ++ra16) { + reducer6 = (((float)(((long)input_image[(((xx + ra16) + ((yy + ra15) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra16 + (ra15 * 5)) + (ff * 25))]))) + reducer6); + } + } + conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer6); + } + } + } + } + __local int tanh1[11520000]; + for (int args = 0; args < 1000; ++args) { + for (int args0 = 0; args0 < 20; ++args0) { + for (int args1 = 0; args1 < 24; ++args1) { + for (int args2 = 0; args2 < 24; ++args2) { + tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))]))); + } + } + } + } + __local int max_pool[2880000]; + for (int i = 0; i < 1000; ++i) { + for (int c = 0; c < 20; ++c) { + for (int h = 0; h < 12; ++h) { + for (int w = 0; w < 12; ++w) { + __local float reducer7; + reducer7 = -1.000000e+00f; + for (int ra17 = 0; ra17 < 2; ++ra17) { + for (int ra18 = 0; ra18 < 2; ++ra18) { + reducer7 = max(((float)tanh1[(((((w * 2) + ra18) + (((h * 2) + ra17) * 24)) + (c * 576)) + (i * 11520))]), reducer7); + } + } + max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer7); + } + } + } + } + __local int conv2d1[3200000]; + for (int nn1 = 0; nn1 < 1000; ++nn1) { + for (int ff1 = 0; ff1 < 50; ++ff1) { + for (int yy1 = 0; yy1 < 8; ++yy1) { + for (int xx1 = 0; xx1 < 8; ++xx1) { + __local float reducer8; + reducer8 = 0.000000e+00f; + for (int ra19 = 0; ra19 < 20; ++ra19) { + for (int ra20 = 0; ra20 < 5; ++ra20) { + for (int ra21 = 0; ra21 < 5; ++ra21) { + reducer8 = (((float)(((long)max_pool[((((xx1 + ra21) + ((yy1 + ra20) * 12)) + (ra19 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra21 + (ra20 * 5)) + (ra19 * 25)) + (ff1 * 500))]))) + reducer8); + } + } + } + conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer8); + } + } + } + } + __local int tanh2[3200000]; + for (int args3 = 0; args3 < 1000; ++args3) { + for (int args01 = 0; args01 < 50; ++args01) { + for (int args11 = 0; args11 < 8; ++args11) { + for (int args21 = 0; args21 < 8; ++args21) { + tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 
3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))]))); + } + } + } + } + __local int max_pool1[800000]; + for (int i1 = 0; i1 < 1000; ++i1) { + for (int c1 = 0; c1 < 50; ++c1) { + for (int h1 = 0; h1 < 4; ++h1) { + for (int w1 = 0; w1 < 4; ++w1) { + __local float reducer9; + reducer9 = -1.000000e+00f; + for (int ra22 = 0; ra22 < 2; ++ra22) { + for (int ra23 = 0; ra23 < 2; ++ra23) { + reducer9 = max(((float)tanh2[(((((w1 * 2) + ra23) + (((h1 * 2) + ra22) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer9); + } + } + max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer9); + } + } + } + } + __local int compute3[800000]; + for (int i2 = 0; i2 < 1000; ++i2) { + for (int j = 0; j < 800; ++j) { + compute3[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))]; + } + } + __local int dense[500000]; + for (int i3 = 0; i3 < 1000; ++i3) { + for (int j1 = 0; j1 < 500; ++j1) { + __local float reducer10; + reducer10 = 0.000000e+00f; + for (int ra24 = 0; ra24 < 800; ++ra24) { + reducer10 = (((float)(((long)compute3[(ra24 + (i3 * 800))]) * ((long)weight_fc1[(ra24 + (j1 * 800))]))) + reducer10); + } + dense[(j1 + (i3 * 500))] = ((int)reducer10); + } + } + __local int tanh3[500000]; + for (int args4 = 0; args4 < 1000; ++args4) { + for (int args02 = 0; args02 < 500; ++args02) { + tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))]))); + } + } + __local int dense1[10000]; + for (int i4 = 0; i4 < 1000; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + __local float reducer11; + reducer11 = 0.000000e+00f; + for (int ra25 = 0; ra25 < 500; ++ra25) { + reducer11 = (((float)(((long)tanh3[(ra25 + (i4 * 500))]) * ((long)weight_fc2[(ra25 + (j2 * 500))]))) + reducer11); + } + dense1[(j2 + (i4 * 10))] = ((int)reducer11); + } + } + __local int compute4[1000]; + for (int i5 = 0; i5 < 1000; ++i5) { + __local int max; + max = 0; + for (int ra26 = 0; ra26 < 10; ++ra26) { + max = max(dense1[(ra26 + (i5 * 10))], max); + } + compute4[i5] = max; + } + __local int compute5[1000]; + for (int i6 = 0; i6 < 1000; ++i6) { + __local int sum; + sum = 0; + for (int ra27 = 0; ra27 < 10; ++ra27) { + sum = ((int)(exp(((double)((long)(dense1[(ra27 + (i6 * 10))] - compute4[i6])))) + ((double)sum))); + } + compute5[i6] = sum; + } + __local int update1; + for (int i7 = 0; i7 < 1000; ++i7) { + for (int j3 = 0; j3 < 10; ++j3) { + lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute4[i7])))) / ((double)compute5[i7]))); + } + } +} + diff --git a/samples/lenet/vhls_code.cl b/samples/lenet/vhls_code.cl new file mode 100644 index 000000000..3d85466b4 --- /dev/null +++ b/samples/lenet/vhls_code.cl @@ -0,0 +1,155 @@ +#include <ap_int.h> +#include <ap_fixed.h> +#include <math.h> + +void default_function(ap_int<32> input_image[1000][1][28][28], ap_int<32> weight_conv1[20][1][5][5], ap_int<32> weight_conv2[50][20][5][5], ap_int<32> weight_fc1[500][800], ap_int<32> weight_fc2[10][500], ap_int<32> lenet[1000][10]) { + ap_int<32> conv2d[1000][20][24][24]; + for (ap_int<32> nn = 0; nn < 1000; ++nn) { + for (ap_int<32> ff = 0; ff < 20; ++ff) { + for (ap_int<32> yy = 0; yy < 24; ++yy) { + for (ap_int<32> xx = 0; xx < 24; ++xx) { + float reducer12; + reducer12 = 0.000000e+00f; + for (ap_int<32> ra29 = 0; ra29 < 5; ++ra29) { + for (ap_int<32> ra30 = 0; ra30 < 5; ++ra30) { + reducer12 = (((float)(((ap_int<64>)input_image[nn][0][(yy + ra29)][(xx + ra30)]) * ((ap_int<64>)weight_conv1[ff][0][ra29][ra30]))) + reducer12); + } + } + 
conv2d[nn][ff][yy][xx] = ((ap_int<32>)reducer12); + } + } + } + } + ap_int<32> tanh1[1000][20][24][24]; + for (ap_int<32> args = 0; args < 1000; ++args) { + for (ap_int<32> args0 = 0; args0 < 20; ++args0) { + for (ap_int<32> args1 = 0; args1 < 24; ++args1) { + for (ap_int<32> args2 = 0; args2 < 24; ++args2) { + tanh1[args][args0][args1][args2] = ((ap_int<32>)tanh(((double)conv2d[args][args0][args1][args2]))); + } + } + } + } + ap_int<32> max_pool[1000][20][12][12]; + for (ap_int<32> i = 0; i < 1000; ++i) { + for (ap_int<32> c = 0; c < 20; ++c) { + for (ap_int<32> h = 0; h < 12; ++h) { + for (ap_int<32> w = 0; w < 12; ++w) { + float reducer13; + reducer13 = -1.000000e+00f; + for (ap_int<32> ra31 = 0; ra31 < 2; ++ra31) { + for (ap_int<32> ra32 = 0; ra32 < 2; ++ra32) { + reducer13 = std::max(((float)tanh1[i][c][((h * 2) + ra31)][((w * 2) + ra32)]), reducer13); + } + } + max_pool[i][c][h][w] = ((ap_int<32>)reducer13); + } + } + } + } + ap_int<32> conv2d1[1000][50][8][8]; + for (ap_int<32> nn1 = 0; nn1 < 1000; ++nn1) { + for (ap_int<32> ff1 = 0; ff1 < 50; ++ff1) { + for (ap_int<32> yy1 = 0; yy1 < 8; ++yy1) { + for (ap_int<32> xx1 = 0; xx1 < 8; ++xx1) { + float reducer14; + reducer14 = 0.000000e+00f; + for (ap_int<32> ra33 = 0; ra33 < 20; ++ra33) { + for (ap_int<32> ra34 = 0; ra34 < 5; ++ra34) { + for (ap_int<32> ra35 = 0; ra35 < 5; ++ra35) { + reducer14 = (((float)(((ap_int<64>)max_pool[nn1][ra33][(yy1 + ra34)][(xx1 + ra35)]) * ((ap_int<64>)weight_conv2[ff1][ra33][ra34][ra35]))) + reducer14); + } + } + } + conv2d1[nn1][ff1][yy1][xx1] = ((ap_int<32>)reducer14); + } + } + } + } + ap_int<32> tanh2[1000][50][8][8]; + for (ap_int<32> args3 = 0; args3 < 1000; ++args3) { + for (ap_int<32> args01 = 0; args01 < 50; ++args01) { + for (ap_int<32> args11 = 0; args11 < 8; ++args11) { + for (ap_int<32> args21 = 0; args21 < 8; ++args21) { + tanh2[args3][args01][args11][args21] = ((ap_int<32>)tanh(((double)conv2d1[args3][args01][args11][args21]))); + } + } + } + } + ap_int<32> max_pool1[1000][50][4][4]; + for (ap_int<32> i1 = 0; i1 < 1000; ++i1) { + for (ap_int<32> c1 = 0; c1 < 50; ++c1) { + for (ap_int<32> h1 = 0; h1 < 4; ++h1) { + for (ap_int<32> w1 = 0; w1 < 4; ++w1) { + float reducer15; + reducer15 = -1.000000e+00f; + for (ap_int<32> ra36 = 0; ra36 < 2; ++ra36) { + for (ap_int<32> ra37 = 0; ra37 < 2; ++ra37) { + reducer15 = std::max(((float)tanh2[i1][c1][((h1 * 2) + ra36)][((w1 * 2) + ra37)]), reducer15); + } + } + max_pool1[i1][c1][h1][w1] = ((ap_int<32>)reducer15); + } + } + } + } + ap_int<32> compute6[1000][800]; + for (ap_int<32> i2 = 0; i2 < 1000; ++i2) { + for (ap_int<32> j = 0; j < 800; ++j) { + compute6[i2][j] = max_pool1[i2][(j / 16)][((j / 4) % 4)][(j % 4)]; + } + } + ap_int<32> dense[1000][500]; + for (ap_int<32> i3 = 0; i3 < 1000; ++i3) { + for (ap_int<32> j1 = 0; j1 < 500; ++j1) { + float reducer16; + reducer16 = 0.000000e+00f; + for (ap_int<32> ra38 = 0; ra38 < 800; ++ra38) { + reducer16 = (((float)(((ap_int<64>)compute6[i3][ra38]) * ((ap_int<64>)weight_fc1[j1][ra38]))) + reducer16); + } + dense[i3][j1] = ((ap_int<32>)reducer16); + } + } + ap_int<32> tanh3[1000][500]; + for (ap_int<32> args4 = 0; args4 < 1000; ++args4) { + for (ap_int<32> args02 = 0; args02 < 500; ++args02) { + tanh3[args4][args02] = ((ap_int<32>)tanh(((double)dense[args4][args02]))); + } + } + ap_int<32> dense1[1000][10]; + for (ap_int<32> i4 = 0; i4 < 1000; ++i4) { + for (ap_int<32> j2 = 0; j2 < 10; ++j2) { + float reducer17; + reducer17 = 0.000000e+00f; + for (ap_int<32> ra39 = 0; ra39 < 500; ++ra39) { + reducer17 = 
(((float)(((ap_int<64>)tanh3[i4][ra39]) * ((ap_int<64>)weight_fc2[j2][ra39]))) + reducer17); + } + dense1[i4][j2] = ((ap_int<32>)reducer17); + } + } + ap_int<32> compute7[1000]; + for (ap_int<32> i5 = 0; i5 < 1000; ++i5) { + ap_int<32> max; + max = 0; + for (ap_int<32> ra40 = 0; ra40 < 10; ++ra40) { + max = std::max(dense1[i5][ra40], max); + } + compute7[i5] = max; + } + ap_int<32> compute8[1000]; + for (ap_int<32> i6 = 0; i6 < 1000; ++i6) { + ap_int<32> sum; + sum = 0; + for (ap_int<32> ra41 = 0; ra41 < 10; ++ra41) { + sum = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i6][ra41] - compute7[i6])))) + ((double)sum))); + } + compute8[i6] = sum; + } + ap_int<32> update2; + for (ap_int<32> i7 = 0; i7 < 1000; ++i7) { + for (ap_int<32> j3 = 0; j3 < 10; ++j3) { + lenet[i7][j3] = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i7][j3] - compute7[i7])))) / ((double)compute8[i7]))); + } + } +} + diff --git a/samples/smith_waterman/common/common.mk b/samples/smith_waterman/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/smith_waterman/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int -f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . 
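+# "make -f sdaccel.mk estimate" (rule above) asks xocc for a resource and
+# timing estimate of the kernel without running a full hardware build.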
+${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/smith_waterman/lenet_aocl.cl b/samples/smith_waterman/lenet_aocl.cl new file mode 100644 index 000000000..bf8608082 --- /dev/null +++ b/samples/smith_waterman/lenet_aocl.cl @@ -0,0 +1,143 @@ +#include "ihc_apint.h" +__kernel void default_function(__global uint3_t* restrict seqAs, __global uint3_t* restrict seqBs, __global uint3_t* restrict outAs, __global uint3_t* restrict outBs) { + int B; + #pragma ii 1 + for (int t_outer = 0; t_outer < 32; ++t_outer) { + #pragma unroll + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 129)) + -130)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = 
(curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/main.cpp b/samples/smith_waterman/main.cpp new file mode 100644 index 000000000..851a98bf7 --- /dev/null +++ b/samples/smith_waterman/main.cpp @@ -0,0 +1,135 @@ +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#include <CL/cl2.hpp> +#include <fstream> +#include <iostream> +#include <vector> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <sys/ipc.h> +#include <sys/shm.h> + + + +int main(int argc, char* argv[]) { +#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE) + #define STR_VALUE(arg) #arg + #define GET_STRING(name) STR_VALUE(name) + #define TARGET_DEVICE GET_STRING(SDX_PLATFORM) +#endif + char* xclbinFilename = argv[1]; + + std::vector<unsigned int> source_0(1024 * 128); + std::vector<unsigned int> source_1(1024 * 128); + std::vector<unsigned int> source_2(1024 * 256); + std::vector<unsigned int> source_3(1024 * 256); + + size_t vector_size_bytes_0 = sizeof(unsigned int) * 1024 * 128; + size_t vector_size_bytes_1 = sizeof(unsigned int) * 1024 * 128; + size_t vector_size_bytes_2 = sizeof(unsigned int) * 1024 * 256; + size_t vector_size_bytes_3 = sizeof(unsigned int) * 1024 * 256; + + unsigned int* arg_0 = (unsigned int*)shmat(1769476, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + source_0[i1 + i0*128] = arg_0[i1 + i0*128]; + } + } + unsigned int* arg_1 = (unsigned int*)shmat(3538944, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + source_1[i1 + i0*128] = arg_1[i1 + i0*128]; + } + } + unsigned int* arg_2 = (unsigned int*)shmat(3538945, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + source_2[i1 + i0*256] = arg_2[i1 + i0*256]; + } + } + unsigned int* arg_3 = (unsigned int*)shmat(2162690, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + source_3[i1 + i0*256] = arg_3[i1 + i0*256]; + } + } + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + cl::Platform platform = platforms[0]; + + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices); + cl::Device device = devices[0]; + + cl::Context context(device); + cl::CommandQueue q(context, device); + + std::ifstream bin_file(xclbinFilename, std::ifstream::binary); + bin_file.seekg (0, bin_file.end); + unsigned nb = bin_file.tellg(); + bin_file.seekg (0, bin_file.beg); + char *buf = new char [nb]; + bin_file.read(buf, nb); + + cl::Program::Binaries bins; + bins.push_back({buf,nb}); + devices.resize(1); + cl::Program program(context, devices, bins); + + int err1; + cl::Kernel kernel(program, "default_function", &err1); + auto default_function = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(kernel); + + cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0); + cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1); + cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2); + cl::Buffer buffer_3(context, CL_MEM_READ_WRITE, vector_size_bytes_3); + + q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + 
q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + q.enqueueWriteBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data()); + + default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2, buffer_3); + q.finish(); + + q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + q.enqueueReadBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data()); + + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + arg_0[i1 + i0*128] = source_0[i1 + i0*128]; + } + } + shmdt(arg_0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + arg_1[i1 + i0*128] = source_1[i1 + i0*128]; + } + } + shmdt(arg_1); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + arg_2[i1 + i0*256] = source_2[i1 + i0*256]; + } + } + shmdt(arg_2); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + arg_3[i1 + i0*256] = source_3[i1 + i0*256]; + } + } + shmdt(arg_3); +} diff --git a/samples/smith_waterman/merlinc_code.cl b/samples/smith_waterman/merlinc_code.cl new file mode 100644 index 000000000..c3a347f35 --- /dev/null +++ b/samples/smith_waterman/merlinc_code.cl @@ -0,0 +1,146 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(unsigned char* seqAs, unsigned char* seqBs, unsigned char* outAs, unsigned char* outBs) { + int B; +#pragma ACCEL pipeline + for (int t_outer = 0; t_outer < 32; ++t_outer) { +#pragma ACCEL parallel + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 
1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/sdaccel.mk b/samples/smith_waterman/sdaccel.mk new file mode 100644 index 000000000..ce266d89e --- /dev/null +++ b/samples/smith_waterman/sdaccel.mk @@ -0,0 +1,32 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=. 
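+# Note: KERNEL_SRCS below assumes the generated kernel is saved as
+# default_function.cl; the checked-in sample kernels here are named
+# sdaccel_code.cl, so copy or rename the file before building.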
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/smith_waterman/sdaccel_code.cl b/samples/smith_waterman/sdaccel_code.cl new file mode 100644 index 000000000..a0f5fdb01 --- /dev/null +++ b/samples/smith_waterman/sdaccel_code.cl @@ -0,0 +1,142 @@ +__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) { + __local int B; + __attribute__((xcl_pipeline_loop(1))) + for (int t_outer = 0; t_outer < 2; ++t_outer) { + + for (int t_inner = 0; t_inner < 32; ++t_inner) { + __local int maxtrix_max; + maxtrix_max = 0; + __local int i_max; + i_max = 0; + __local int j_max; + j_max = 0; + __local short matrix[841]; + for (int x = 0; x < 29; ++x) { + for (int y = 0; y < 29; ++y) { + matrix[(y + (x * 29))] = (short)0; + } + } + __local short action[841]; + for (int x1 = 0; x1 < 29; ++x1) { + for (int y1 = 0; y1 < 29; ++y1) { + action[(y1 + (x1 * 29))] = (short)3; + } + } + __local int mutate1; + for (int i = 0; i < 29; ++i) { + for (int j = 0; j < 29; ++j) { + __local int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((long)matrix[((j + (i * 29)) + -30)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 
1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4); + trace_back[3] = 0; + __local int max; + max = trace_back[0]; + __local int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 29))] = ((short)max); + action[(j + (i * 29))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 29))])) { + maxtrix_max = ((int)matrix[(j + (i * 29))]); + i_max = i; + j_max = j; + } + } + } + } + __local int T; + __local int curr_i; + curr_i = i_max; + __local int curr_j; + curr_j = j_max; + __local int next_i; + next_i = 0; + __local int next_j; + next_j = 0; + __local int act1; + act1 = ((int)action[(curr_j + (curr_i * 29))]); + __local int next_i1; + next_i1 = 0; + __local int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + __local int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + __local int a; + a = 0; + __local int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + __local int act2; + act2 = ((int)action[(curr_j + (curr_i * 29))]); + __local int next_i2; + next_i2 = 0; + __local int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/sdaccel_code_nounroll.cl b/samples/smith_waterman/sdaccel_code_nounroll.cl new file mode 100644 index 000000000..d5e145c05 --- /dev/null +++ b/samples/smith_waterman/sdaccel_code_nounroll.cl @@ -0,0 +1,142 @@ +__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) { + __local int B; + __attribute__((xcl_pipeline_loop(1))) + for (int t_outer = 0; t_outer < 32; ++t_outer) { + __attribute__((opencl_unroll_hint(2))) + for (int t_inner = 0; t_inner < 32; ++t_inner) { + __local int maxtrix_max; + maxtrix_max = 0; + __local int i_max; + i_max = 0; + __local int j_max; + j_max = 0; + __local short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + __local short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + __local int mutate1; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + __local int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = 
((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + __local int max; + max = trace_back[0]; + __local int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + __local int T; + __local int curr_i; + curr_i = i_max; + __local int curr_j; + curr_j = j_max; + __local int next_i; + next_i = 0; + __local int next_j; + next_j = 0; + __local int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + __local int next_i1; + next_i1 = 0; + __local int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + __local int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + __local int a; + a = 0; + __local int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + __local int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + __local int next_i2; + next_i2 = 0; + __local int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_aocl.cl b/samples/smith_waterman/smith_aocl.cl new file mode 100644 index 000000000..80a4ba601 --- /dev/null +++ b/samples/smith_waterman/smith_aocl.cl @@ -0,0 +1,143 @@ +#include "ihc_apint.h" +__kernel void default_function(__global uint* restrict seqAs, __global uint* restrict seqBs, __global uint* restrict outAs, __global uint* restrict outBs) { + int B; + #pragma ii 1 + for (int t_outer = 0; t_outer < 2; ++t_outer) { + #pragma unroll + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[841]; + for (int x = 0; x < 29; ++x) { + for (int y = 0; y < 29; ++y) { + matrix[(y + (x * 29))] = (short)0; + } + } + short action[841]; + for (int x1 = 0; x1 < 29; ++x1) { + for (int y1 = 0; y1 < 29; ++y1) { + action[(y1 + (x1 * 29))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 29; ++i) { + for (int j = 0; j < 29; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if 
((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 29)) + -30)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 29))] = ((short)max); + action[(j + (i * 29))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 29))])) { + maxtrix_max = ((int)matrix[(j + (i * 29))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 29))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 29))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_vhls.cl b/samples/smith_waterman/smith_vhls.cl new file mode 100644 index 000000000..4fd36c8aa --- /dev/null +++ b/samples/smith_waterman/smith_vhls.cl @@ -0,0 +1,146 @@ +#include +#include +#include + +void default_function(ap_uint<3> seqAs[64][28], ap_uint<3> seqBs[64][28], ap_uint<3> outAs[64][56], ap_uint<3> outBs[64][56]) { + ap_int<32> B; + for (ap_int<32> t_outer = 0; t_outer < 2; ++t_outer) { + #pragma HLS pipeline + for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) { + #pragma HLS unroll + ap_int<32> maxtrix_max; + maxtrix_max = 0; + ap_int<32> i_max; + i_max = 0; + ap_int<32> j_max; + j_max = 0; + ap_int<16> matrix[29][29]; + for (ap_int<32> x = 0; x < 29; ++x) { + for (ap_int<32> y = 0; y < 29; ++y) { + matrix[x][y] = (ap_int<16>)0; + } + } + ap_int<16> action[29][29]; + for (ap_int<32> x1 = 0; x1 < 29; ++x1) { + for (ap_int<32> y1 = 0; y1 < 29; ++y1) { + action[x1][y1] = (ap_int<16>)3; + } + } + ap_int<32> mutate5; + for (ap_int<32> i = 0; i < 29; ++i) { + for (ap_int<32> j = 0; j < 29; ++j) { + ap_int<32> trace_back[4]; + for (ap_int<32> x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 
0) && (j != 0)) { + trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4)))); + trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4); + trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4); + trace_back[3] = 0; + ap_int<32> max; + max = trace_back[0]; + ap_int<32> act; + act = 0; + for (ap_int<32> i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[i][j] = ((ap_int<16>)max); + action[i][j] = ((ap_int<16>)act); + if (maxtrix_max < ((ap_int<32>)matrix[i][j])) { + maxtrix_max = ((ap_int<32>)matrix[i][j]); + i_max = i; + j_max = j; + } + } + } + } + ap_int<32> T; + ap_int<32> curr_i; + curr_i = i_max; + ap_int<32> curr_j; + curr_j = j_max; + ap_int<32> next_i; + next_i = 0; + ap_int<32> next_j; + next_j = 0; + ap_int<32> act1; + act1 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]); + ap_int<32> next_i1; + next_i1 = 0; + ap_int<32> next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + ap_int<32> tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + ap_int<32> a; + a = 0; + ap_int<32> b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_i + -1) % 28)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_j + -1) % 28)]); + } + outAs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)a); + outBs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)b); + curr_i = next_i; + curr_j = next_j; + ap_int<32> act2; + act2 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]); + ap_int<32> next_i2; + next_i2 = 0; + ap_int<32> next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_waterman_sdaccel.py b/samples/smith_waterman/smith_waterman_sdaccel.py new file mode 100644 index 000000000..354cac757 --- /dev/null +++ b/samples/smith_waterman/smith_waterman_sdaccel.py @@ -0,0 +1,24 @@ +import heterocl as hcl +import numpy as np +from smith_waterman_main import * + +# f = top("vhls_csim") +f = top("sdaccel_sw_emu") + +# add a very simple test +_seqA_np = np.ones((num, lenA)) +for i in range(0, 4): + _seqA_np[0][i] = 2 +_seqB_np = np.ones((num, lenB)) +_seqA = hcl.asarray(_seqA_np, dtype) +_seqB = hcl.asarray(_seqB_np, dtype) +_consensusA = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype) +_consensusB = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype) +f(_seqA, _seqB, _consensusA, _consensusB) +_consensusA_np = _consensusA.asnumpy() +_consensusB_np = _consensusB.asnumpy() +for i in range(0, 256): + if i < 124: + assert 
_consensusA_np[0][i] == 1 + else: + assert _consensusA_np[0][i] == 0 diff --git a/samples/smith_waterman/vhls_code.cl b/samples/smith_waterman/vhls_code.cl new file mode 100644 index 000000000..8066bc2c2 --- /dev/null +++ b/samples/smith_waterman/vhls_code.cl @@ -0,0 +1,146 @@ +#include +#include +#include + +void default_function(ap_uint<3> seqAs[1024][128], ap_uint<3> seqBs[1024][128], ap_uint<3> outAs[1024][256], ap_uint<3> outBs[1024][256]) { + ap_int<32> B; + for (ap_int<32> t_outer = 0; t_outer < 32; ++t_outer) { + #pragma HLS pipeline + for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) { + #pragma HLS unroll + ap_int<32> maxtrix_max; + maxtrix_max = 0; + ap_int<32> i_max; + i_max = 0; + ap_int<32> j_max; + j_max = 0; + ap_int<16> matrix[129][129]; + for (ap_int<32> x = 0; x < 129; ++x) { + for (ap_int<32> y = 0; y < 129; ++y) { + matrix[x][y] = (ap_int<16>)0; + } + } + ap_int<16> action[129][129]; + for (ap_int<32> x1 = 0; x1 < 129; ++x1) { + for (ap_int<32> y1 = 0; y1 < 129; ++y1) { + action[x1][y1] = (ap_int<16>)3; + } + } + ap_int<32> mutate3; + for (ap_int<32> i = 0; i < 129; ++i) { + for (ap_int<32> j = 0; j < 129; ++j) { + ap_int<32> trace_back[4]; + for (ap_int<32> x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4)))); + trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4); + trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4); + trace_back[3] = 0; + ap_int<32> max; + max = trace_back[0]; + ap_int<32> act; + act = 0; + for (ap_int<32> i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[i][j] = ((ap_int<16>)max); + action[i][j] = ((ap_int<16>)act); + if (maxtrix_max < ((ap_int<32>)matrix[i][j])) { + maxtrix_max = ((ap_int<32>)matrix[i][j]); + i_max = i; + j_max = j; + } + } + } + } + ap_int<32> T; + ap_int<32> curr_i; + curr_i = i_max; + ap_int<32> curr_j; + curr_j = j_max; + ap_int<32> next_i; + next_i = 0; + ap_int<32> next_j; + next_j = 0; + ap_int<32> act1; + act1 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]); + ap_int<32> next_i1; + next_i1 = 0; + ap_int<32> next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + ap_int<32> tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + ap_int<32> a; + a = 0; + ap_int<32> b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_i + -1) % 128)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_j + -1) % 128)]); + } + outAs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)a); + outBs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)b); + curr_i = next_i; + curr_j = next_j; + ap_int<32> act2; + act2 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]); + ap_int<32> next_i2; + next_i2 = 0; + ap_int<32> next_j2; + 
next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/sobel/sobel.py b/samples/sobel/sobel.py new file mode 100644 index 000000000..a4299d8ae --- /dev/null +++ b/samples/sobel/sobel.py @@ -0,0 +1,91 @@ +import heterocl as hcl +import hlib +import numpy as np +from PIL import Image +from urllib.request import urlopen + +batch_size = 1 +hcl.init(hcl.UInt(32)) +dtype = hcl.UInt(32) +image_size = () +kernel_size = 3 + +# setup target using vivado +tool = hcl.tool.vivado("csim") +target = hcl.platform.zc706 + +def sobel(): + image = hcl.placeholder((batch_size, 1, 256, 256), "input_image") + k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1") + k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2") + + def kernel(input_image, kernel_1, kernel_2): + + def absolute(image, *args): + with hcl.if_(image[args] > 0): + hcl.return_(image[args]) + with hcl.else_(): + hcl.return_(-1 * image[args]) + + def dev(gx, gy, org): + assert gx.shape == gy.shape, "mismatch" + rx = hcl.reduce_axis(0, 255, "rx") + ry = hcl.reduce_axis(0, 255, "ry") + mat_sum = hcl.compute(gx.shape, lambda nn, ff, xx, yy: + gx[nn, ff, xx, yy] + gy[nn, ff, xx, yy], name="add") + return hcl.compute(mat_sum.shape, lambda nn, ff, xx, yy: + mat_sum[nn, ff, xx, yy] * 255.0 / hcl.max(mat_sum[nn, ff, rx, ry], axis=[rx, ry]), + name = "derv") + + # make the conv op a kernel on fpga. + # return tensor required (cannot do def_()) + output_shape = (1,1,254,254) + + # make compute wrapped in hcl def + module1 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv1")(hlib.nn.conv2d_nchw_imp) + module2 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp) + conv1 = hcl.compute(output_shape, lambda *args: 0) + conv2 = hcl.compute(output_shape, lambda *args: 0) + module1(input_image, kernel_1, conv1) + module2(input_image, kernel_2, conv2) + + abs1 = hcl.compute(conv1.shape, + lambda *args: absolute(conv1, *args)) + abs2 = hcl.compute(conv2.shape, + lambda *args: absolute(conv2, *args)) + + # derivative module for normalization + return dev(abs1, abs2, input_image) + + s = hcl.create_schedule([image, k1, k2], kernel) + + # data moved to local + i0, k10 = s.to([image, k1], target.fpga) + s.to([i0, k10], s[kernel.conv1]) + s.to(kernel.derv, target.cpu) + + # create stream channel between modules + print(type(target.fpga), hcl.lower(s)) + return hcl.build(s, target) + +# Load sample data +img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif')) +kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) +kernel_y = np.flip(kernel_x.T.T, axis=0) +img = np.array(img) + +img = img[np.newaxis, ...] +img = img[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] 
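+# equivalently (assuming NumPy >= 1.18, which accepts a tuple axis):
+#   img = np.expand_dims(img, axis=(0, 1))
+#   kernel_x = np.expand_dims(kernel_x, axis=(0, 1))
+# the chained np.newaxis indexing above lifts the 2-D image to the
+# (1, 1, 256, 256) NCHW layout that the input_image placeholder expects,
+# and the kernels to (1, 1, 3, 3)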
+ +hcl_input = hcl.asarray(img, dtype) +kernel_x = hcl.asarray(kernel_x, dtype) +kernel_y = hcl.asarray(kernel_y, dtype) +hcl_output = hcl.asarray(np.zeros((1,1,254,254)), dtype) + +f = sobel() +f(hcl_input, kernel_x, kernel_y, hcl_output) + diff --git a/samples/stream/example.cl b/samples/stream/example.cl new file mode 100644 index 000000000..fa3cfbd81 --- /dev/null +++ b/samples/stream/example.cl @@ -0,0 +1,34 @@ +#include "ihc_apint.h" +#pragma OPENCL EXTENSION cl_intel_channels : enable +channel int ret_add_c; +channel int ret_mul_c; +__kernel void ret_add(__global int* restrict ret_add_a, __global int* restrict ret_add_b) { + for (int i = 0; i < 10; ++i) { + for (int i1 = 0; i1 < 20; ++i1) { + write_channel_intel(ret_add_c, ((int)(((int33_t)ret_add_a[(i1 + (i * 20))]) + ((int33_t)ret_add_b[(i1 + (i * 20))])))); + } + } +} + +__kernel void ret_mul(__global int* restrict ret_mul_d, __global int* restrict ret_mul_e) { + for (int i = 0; i < 10; ++i) { + for (int i1 = 0; i1 < 20; ++i1) { + ret_mul_e[(i1 + (i * 20))] = ((int)(((long)read_channel_intel(ret_mul_c)) * ((long)ret_mul_d[(i1 + (i * 20))]))); + } + } +} + +__kernel void default_function(__global int* restrict a, __global int* restrict b, __global int* restrict c, __global int* restrict d, __global int* restrict e) { + int ret_add; + int ret_mul; + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 20; ++y) { + c[(y + (x * 20))] = 0; + } + } + int ret_add0; + ret_add(a, b); + int ret_mul0; + ret_mul(d, e); +} + diff --git a/samples/stream/mod.py b/samples/stream/mod.py new file mode 100644 index 000000000..8c12ad722 --- /dev/null +++ b/samples/stream/mod.py @@ -0,0 +1,32 @@ +import heterocl as hcl + +hcl.init() +initiation_interval = 4 +a = hcl.placeholder((10, 20)) +b = hcl.placeholder((10, 20)) + +@hcl.def_([a.shape, b.shape, (), ()]) +def ret_add(a, b, x, y): + hcl.return_(a[x, y] + b[x, y]) + +@hcl.def_([a.shape, b.shape, (), ()]) +def ret_mul(a, b, x, y): + hcl.return_(a[x, y] * b[x, y]) + +c = hcl.compute(a.shape, lambda i, j: ret_add(a, b, i, j)) +d = hcl.compute(b.shape, lambda i, j: ret_mul(a, b, i, j)) +s = hcl.create_schedule([a, b, c, d]) + +# compute customization +s[c].pipeline(c.axis[0], initiation_interval) +s.partition(b, dim=2, factor=2) + +# stream into modules / device +# s[c].stream_to(ret_mul) +# s[d].stream_to(hcl.FPGA) + +print(hcl.lower(s)) +code = hcl.build(s, target="vhls") +print(code) + + diff --git a/samples/stream/stream.py b/samples/stream/stream.py new file mode 100644 index 000000000..5c2396a57 --- /dev/null +++ b/samples/stream/stream.py @@ -0,0 +1,58 @@ +import heterocl as hcl + +hcl.init() +target = hcl.platform.zc706 +initiation_interval = 4 + +a = hcl.placeholder((10, 20), name="a") +b = hcl.placeholder((10, 20), name="b") +c = hcl.placeholder((10, 20), name="c") +d = hcl.placeholder((10, 20), name="d") +e = hcl.placeholder((10, 20), name="e") + +def add_mul(a, b, c, d, e): + @hcl.def_([a.shape, b.shape, c.shape]) + def ret_add(a, b, c): + with hcl.for_(0, a.shape[0]) as i: + with hcl.for_(0, a.shape[1]) as j: + c[i, j] = a[i, j] + b[i, j] + + @hcl.def_([c.shape, d.shape, e.shape]) + def ret_mul(c, d, e): + # hcl.update(c, lambda x, y: a[x, y] * b[x, y], 'c_mul') + with hcl.for_(0, c.shape[0]) as i: + with hcl.for_(0, c.shape[1]) as j: + e[i, j] = c[i, j] * d[i, j] + + ret_add(a, b, c) + ret_mul(c, d, e) + +# compute customization +s = hcl.create_schedule([a, b, c, d, e], add_mul) +# op1 = add_mul.ret_add.c +# op2 = add_mul.ret_mul.c +# s[op1].pipeline(op1.axis[0], initiation_interval) + +# 
stream into modules / device +a0, b0 = s.to([a, b], target.xcel) +d0 = s.to(d, target.xcel) +#s.partition(b0, dim=2, factor=2) +s.to([a0, b0], s[add_mul.ret_add]) +s.to(d0, s[add_mul.ret_mul]) + +# within device move producer to consumer +s.to(c, s[add_mul.ret_mul], + s[add_mul.ret_add], depth=10) + +# return tensor for inter-device move +# e0 = s.stream_to(e, hcl.CPU('riscv')) + +# print(add_mul.ret_mul._buf, c._buf) +print(hcl.lower(s)) +code = hcl.build(s, target) +print(code) +# +# with open("example.cl", "w") as f: +# f.write(code) +# f.close() + diff --git a/tests/test_codegen_aocl.py b/tests/test_codegen_aocl.py new file mode 100644 index 000000000..a72d364f2 --- /dev/null +++ b/tests/test_codegen_aocl.py @@ -0,0 +1,99 @@ +import heterocl as hcl + +def test_ap_int(): + hcl.init() + A = hcl.placeholder((1, 32), dtype=hcl.Int(3)) + B = hcl.placeholder((1, 32), dtype=hcl.UInt(3)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j], dtype=hcl.Int(8)) + s = hcl.create_schedule([A, B, C]) + code = hcl.build(s, target='aocl') + print (code) + assert "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable" in code + assert "ap_int<3> intd_t" in code + assert "ap_uint<3> uintd_t" in code + assert "ap_int<8> intd_t" in code + +def test_pragma(): + hcl.init() + A = hcl.placeholder((10, 32), "A") + B = hcl.placeholder((10, 32)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j]) + + # unroll + s1 = hcl.create_schedule([A, B, C]) + s1[C].unroll(C.axis[1], factor=4) + code1 = hcl.build(s1, target='aocl') + print (code1) + assert "#pragma unroll 4" in code1 + + # pipeline + s2 = hcl.create_schedule([A, B, C]) + s2[C].pipeline(C.axis[0], initiation_interval=2) + code2 = hcl.build(s2, target='aocl') + print (code2) + assert "#pragma ii 2" in code2 + +def test_reorder(): + hcl.init() + A = hcl.placeholder((10, 100), "A") + + def two_stage(A): + B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B") + C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, "C") + return C + + s = hcl.create_schedule([A], two_stage) + s_B = two_stage.B + code = hcl.build(s, target='aocl') + print (code) + s[s_B].reorder(s_B.axis[1], s_B.axis[0]) + code2 = hcl.build(s, target='aocl') + print (code2) + +def test_split_fuse(): + hcl.init() + A = hcl.placeholder((10, 100), "A") + + def two_stage(A): + B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B") + C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, 'C') + return C + + s = hcl.create_schedule([A], two_stage) + s_B = two_stage.B + x_out, x_in = s[s_B].split(s_B.axis[0], 5) + code = hcl.build(s, target='aocl') + print (code) + s2 = hcl.create_schedule([A], two_stage) + s2_B = two_stage.B + x_y = s2[s2_B].fuse(s2_B.axis[0], s2_B.axis[1]) + code2 = hcl.build(s2, target='aocl') + print (code2) + +def test_binary_conv(): + hcl.init() + A = hcl.placeholder((1, 32, 14, 14), dtype=hcl.UInt(1), name="A") + B = hcl.placeholder((64, 32, 3, 3), dtype=hcl.UInt(1), name="B") + rc = hcl.reduce_axis(0, 32) + ry = hcl.reduce_axis(0, 3) + rx = hcl.reduce_axis(0, 3) + C = hcl.compute((1, 64, 12, 12), + lambda nn, ff, yy, xx: hcl.sum( + A[nn, rc, yy + ry, xx + rx] * B[ff, rc, ry, rx], axis=[rc, ry, rx]), + dtype=hcl.UInt(8), name="C") + s = hcl.create_schedule([A, B, C]) + s[C].split(C.axis[1], factor=5) + code = hcl.build(s, target='aocl') + print (code) + assert "for (ap_int<32> intd_t ff_outer = 0; ff_outer < 13; ++ff_outer)" in code + assert "for (ap_int<32> intd_t ff_inner = 0; ff_inner < 5; ++ff_inner)" in code + assert "if (ff_inner < (64 - 
(ff_outer * 5)))" in code + +if __name__ == '__main__': + test_ap_int() + test_pragma() + test_reorder() + test_split_fuse() + test_binary_conv() + + diff --git a/tests/test_codegen_ihls.py b/tests/test_codegen_ihls.py index fc5a7e53b..1b53f18ca 100644 --- a/tests/test_codegen_ihls.py +++ b/tests/test_codegen_ihls.py @@ -65,3 +65,4 @@ def kernel(A): s = hcl.create_schedule([A], kernel) code = hcl.build(s, target="ihls") assert "A[0].slc<4>(1)" in code + diff --git a/tests/test_codegen_sdaccel.py b/tests/test_codegen_sdaccel.py new file mode 100644 index 000000000..43d94f238 --- /dev/null +++ b/tests/test_codegen_sdaccel.py @@ -0,0 +1,36 @@ +import heterocl as hcl + + + + + +def test_pragma(): + hcl.init(hcl.Float()) + A = hcl.placeholder((10, 32), "A") + B = hcl.placeholder((10, 32)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j]) + + # unroll + s1 = hcl.create_schedule([A, B, C]) + s1[C].unroll(C.axis[1], factor=6) + code1 = hcl.build(s1, target='sdaccel') + print (code1) + assert "__attribute__((opencl_unroll_hint(6)))" in code1 + + # pipeline + s2 = hcl.create_schedule([A, B, C]) + s2[C].pipeline(C.axis[0], initiation_interval=2) + code2 = hcl.build(s2, target='sdaccel') + print (code2) + assert "__attribute__((xcl_pipeline_loop(2)))" in code2 + + # partition + s3 = hcl.create_schedule([A, B, C]) + s3.partition(A, hcl.Partition.Block, dim=2, factor=2) + code3 = hcl.build(s3, target='sdaccel') + print (code3) + assert "__attribute__((xcl_array_partition(block,2,2)))" in code3 + + +if __name__ == "__main__": + test_pragma() \ No newline at end of file diff --git a/tests/test_codegen_soda.py b/tests/test_codegen_soda.py index 56fb8df77..492ee6146 100644 --- a/tests/test_codegen_soda.py +++ b/tests/test_codegen_soda.py @@ -52,6 +52,7 @@ def test_blur(self): img_t(0, 0) = uint16((int32((uint18((uint17(img_i(-1, 0)) + uint17(img_i(0, 0)))) + uint18(img_i(1, 0)))) / 3)) output uint16: img_o(0, 0) = uint16((int32((uint18((uint17(img_t(0, -1)) + uint17(img_t(0, 0)))) + uint18(img_t(0, 1)))) / 3)) + ''') def test_gaussian(self): @@ -76,6 +77,7 @@ def test_gaussian(self): reduce_ssa3 = float32(((float64(img_i(-1, 0)) * 3699.65) + float64(reduce_ssa2))) reduce_ssa4 = float32(((float64(img_i(0, 0)) * 4620.30) + float64(reduce_ssa3))) img_o(0, 0) = reduce_ssa4 + ''' ) diff --git a/tests/test_codegen_vhls.py b/tests/test_codegen_vhls.py index dadae5068..a6385975b 100644 --- a/tests/test_codegen_vhls.py +++ b/tests/test_codegen_vhls.py @@ -85,7 +85,7 @@ def test_index_split(): s = hcl.create_schedule([A, B]) s[B].split(B.axis[0], 5) code = hcl.build(s, target="vhls") - assert "B[(y_inner + (y_outer * 5))][x]" in code + assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code def test_index_split_reshape(): hcl.init() @@ -95,7 +95,7 @@ def test_index_split_reshape(): s[B].split(B.axis[0], 5) s.reshape(B, (2, 5, 10)) code = hcl.build(s, target="vhls") - assert "B[y_outer][y_inner][x]" in code + assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code def test_index_fuse(): hcl.init() @@ -104,7 +104,7 @@ def test_index_fuse(): s = hcl.create_schedule([A, B]) s[B].fuse(B.axis[0], B.axis[1]) code = hcl.build(s, target="vhls") - assert "B[(y_x_fused / 10)][(y_x_fused % 10)]" in code + assert "B[y_x_fused]" in code def test_binary_conv(): hcl.init() diff --git a/tvm/HalideIR/src/ir/Expr.h b/tvm/HalideIR/src/ir/Expr.h index b78a466ed..4b70d51fc 100644 --- a/tvm/HalideIR/src/ir/Expr.h +++ b/tvm/HalideIR/src/ir/Expr.h @@ -91,6 +91,9 @@ enum class IRNodeType : int { /** for memory customization **/ 
Reuse, Partition, + /** for data stream **/ + StreamExpr, + StreamStmt, /** for stencil analysis **/ Stencil }; @@ -302,6 +305,20 @@ enum class PartitionType : int { Cyclic = 2 }; +/** An enum describing the stream type */ +enum class StreamType : int { + Channel = 0, + Pipe = 1, + FIFO = 2 +}; + +/** An enum class for device type */ +enum class DeviceType : int { + CPU = 0, + FPGA = 1, + GPU = 2 +}; + /** A reference-counted handle to a statement node. */ struct Stmt : public IRHandle { Stmt() : IRHandle() {} diff --git a/tvm/HalideIR/src/ir/IR.cpp b/tvm/HalideIR/src/ir/IR.cpp index a9718b40e..a604b6fd2 100644 --- a/tvm/HalideIR/src/ir/IR.cpp +++ b/tvm/HalideIR/src/ir/IR.cpp @@ -692,17 +692,27 @@ Expr Quantize::make(Expr body, Expr bitwidth) { return Expr(node); } -Stmt KernelDef::make(Array args, Stmt body, Expr ret_void, Type ret_type, std::string name) { +Stmt KernelDef::make(Array args, Array> api_args, + Array api_types, Stmt body, Expr ret_void, + Type ret_type, std::string name, Array channels) { + internal_assert(api_args.size() == api_types.size()) << "KernelDef of unmatched args\n"; for (size_t i = 0; i < args.size(); i++) { internal_assert(args[i].defined()) << "KernelDef of undefined arg\n"; + internal_assert(api_types[i].defined()) << "KernelDef of undefined type\n"; + for (size_t j = 0; j < api_args[i].size(); j++) { + internal_assert(api_args[i][j].defined()) << "KernelDef of undefined shape\n"; + } } internal_assert(body.defined()) << "KernelDef of undefined body\n"; internal_assert(ret_void.defined()) << "KernelDef of undefined return type\n"; std::shared_ptr node = std::make_shared(); node->args = std::move(args); + node->api_args = std::move(api_args); + node->api_types = std::move(api_types); node->body = std::move(body); node->ret_void = std::move(ret_void); node->ret_type = ret_type; + node->channels = std::move(channels); node->name = name; return Stmt(node); } @@ -772,6 +782,62 @@ Stmt Partition::make(VarExpr buffer_var, int dim, int factor, PartitionType part return Stmt(node); } +Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth) { + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + + std::shared_ptr node = std::make_shared(); + node->type = type; + node->buffer_var = std::move(buffer_var); + node->depth = depth; + node->stream_type = stream_type; + return Expr(node); +} + +Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth, + Array annotate_keys, Array annotate_values) { + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + internal_assert(annotate_keys.size() == annotate_values.size()) << + "Length of annotate keys and annotate values not equal"; + + std::shared_ptr node = std::make_shared(); + node->type = type; + node->buffer_var = std::move(buffer_var); + node->depth = depth; + node->stream_type = stream_type; + node->annotate_keys = std::move(annotate_keys); + node->annotate_values = std::move(annotate_values); + return Expr(node); +} + +Stmt StreamStmt::make(VarExpr buffer_var, Expr value, StreamType stream_type, int depth) { + internal_assert(value.defined()) << "The stream-in value not defined\n"; + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + + std::shared_ptr node = std::make_shared(); + node->buffer_var = std::move(buffer_var); + node->value = std::move(value); + node->depth = depth; + node->stream_type = stream_type; + return Stmt(node); +} + +Stmt StreamStmt::make(VarExpr 
buffer_var, Expr value, StreamType stream_type, int depth, + Array annotate_keys, Array annotate_values) { + internal_assert(value.defined()) << "The stream-in value not defined\n"; + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + internal_assert(annotate_keys.size() == annotate_values.size()) << + "Length of annotate keys and annotate values not equal"; + + std::shared_ptr node = std::make_shared(); + node->buffer_var = std::move(buffer_var); + node->value = std::move(value); + node->depth = depth; + node->stream_type = stream_type; + node->annotate_keys = std::move(annotate_keys); + node->annotate_values = std::move(annotate_values); + return Stmt(node); +} + Stmt Stencil::make(Array inputs, Array outputs, Stmt body, int burst_width, int unroll_factor, int num_iteration) { internal_assert(body.defined()) << "Stencil of undefined body\n"; @@ -884,6 +950,8 @@ template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v-> template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Reuse *)this, s); } template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Partition *)this, s); } template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Stencil *)this, s); } +template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const StreamStmt *)this, s); } +template<> void ExprNode::accept(IRVisitor *v, const Expr &e) const { v->visit((const StreamExpr *)this, e); } Call::ConstString Call::debug_to_file = "debug_to_file"; Call::ConstString Call::reinterpret = "reinterpret"; diff --git a/tvm/HalideIR/src/ir/IR.h b/tvm/HalideIR/src/ir/IR.h index fae48da29..e8a8835bf 100644 --- a/tvm/HalideIR/src/ir/IR.h +++ b/tvm/HalideIR/src/ir/IR.h @@ -1049,19 +1049,29 @@ struct Quantize : public ExprNode { /** The imperative function definition */ struct KernelDef : public StmtNode { Array args; + Array> api_args; + Array api_types; Stmt body; Expr ret_void; Type ret_type; std::string name; + // args to stream data + Array channels; - EXPORT static Stmt make(Array args, Stmt body, Expr ret_void, Type ret_type, std::string name); + EXPORT static Stmt make(Array args, Array> api_args, + Array api_types, Stmt body, Expr ret_void, + Type ret_type, std::string name, + Array channels); void VisitAttrs(IR::AttrVisitor* v) final { v -> Visit("args", &args); + v -> Visit("api_args", &api_args); + v -> Visit("api_types", &api_types); v -> Visit("body", &body); v -> Visit("ret_void", &ret_void); v -> Visit("ret_type", &ret_type); v -> Visit("name", &name); + v -> Visit("channels", &channels); } static const IRNodeType _type_info = IRNodeType::KernelDef; static constexpr const char* _type_key = "KernelDef"; @@ -1170,6 +1180,70 @@ struct Partition : public StmtNode { static constexpr const char* _type_key = "Partition"; }; +struct StreamStmt : public StmtNode { + VarExpr buffer_var; + Expr value; + int depth; + StreamType stream_type; + Array annotate_keys; + Array annotate_values; + + EXPORT static Stmt make(VarExpr buffer_var, + Expr value, + StreamType stream_type, + int depth); + + EXPORT static Stmt make(VarExpr buffer_var, + Expr value, + StreamType stream_type, + int depth, + Array annotate_keys, + Array annotate_values); + + void VisitAttrs(IR::AttrVisitor* v) final { + v -> Visit("buffer_var", &buffer_var); + v -> Visit("value", &value); + v -> Visit("depth", &depth); + v -> Visit("stream_type", &stream_type); + v -> Visit("annotate_keys", &annotate_keys); + v -> 
Visit("annotate_values", &annotate_values); + } + + static const IRNodeType _type_info = IRNodeType::StreamStmt; + static constexpr const char* _type_key = "StreamStmt"; +}; + +struct StreamExpr : public ExprNode { + VarExpr buffer_var; // var loaded + int depth; + StreamType stream_type; + Array annotate_keys; + Array annotate_values; + + EXPORT static Expr make(Type type, + VarExpr buffer_var, + StreamType stream_type, + int depth); + + EXPORT static Expr make(Type type, + VarExpr buffer_var, + StreamType stream_type, + int depth, + Array annotate_keys, + Array annotate_values); + + void VisitAttrs(IR::AttrVisitor* v) final { + v -> Visit("dtype", &type); + v -> Visit("buffer_var", &buffer_var); + v -> Visit("depth", &depth); + v -> Visit("stream_type", &stream_type); + v -> Visit("annotate_keys", &annotate_keys); + v -> Visit("annotate_values", &annotate_values); + } + static const IRNodeType _type_info = IRNodeType::StreamExpr; + static constexpr const char* _type_key = "StreamExpr"; +}; + struct Stencil : public StmtNode { Array inputs; Array outputs; diff --git a/tvm/HalideIR/src/ir/IREquality.cpp b/tvm/HalideIR/src/ir/IREquality.cpp index 9e5798fbb..46590056e 100644 --- a/tvm/HalideIR/src/ir/IREquality.cpp +++ b/tvm/HalideIR/src/ir/IREquality.cpp @@ -80,6 +80,7 @@ class IRComparer : public IRVisitor { void visit(const Call *, const Expr &); void visit(const Let *, const Expr &); void visit(const Shuffle *, const Expr &); + void visit(const StreamExpr *, const Expr &); void visit(const LetStmt *, const Stmt &); void visit(const AttrStmt *, const Stmt &); void visit(const AssertStmt *, const Stmt &); @@ -488,6 +489,11 @@ void IRComparer::visit(const Shuffle *op, const Expr &expr) { compare_expr_vector(e->indices, op->indices); } +void IRComparer::visit(const StreamExpr *op, const Expr &expr) { + const StreamExpr *node = expr_.as(); + compare_node_refs(op->buffer_var, node->buffer_var); +} + } // namespace diff --git a/tvm/HalideIR/src/ir/IRMutator.cpp b/tvm/HalideIR/src/ir/IRMutator.cpp index 13b346e93..fbd3e82b5 100644 --- a/tvm/HalideIR/src/ir/IRMutator.cpp +++ b/tvm/HalideIR/src/ir/IRMutator.cpp @@ -480,7 +480,8 @@ void IRMutator::visit(const KernelDef *op, const Stmt &s) { stmt = s; } else { - stmt = KernelDef::make(op->args, body, ret_void, op->ret_type, op->name); + stmt = KernelDef::make(op->args, op->api_args, op->api_types, + body, ret_void, op->ret_type, op->name, op->channels); } } @@ -524,6 +525,20 @@ void IRMutator::visit(const KernelStmt *op, const Stmt &s) { } } +void IRMutator::visit(const StreamStmt *op, const Stmt &s) { + Expr value = mutate(op->value); + if (value.same_as(op->value)) { + stmt = s; + } else { + stmt = StreamStmt::make(op->buffer_var, value, + op->stream_type, op->depth); + } +} + +void IRMutator::visit(const StreamExpr *op, const Expr &e) { + expr = e; +} + void IRMutator::visit(const Return *op, const Stmt &s) { Expr value = mutate(op->value); if (value.same_as(op->value)) { diff --git a/tvm/HalideIR/src/ir/IRMutator.h b/tvm/HalideIR/src/ir/IRMutator.h index 1fea5fec6..4088ae5ea 100644 --- a/tvm/HalideIR/src/ir/IRMutator.h +++ b/tvm/HalideIR/src/ir/IRMutator.h @@ -99,6 +99,8 @@ class IRMutator : public IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); }; diff --git 
a/tvm/HalideIR/src/ir/IRPrinter.cpp b/tvm/HalideIR/src/ir/IRPrinter.cpp index 6a3a5d651..b6f3e6082 100644 --- a/tvm/HalideIR/src/ir/IRPrinter.cpp +++ b/tvm/HalideIR/src/ir/IRPrinter.cpp @@ -336,6 +336,19 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) } }); +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const StreamStmt *op, IRPrinter* p) { + p->do_indent(); + p->stream << op->buffer_var << ".write("; + p->print(op->value); + p->stream << ")\n"; +}); + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const StreamExpr *op, IRPrinter* p) { + p->stream << op->buffer_var << ".read()"; +}); + TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const Ramp *op, IRPrinter* p) { p->stream << "ramp("; @@ -723,7 +736,16 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->do_indent(); p->stream << "def " << op->name << "("; for (size_t i = 0; i < op->args.size(); i++) { + p->stream << op->args[i].type() << "("; // handle type p->print(op->args[i]); + if (op->api_args[i].size() > 1) { + p->stream << "["; + for (size_t j = 0; j < op->api_args[i].size(); j++) { + p->print(op->api_args[i][j]); + if (j < op->api_args[i].size() - 1) p->stream << "*"; + } + p->stream << "])"; + } if (i < op->args.size() - 1) { p->stream << ", "; } diff --git a/tvm/HalideIR/src/ir/IRVisitor.cpp b/tvm/HalideIR/src/ir/IRVisitor.cpp index 02880fdb4..30e1fe86b 100644 --- a/tvm/HalideIR/src/ir/IRVisitor.cpp +++ b/tvm/HalideIR/src/ir/IRVisitor.cpp @@ -137,6 +137,9 @@ void IRVisitor::visit(const Let *op, const Expr &) { op->body.accept(this); } +void IRVisitor::visit(const StreamExpr *op, const Expr &) { +} + void IRVisitor::visit(const LetStmt *op, const Stmt &) { op->value.accept(this); op->body.accept(this); @@ -169,6 +172,10 @@ void IRVisitor::visit(const Store *op, const Stmt &) { op->predicate.accept(this); } +void IRVisitor::visit(const StreamStmt *op, const Stmt &) { + op->value.accept(this); +} + void IRVisitor::visit(const Provide *op, const Stmt &) { op->value.accept(this); for (size_t i = 0; i < op->args.size(); i++) { @@ -266,6 +273,10 @@ void IRVisitor::visit(const Quantize *op, const Expr &) { void IRVisitor::visit(const KernelDef *op, const Stmt &) { for (size_t i = 0; i < op->args.size(); i++) { op->args[i].accept(this); + op->api_types[i].accept(this); + for (size_t j = 0; j < op->api_args[i].size(); j++) { + op->api_args[i][j].accept(this); + } } op->ret_void.accept(this); } @@ -574,6 +585,10 @@ void IRGraphVisitor::visit(const Quantize *op, const Expr &) { void IRGraphVisitor::visit(const KernelDef *op, const Stmt &) { for (size_t i = 0; i < op->args.size(); i++) { include(op->args[i]); + include(op->api_types[i]); + for (size_t j = 0; j < op->api_args[i].size(); j++) { + include(op->api_args[i][j]); + } } include(op->ret_void); } @@ -607,6 +622,12 @@ void IRGraphVisitor::visit(const Reuse *op, const Stmt &) { void IRGraphVisitor::visit(const Partition *op, const Stmt &) {} +void IRGraphVisitor::visit(const StreamExpr *op, const Expr &) {} + +void IRGraphVisitor::visit(const StreamStmt *op, const Stmt &) { + include(op->value); +} + void IRGraphVisitor::visit(const Stencil *op, const Stmt &) { include(op->body); } diff --git a/tvm/HalideIR/src/ir/IRVisitor.h b/tvm/HalideIR/src/ir/IRVisitor.h index 931f1c5c9..a4faa4aba 100644 --- a/tvm/HalideIR/src/ir/IRVisitor.h +++ b/tvm/HalideIR/src/ir/IRVisitor.h @@ -79,6 +79,8 @@ class IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, 
const Stmt &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); }; /** A base class for algorithms that walk recursively over the IR @@ -159,6 +161,8 @@ class IRGraphVisitor : public IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); // @} }; diff --git a/tvm/Makefile b/tvm/Makefile index 1a78cbe7c..1b2030645 100644 --- a/tvm/Makefile +++ b/tvm/Makefile @@ -126,6 +126,13 @@ else CFLAGS += -DTVM_OPENCL_RUNTIME=0 endif +ifeq ($(USE_SDACCEL_HLS), 1) + CFLAGS += -DHCL_SDACCEL_RUNTIME=1 +else + CFLAGS += -DHCL_SDACCEL_RUNTIME=0 +endif + + ifeq ($(USE_VIVADO_HLS), 1) CFLAGS += -DHCL_VHLS_RUNTIME=1 else diff --git a/tvm/include/tvm/codegen.h b/tvm/include/tvm/codegen.h index 3877db941..4d6be0230 100644 --- a/tvm/include/tvm/codegen.h +++ b/tvm/include/tvm/codegen.h @@ -42,6 +42,7 @@ runtime::Module Build(const Array& funcs, * \return cstr The C string representation of the file. */ std::string PackImportsToC(const runtime::Module& m, bool system_lib); + } // namespace codegen } // namespace TVM diff --git a/tvm/include/tvm/ir.h b/tvm/include/tvm/ir.h index e66db3fb4..8a26e551c 100644 --- a/tvm/include/tvm/ir.h +++ b/tvm/include/tvm/ir.h @@ -21,6 +21,8 @@ using Halide::Internal::StmtNode; using Halide::Internal::IRNodeType; using Halide::Internal::ForType; using Halide::Internal::PartitionType; +using Halide::Internal::StreamType; +using Halide::Internal::DeviceType; using Halide::DeviceAPI; // Node container for CommReducer @@ -232,6 +234,8 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope"; constexpr const char* opengl_stage_scope = "opengl_stage_scope"; constexpr const char* attach_scope = "attach_scope"; + +constexpr const char* device_scope = "device_scope"; } // namespace attr /*! \brief namespace of TVM Intrinsic functions */ @@ -501,6 +505,8 @@ using Halide::Internal::Quantize; using Halide::Internal::KernelDef; using Halide::Internal::KernelExpr; using Halide::Internal::KernelStmt; +using Halide::Internal::StreamExpr; +using Halide::Internal::StreamStmt; using Halide::Internal::Return; using Halide::Internal::Break; using Halide::Internal::While; diff --git a/tvm/include/tvm/ir_functor_ext.h b/tvm/include/tvm/ir_functor_ext.h index c4f18ba7e..39ce6d2b8 100644 --- a/tvm/include/tvm/ir_functor_ext.h +++ b/tvm/include/tvm/ir_functor_ext.h @@ -148,6 +148,7 @@ class ExprFunctor { virtual R VisitExpr_(const SetSlice* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const Quantize* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const KernelExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT; + virtual R VisitExpr_(const StreamExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExprDefault_(const Node* op, Args ...) { LOG(FATAL) << "Do not have a default for " << op->type_key(); return R(); @@ -193,6 +194,7 @@ class ExprFunctor { IR_EXPR_FUNCTOR_DISPATCH(SetSlice); IR_EXPR_FUNCTOR_DISPATCH(Quantize); IR_EXPR_FUNCTOR_DISPATCH(KernelExpr); + IR_EXPR_FUNCTOR_DISPATCH(StreamExpr); return vtable; } }; @@ -244,6 +246,7 @@ class StmtFunctor { virtual R VisitStmt_(const Evaluate* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const KernelDef* op, Args... 
args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const KernelStmt* op, Args... args) STMT_FUNCTOR_DEFAULT; + virtual R VisitStmt_(const StreamStmt* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const Return* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const Break* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const While* op, Args... args) STMT_FUNCTOR_DEFAULT; @@ -275,6 +278,7 @@ IR_STMT_FUNCTOR_DISPATCH(Evaluate); IR_STMT_FUNCTOR_DISPATCH(KernelDef); IR_STMT_FUNCTOR_DISPATCH(KernelStmt); + IR_STMT_FUNCTOR_DISPATCH(StreamStmt); IR_STMT_FUNCTOR_DISPATCH(Return); IR_STMT_FUNCTOR_DISPATCH(Break); IR_STMT_FUNCTOR_DISPATCH(While); diff --git a/tvm/include/tvm/ir_mutator.h b/tvm/include/tvm/ir_mutator.h index 964684ec1..200534644 100644 --- a/tvm/include/tvm/ir_mutator.h +++ b/tvm/include/tvm/ir_mutator.h @@ -77,6 +77,7 @@ class TVM_DLL IRMutator { virtual Stmt Mutate_(const Reuse* op, const Stmt& s); virtual Stmt Mutate_(const Partition* op, const Stmt& s); virtual Stmt Mutate_(const Stencil* op, const Stmt& s); + virtual Stmt Mutate_(const StreamStmt* op, const Stmt& s); virtual Expr Mutate_(const Variable* op, const Expr& e); virtual Expr Mutate_(const Load* op, const Expr& e); @@ -114,6 +115,7 @@ class TVM_DLL IRMutator { virtual Expr Mutate_(const SetSlice* op, const Expr& e); virtual Expr Mutate_(const Quantize* op, const Expr& e); virtual Expr Mutate_(const KernelExpr* op, const Expr& e); + virtual Expr Mutate_(const StreamExpr* op, const Expr& e); }; /*! diff --git a/tvm/include/tvm/ir_pass.h b/tvm/include/tvm/ir_pass.h index 88c29f32c..dfba91d32 100644 --- a/tvm/include/tvm/ir_pass.h +++ b/tvm/include/tvm/ir_pass.h @@ -214,6 +214,14 @@ Stmt StorageFlatten(Stmt stmt, */ Stmt RemoveNoOp(Stmt stmt); +/*! + * \brief Infer device scope. + * \param stmt The stmt to be transformed + * \param bus_bandwidth The bandwidth of the stream bus + * \return Transformed stmt. + */ +Stmt InferStream(Stmt stmt, int bus_bandwidth); +
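+// A sketch of the assumed call site (illustrative; the exact pipeline
+// position is not fixed by this header): InferStream would run as one more
+// Stmt-to-Stmt lowering pass,
+//   stmt = ir::InferStream(stmt, /*bus_bandwidth=*/32);
+// so that device_scope attributes and StreamExpr/StreamStmt nodes are
+// settled before host/xcel code generation; it is exposed to Python via
+// REGISTER_PASS2(InferStream) in api_pass.cc.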
/*! * \brief Split statement into pipeline stages. * \param stmt The stmt to be split diff --git a/tvm/include/tvm/ir_visitor.h b/tvm/include/tvm/ir_visitor.h index 6fe616aab..21ef77c32 100644 --- a/tvm/include/tvm/ir_visitor.h +++ b/tvm/include/tvm/ir_visitor.h @@ -131,6 +131,8 @@ class TVM_DLL IRVisitor { virtual void Visit_(const KernelDef* op); virtual void Visit_(const KernelExpr* op); virtual void Visit_(const KernelStmt* op); + virtual void Visit_(const StreamExpr* op); + virtual void Visit_(const StreamStmt* op); virtual void Visit_(const Return* op); virtual void Visit_(const Break* op); virtual void Visit_(const While* op); diff --git a/tvm/include/tvm/schedule.h b/tvm/include/tvm/schedule.h index 9dc1956c8..faacc7d96 100644 --- a/tvm/include/tvm/schedule.h +++ b/tvm/include/tvm/schedule.h @@ -351,11 +351,31 @@ class Schedule : public NodeRef { const IterVar& axis, int factor_axis = 0); - EXPORT Tensor reuse_at(const Tensor& target, - Stage parent, + EXPORT Tensor reuse_at(const Tensor& target, + Stage parent, IterVar axis, std::string name); + EXPORT void to_stage(const Tensor& target, + Stage dest, + int arg_pos, + ir::StreamType stream_type, + int channel_depth, + std::string name); + + EXPORT Tensor move_to(const Tensor& target, + ir::DeviceType device_type, + ir::StreamType stream_type, + int channel_depth, + std::string new_name); + + EXPORT void stream_to(const Tensor& target, + Stage dest, + Stage source, + ir::StreamType stream_type, + int channel_depth, + std::string new_name); + EXPORT Tensor partition(const Tensor& target, int dim, int factor, ir::PartitionType partition_type); @@ -381,6 +401,8 @@ class Schedule : public NodeRef { inline ScheduleNode* operator->(); // declare container type using ContainerType = ScheduleNode; + // insertion point for host & xcel separation + static int split_bound; }; /*! 
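+// The three new schedule primitives above back the Python-side `.to()` API
+// exercised in samples/stream/stream.py (assumed mapping):
+//   s.to(tensor, target.xcel)        -> Schedule::move_to
+//   s.to(tensor, s[kernel])          -> Schedule::to_stage
+//   s.to(c, dest, src, depth=10)     -> Schedule::stream_to
+// matching the _ScheduleMove / _ScheduleMoveToStage / _ScheduleStream
+// bindings registered in api_lang.cc.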
diff --git a/tvm/src/api/api_ir.cc b/tvm/src/api/api_ir.cc index 825f7580d..8edb1a0e8 100644 --- a/tvm/src/api/api_ir.cc +++ b/tvm/src/api/api_ir.cc @@ -176,6 +176,20 @@ TVM_REGISTER_API("make.Select") *ret = Node::make(args[0], args[1], args[2], args[3], args[4], args[5]); \ }) \ +#define REGISTER_MAKE7(Node) \ + TVM_REGISTER_API("make."#Node) \ + .set_body([](TVMArgs args, TVMRetValue *ret) { \ + *ret = Node::make(args[0], args[1], args[2], args[3], \ + args[4], args[5], args[6]); \ + }) \ + +#define REGISTER_MAKE8(Node) \ + TVM_REGISTER_API("make."#Node) \ + .set_body([](TVMArgs args, TVMRetValue *ret) { \ + *ret = Node::make(args[0], args[1], args[2], args[3], \ + args[4], args[5], args[6], args[7]); \ + }) \ + #define REGISTER_MAKE_BINARY_OP(Node) \ TVM_REGISTER_API("make."#Node) \ .set_body([](TVMArgs args, TVMRetValue *ret) { \ @@ -222,7 +236,7 @@ REGISTER_MAKE3(GetSlice); REGISTER_MAKE3(SetBit); REGISTER_MAKE4(SetSlice); REGISTER_MAKE2(Quantize); -REGISTER_MAKE5(KernelDef); +REGISTER_MAKE8(KernelDef); REGISTER_MAKE3(KernelExpr); REGISTER_MAKE2(KernelStmt); REGISTER_MAKE1(Return); diff --git a/tvm/src/api/api_lang.cc b/tvm/src/api/api_lang.cc index f07d590a5..543e816aa 100644 --- a/tvm/src/api/api_lang.cc +++ b/tvm/src/api/api_lang.cc @@ -461,6 +461,31 @@ TVM_REGISTER_API("_SchedulePartition") static_cast(args[4].operator int())); }); +TVM_REGISTER_API("_ScheduleMoveToStage") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Schedule() + .to_stage(args[1], args[2], args[3], + static_cast(args[4].operator int()), + args[5], args[6]); + }); + +TVM_REGISTER_API("_ScheduleMove") + .set_body([](TVMArgs args, TVMRetValue *ret) { + *ret = args[0].operator Schedule() + .move_to(args[1], + static_cast(args[2].operator int()), + static_cast(args[3].operator int()), + args[4], args[5]); + }); + +TVM_REGISTER_API("_ScheduleStream") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Schedule() + .stream_to(args[1], args[2], args[3], + static_cast(args[4].operator int()), + args[5], args[6]); + }); + TVM_REGISTER_API("_ScheduleReshape") .set_body([](TVMArgs args, TVMRetValue *ret) { args[0].operator Schedule().reshape(args[1], args[2]); diff --git a/tvm/src/api/api_pass.cc b/tvm/src/api/api_pass.cc index 348b8816e..1728b0c23 100644 --- a/tvm/src/api/api_pass.cc +++ b/tvm/src/api/api_pass.cc @@ -122,6 +122,7 @@ REGISTER_PASS1(InjectPrefetch); REGISTER_PASS2(InjectDoubleBuffer); REGISTER_PASS2(LoopPartition); REGISTER_PASS1(RemoveNoOp); +REGISTER_PASS2(InferStream); REGISTER_PASS2(SplitPipeline); REGISTER_PASS2(LiftAttrScope); REGISTER_PASS1(NarrowChannelAccess); diff --git a/tvm/src/codegen/build_common.cc b/tvm/src/codegen/build_common.cc new file mode 100644 index 000000000..8bdbf7e98 --- /dev/null +++ b/tvm/src/codegen/build_common.cc @@ -0,0 +1,220 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file build_common.cc + * \brief Build unified simulation module + */ +#include +#include +#include +#include +#include +#include +#include "./build_common.h" +#include "./build_util.h" + +#include +#include +#include +#include +#include + +#include "merlinc/codeanalys_merlinc.h" +#include "hlsc/codegen_vhls.h" +#include "opencl/codegen_aocl.h" +#include "ppac/codegen_rv64_ppac.h" + +namespace TVM { +namespace runtime { + +class SimModuleNode final : public ModuleNode { + public: + SimModuleNode(LoweredFunc func, + std::string host_code, + argInfo arg_info, + std::string dev_code, std::string platform, + std::unordered_map options) + : func_(func), + host_(host_code), + arg_info_(arg_info), + dev_(dev_code), + platform_(platform), + options_(options) { + } + + const char* type_key() const { + return "unified_sim"; + } + + // unified simulation function + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv){ + + if (args.size() != (int)func_->args.size()) + LOG(FATAL) << "The function should take in " << func_->args.size() + << " inputs but get " << args.size(); + std::vector shmids; + std::vector arg_sizes; + std::vector arg_types; + + CollectArgInfo(args, func_, arg_sizes, arg_types); + GenSharedMem(args, shmids, arg_sizes); + + LOG(CLEAN) << "Generating harness files ..."; + system("rm -rf __tmp__; mkdir __tmp__"); + std::string path; + if (const auto* f = Registry::Get("get_util_path")) + path = (*f)(platform_).operator std::string(); + system(("cp -r " + path + "/* __tmp__/").c_str()); + LOG(CLEAN) << "Running SW simulation on " + platform_; + + if (platform_ == "sdaccel") { + GenWrapperCode(args, shmids, arg_types, arg_info_, func_); + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + GenKernelCode(dev_); + + LOG(CLEAN) << "Running SW simulation ..."; + system("cd __tmp__; source ./run_sw.sh"); + + } else if (platform_ == "rocket") { + // generate host and run proxy kernel test + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + std::string compile = "cd __tmp__;"; + compile += std::string("autoconf; mkdir build; cd build;") + + std::string("../configure --with-riscvtools=") + + options_["RISCV"] + std::string(";make -j8"); + system(compile.c_str()); + + } else if (platform_ == "vivado_hls") { + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + GenKernelCode(dev_); + system("cd __tmp__; make csim"); + } else { + LOG(FATAL) << "unrecognized platform " << platform_; + } + + // clean & extract resource information + FreeSharedMem(args, shmids, arg_sizes); + if (const auto* f = Registry::Get("tvm_callback_syn_postproc")) { + std::string code; + code = (*f)("test").operator std::string(); + LOG(CLEAN) << "extract res info"; + } + }); + } + + private: + LoweredFunc func_; + std::string host_; + argInfo arg_info_; + std::string dev_; + std::string platform_; + std::unordered_map options_; +}; + +using var2nameType = std::unordered_map>>; + +Module CreateSimModule( + LoweredFunc func, + std::string host_code, + std::string dev_code, + argInfo arg_types, + std::string platform, + std::unordered_map options) { + std::shared_ptr n = + std::make_shared(func, host_code, + arg_types, dev_code, + platform, options); + return Module(n); +} +} // namespace runtime + +namespace codegen { +using var2nameType = std::unordered_map>>; + +using argInfo = + std::vector>>; + +// 
unified simulation function for diff platforms +template +runtime::Module BuildSimModule(Array funcs, + Array attrs, + Array values) { + CodeAnalysMerlinC ca; + CGHost cg_host; + CGXcel cg_dev; + + for (LoweredFunc f : funcs) { + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + cg_host.AddFunction(f, map_arg_type); + cg_dev.AddFunction(f, map_arg_type); + } + // vector {vars} + auto& arg_vars = cg_dev.arg_vars; + // map {var : is_streamed(bool) } + auto& stream_table = cg_dev.stream_table; + // map {var : (vid, Type, shape)} + auto& arg_top_vars = cg_dev.arg_top_vars; + + argInfo arg_info; + for (size_t i = 0 ; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + auto nameType = arg_top_vars[v]; + bool is_stream; + if (stream_table[v]) + is_stream = true; + else is_stream = false; + auto item = std::make_tuple( + /*var name*/std::get<0>(nameType), + /*whether is streamed*/is_stream, + /*data type*/std::get<1>(nameType), + /*shape*/std::get<2>(nameType)); + arg_info.push_back(item); + } + // tool option mapping and platform + std::string platform = values[0].as()->value; + std::unordered_map options; + for (size_t k = 1; k < attrs.size(); k++) { + auto key = attrs[k].as()->value; + auto val = values[k].as()->value; + options[key] = val; + } + return runtime::CreateSimModule(funcs[0], + cg_host.GetHost(), + cg_dev.GetDevice(), + arg_info, platform, options); +} + +TVM_REGISTER_API("codegen.build_sim") +.set_body([](TVMArgs args, TVMRetValue* rv) { + // dispatch to corr codegen + auto& sptr = args[2].node_sptr(); + CHECK(sptr->is_type()); + auto* n = static_cast(sptr.get()); + auto data = n->data[static_cast(0)]; + + // create module node for simulation + std::string type = Expr(data).as()->value; + if (type == "rocket") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else if (type == "sdaccel") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else if (type == "vivado_hls") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else { + } + }); + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/build_common.h b/tvm/src/codegen/build_common.h index ee8cbc509..f9f42d219 100644 --- a/tvm/src/codegen/build_common.h +++ b/tvm/src/codegen/build_common.h @@ -29,6 +29,7 @@ ExtractFuncInfo(const Array& funcs) { } return fmap; } + } // namespace codegen } // namespace TVM #endif // TVM_CODEGEN_BUILD_COMMON_H_ diff --git a/tvm/src/codegen/build_opencl.cc b/tvm/src/codegen/build_opencl.cc deleted file mode 100644 index 5054085cd..000000000 --- a/tvm/src/codegen/build_opencl.cc +++ /dev/null @@ -1,44 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * Build opencl modules from source. 
- * \file build_opencl.cc - */ -#include -#include -#include "./codegen_opencl.h" -#include "./build_common.h" - -#if TVM_OPENCL_RUNTIME -#include "../runtime/opencl/opencl_module.h" -#endif // TVM_OPENCL_RUNTIME - -namespace TVM { -namespace codegen { - -runtime::Module BuildOpenCL(Array funcs) { - using TVM::runtime::Registry; - bool output_ssa = false; - CodeGenOpenCL cg; - cg.Init(output_ssa); - for (LoweredFunc f : funcs) { - cg.AddFunction(f); - } - std::string code = cg.Finish(); - - if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) { - code = (*f)(code).operator std::string(); - } -#if TVM_OPENCL_RUNTIME - return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(funcs)); -#else - LOG(WARNING) << "OpenCL runtime not enabled, return a source module..."; - return DeviceSourceModuleCreate(code, "cl", ExtractFuncInfo(funcs), "opencl"); -#endif // TVM_OPENCL_RUNTIME -} - -TVM_REGISTER_API("codegen.build_opencl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildOpenCL(args[0]); - }); -} // namespace codegen -} // namespace TVM diff --git a/tvm/src/codegen/build_util.cc b/tvm/src/codegen/build_util.cc new file mode 100644 index 000000000..e0a5f8b2d --- /dev/null +++ b/tvm/src/codegen/build_util.cc @@ -0,0 +1,812 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file build_common.cc + * \brief Build unified simulation module + */ +#include +#include +#include +#include +#include +#include +#include "./build_common.h" +#include "./build_util.h" + +#include +#include +#include +#include +#include + +#include "merlinc/codeanalys_merlinc.h" +#include "hlsc/codegen_vhls.h" +#include "opencl/codegen_aocl.h" +#include "ppac/codegen_rv64_ppac.h" + +namespace TVM { +namespace runtime { + +std::string getpath(void) { + char buff[256]; + getcwd(buff, 256); + std::string cwd(buff); + return cwd; +} + +void PrintIndent(std::ofstream& stream, int indent) { + for (int i = 0; i < indent; i++) + stream << ' '; +} + +inline size_t GetTypeSize(TVMType t) { + size_t byte = (t.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + return byte; +} + +inline size_t GetDataSize(TVMArray* arr) { + size_t size = 1; + for (tvm_index_t i = 0; i < arr->ndim; ++i) { + size *= arr->shape[i]; + } + size_t byte = (arr->dtype.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + size *= (byte * 8 * arr->dtype.lanes + 7) / 8; + return size; +} + +inline TVMType Type2TVMType(Type t) { + TVMType tt; + if (t.is_int()) tt.code = kDLInt; + else if (t.is_uint()) tt.code = kDLUInt; + else if (t.is_float()) tt.code = kDLFloat; + else LOG(FATAL) << "Unacceptable type: " << t; + tt.bits = static_cast(t.bits()); + tt.fracs = static_cast(t.fracs()); + return tt; +} + +inline std::string PrintHalideType(Type t) { + std::string str = ""; + if (t.is_uint() || t.is_int() || t.is_fixed() || t.is_ufixed()) { + if (t.is_uint()) str += "ap_uint<" + std::to_string(t.bits()) + ">"; + else if (t.is_int()) str += "ap_int<" + std::to_string(t.bits()) + ">"; + else if (t.is_ufixed()) str += "ap_ufixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">"; + else str += "ap_fixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">"; + } else { + LOG(FATAL) << "Cannot convert type " << t << " to C type"; + } + return str; +} + +inline std::string Type2Str(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += 
"ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits - t.fracs)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits - t.fracs)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2ExtStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += "ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2WrapStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) { + str += "ap_fixed<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + } else { + str += "ap_int<"; + if (t.bits <= 8) str += std::to_string(static_cast(t.bits)); + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + } + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) { + str += "ap_ufixed<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + } else { + str += "ap_uint<"; + if (t.bits <= 8) str += std::to_string(static_cast(t.bits)); + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + } + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2Byte(TVMType t) { + std::string str = ""; + if (t.code == kDLFloat) { + str += "float"; + } else if (t.code == kDLInt || t.code == kDLUInt) { + if (t.code == kDLUInt) str += "u"; + str += "int"; + if (t.bits <= 8) str += "8"; + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + str += "_t"; + } + return str; +} + +void CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + arg_sizes.push_back(GetDataSize(arr)); + arg_types.push_back(arr->dtype); + } else { + const Variable* var = func->api_args[i].as(); + TVMType t = Type2TVMType(var->type); + arg_sizes.push_back(GetTypeSize(t)); + arg_types.push_back(t); + } + } +} + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + // generate shared memory key and id + // TODO: maybe get the current path?? 
+ key_t key = ftok("/", i+1); + int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT); + shmids.push_back(shmid); + // copy mem from TVM args to the shared memory + void* mem = shmat(shmid, nullptr, 0); + memcpy(mem, arr->data, arg_sizes[i]); + } else { + shmids.push_back(0); + } + } +} + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes) { + for (size_t i = 0; i < shmids.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + int shmid = shmids[i]; + void* mem = shmat(shmid, nullptr, 0); + memcpy(arr->data, mem, arg_sizes[i]); + shmdt(mem); + shmctl(shmid, IPC_RMID, nullptr); + } + } +} + +// copy values from the shared mem to local mem +void PrintCopy(TVMArray* arr, + argInfo& arg_info, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim - 1) { + PrintIndent(stream, indent); + stream << std::get<0>(arg_info[nth_arr]); + stream << "[i" << arr->ndim-1; + int mul2 = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + + stream << " = ("; + // stream << Type2ExtStr(arr->dtype); + stream << Type2Byte(arr->dtype); + + stream << ")(arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "])"; + if (arr->dtype.fracs > 0) + stream << " >> " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +// copy values from local mem back to shared mem +void PrintCopyBack(TVMArray* arr, + argInfo& arg_info, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = ("; + stream << Type2Byte(arr->dtype); + stream << ")(" << std::get<0>(arg_info[nth_arr]); + stream << "[i" << arr->ndim - 1; + int mul2 = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + + stream << "])"; + if (arr->dtype.fracs > 0) + stream << " << " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +void GenKernelCode(std::string test_file) { + std::ofstream stream; + stream.open("__tmp__/kernel.cpp"); + stream << test_file; + stream.close(); +} + +// interface pragma to specify mem and ctrl interface in sdx +void GenWrapperCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + argInfo& arg_stream_types, + LoweredFunc func) { + std::ofstream stream; + int indent = 0; + std::string path(getenv("PWD")); + stream.open("__tmp__/interface.cpp"); + stream << "#include \n"; + stream << "#include \"" + path + 
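`GenSharedMem` and `FreeSharedMem` move tensor data across the process boundary with System V shared memory: `ftok` derives a key from a path and the argument index, `shmget` allocates the segment, `shmat` maps it, and `shmctl(IPC_RMID)` releases it after the copy-back. A minimal self-contained round trip using the same calls (POSIX only):

```cpp
#include <sys/ipc.h>
#include <sys/shm.h>
#include <cstring>
#include <cstdio>

int main() {
  int data[4] = {1, 2, 3, 4};

  // Same pattern as GenSharedMem: key from a path + index, then allocate.
  key_t key = ftok("/", 1);
  int shmid = shmget(key, sizeof(data), 0666 | IPC_CREAT);

  void* mem = shmat(shmid, nullptr, 0);  // map the segment
  memcpy(mem, data, sizeof(data));       // host -> shared (GenSharedMem)

  int back[4];
  memcpy(back, mem, sizeof(data));       // shared -> host (FreeSharedMem)
  shmdt(mem);                            // unmap
  shmctl(shmid, IPC_RMID, nullptr);      // mark the segment for removal

  printf("%d %d %d %d\n", back[0], back[1], back[2], back[3]);
}
```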
"/__tmp__/kernel.cpp\"\n"; + stream << "\n\n"; + stream << "extern \"C\" \n"; + stream << "{\n"; + indent += 2; + PrintIndent(stream, indent); + + // wrapper func interface + stream << "void App( "; + size_t ex_arg_count = 0; + ex_arg_count = arg_stream_types.size() - arg_types.size(); + for (size_t i = 0; i < arg_types.size(); i++) { + if (i != 0) stream << ", "; + stream << Type2WrapStr(arg_types[i]); + stream << "*"; + stream << " source_wrapper_" << i; + } + for (size_t k = 0; k < ex_arg_count; k++) { + if (k != ex_arg_count) stream << ", "; + stream << PrintHalideType(std::get<2>(arg_stream_types[k + arg_types.size()])); + stream << "*"; + stream << " source_wrapper_" << k + arg_types.size(); + } + stream << " ) {\n"; + + // memeory and control pragma + for (size_t i = 0; i < arg_stream_types.size(); i++) { + std::string interface; + if (std::get<1>(arg_stream_types[i])) interface = " m_axi "; + else interface = " m_axi "; + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE" + interface + "port="; + stream << "source_wrapper_" << i; + stream << " offset=slave bundle=gmem\n"; + } + for (size_t i = 0; i < arg_stream_types.size(); i++) { + std::string interface; + if (std::get<1>(arg_stream_types[i])) interface = " s_axilite "; + else interface = " s_axilite "; + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE" + interface + "port="; + stream << "source_wrapper_" << i; + stream << " bundle=control\n"; + } + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n"; + stream << "\n"; + + // intermediate vars init alloc + for (size_t i = 0; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << PrintHalideType(std::get<2>(arg_stream_types[i])); + stream << " source_wrapper_temp_" << i; + auto shape = std::get<3>(arg_stream_types[i]); + for (size_t j = 0; j < shape.size(); j++) + stream << "[" << shape[j] << "]"; + if (shape.size() == 0) stream << "[1]"; + stream << ";\n"; + } + + // vars init for values + for (size_t i = 0; i < arg_stream_types.size(); i++) { + auto shape = std::get<3>(arg_stream_types[i]); + for (size_t j = 0; j < shape.size(); j++) { + PrintIndent(stream, indent); + stream << "for (int i" << j << " = 0; "; + stream << "i" << j << " < " << shape[j] << "; "; + stream << "i" << j << "++) {\n"; + indent += 2; + if (j == shape.size() - 1) { + PrintIndent(stream, indent); + stream << "source_wrapper_temp_" << i; + for (size_t k = 0; k < shape.size(); k++) { + stream << "[i" << k << "]"; + } + stream << " = "; + stream << "source_wrapper_" << i; + stream << "[i" << shape.size() - 1; + int mul = 1; + for (size_t k = shape.size() - 1; k > 0; k--) { + mul *= shape[k]; + stream << "+ i" << k - 1 << "*" << mul; + } + stream << "];\n"; + } + } + for (size_t j = 0; j < shape.size(); j++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } + if (shape.size() == 0) { + PrintIndent(stream, indent); + stream << "source_wrapper_temp_" << i; + stream << "[0] = source_wrapper_" << i << "[0];\n"; + } + } + + // print top func + stream << "\n"; + PrintIndent(stream, indent); + stream << "top( "; + for (size_t i = 0;i < arg_stream_types.size(); i++) { + if (i != arg_stream_types.size() - 1){ + stream << "source_wrapper_temp_" << i; + stream << ", "; + } else { + stream << "source_wrapper_temp_" << i; + stream << ");\n"; + } + + } + stream << "\n"; + + // read back return val + for (int k = arg_stream_types.size() - 1; + k > args.size() - 2; k--) { + auto shape = 
std::get<3>(arg_stream_types[k]); + for (size_t i = 0; i < shape.size(); i++) { + PrintIndent(stream, indent); + stream << "for (int i" << i << " = 0; "; + stream << "i" << i << " < " << shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + + if (i == shape.size() - 1) { + PrintIndent(stream, indent); + stream << "source_wrapper_" << k; + stream << "[i" << shape.size() - 1; + int mul = 1; + for (size_t j = shape.size() - 1; j > 0; j--) { + mul *= shape[j]; + stream << " + i" << j - 1 << "*" << mul; + } + stream << " ] = "; + + stream << "source_wrapper_temp_" << k; + for (size_t j = 0; j < shape.size(); j++) { + stream << "[i" << j << "]"; + } + stream <<";\n"; + } + } + for (size_t i = 0;i < shape.size(); i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } + } + stream << "}\n"; + indent -= 2; + stream << "}\n"; + stream.close(); +} + +// generate opencl wrapper for sdaccel sim +void GenHostHeaders(std::ofstream& stream, + std::string platform) { + stream << "#include \n"; + stream << "#include \n\n"; + stream << "// standard C/C++ headers\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n\n"; + + if (platform == "sdaccel") { + stream << "// opencl harness headers\n"; + stream << "#include \"CLWorld.h\"\n"; + stream << "#include \"CLKernel.h\"\n"; + stream << "#include \"CLMemObj.h\"\n"; + stream << "#include \"utils.h\"\n"; + stream << "// harness namespace\n"; + stream << "using namespace rosetta;\n"; + } else if (platform == "vivado_hls") { + stream << "// vivado hls headers\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \"kernel.cpp\"\n\n"; + } +} + +// initialization before executing kernel +void KernelInit(std::ofstream& stream, + std::string platform, + TVMArgs& args, + const std::vector& arg_types, + argInfo& arg_stream_types) { + int indent = 2; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// parse command line arguments for opencl version\n"; + PrintIndent(stream, indent); + stream << "std::string kernelFile(\"\");\n"; + PrintIndent(stream, indent); + stream << "parse_sdaccel_command_line_args(argc, argv, kernelFile);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// create OpenCL world\n"; + PrintIndent(stream, indent); + stream << "CLWorld world = CLWorld(TARGET_DEVICE, CL_DEVICE_TYPE_ACCELERATOR);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add the bitstream file\n"; + PrintIndent(stream, indent); + stream << "dworld.addProgram(kernelFile);\n"; + stream << "\n\n"; + PrintIndent(stream, indent); + stream << "// create kernels\n"; + PrintIndent(stream, indent); + stream << "CLKernel App(world.getContext(), world.getProgram(), \"App\", world.getDevice());\n"; + stream << "\n\n"; + + PrintIndent(stream, indent); + stream << "// create mem objects\n"; + for (int i = 0;i < args.size(); i++) { + PrintIndent(stream, indent); + stream << "CLMemObj source_" << i; + stream << "((void*)arg_top_" << i; + stream << ", sizeof(" << Type2Byte(arg_types[i]) << "), "; + + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + if (j==0) { + stream << arr->shape[j] << " "; + } else { + stream << "* " << arr->shape[j]; + } + } + } else { + stream << "1"; + } + stream << ", "; + stream << "CL_MEM_READ_WRITE);\n"; + } + // additional streamed data + for (size_t k = 
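`KernelInit` writes the Rosetta OpenCL harness boilerplate: a `CLWorld` is created, the bitstream is attached, an `App` kernel is registered, and each top-level argument becomes a `CLMemObj`. A sketch of the emitted host fragment (harness classes come from the Rosetta headers listed in `GenHostHeaders`; the argument name and size are illustrative, and the program-add call is shown on the `world` instance created above it):

```cpp
// parse command line arguments for opencl version
std::string kernelFile("");
parse_sdaccel_command_line_args(argc, argv, kernelFile);

// create OpenCL world and add the bitstream file
CLWorld world = CLWorld(TARGET_DEVICE, CL_DEVICE_TYPE_ACCELERATOR);
world.addProgram(kernelFile);

// create kernels
CLKernel App(world.getContext(), world.getProgram(), "App", world.getDevice());

// create mem objects: one per top-level argument
CLMemObj source_0((void*)arg_top_0, sizeof(int32_t), 10, CL_MEM_READ_WRITE);
world.addMemObj(source_0);
```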
args.size(); k < arg_stream_types.size(); k++) { + auto type = std::get<2>(arg_stream_types[k]); + auto shape = std::get<3>(arg_stream_types[k]); + PrintIndent(stream, indent); + stream << "CLMemObj source_" << k; + stream << "((void*)knn_mat"; + stream << ", sizeof(" << Type2Byte(Type2TVMType(type)) << "), "; + if (shape.size() > 0) { + for (size_t j = 0; j < shape.size(); j++) { + if (j == 0) { + stream << shape[j] << " "; + } else { + stream << "* " << shape[j]; + } + } + } else { + stream << "1"; + } + stream << ", "; + stream << "CL_MEM_READ_WRITE);\n"; + } + + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add them to the world\n"; + for (size_t i = 0;i < arg_stream_types.size();i++) { + PrintIndent(stream, indent); + stream << "world.addMemObj(source_" << i; + stream << ");\n"; + } + + stream << "\n\n"; + PrintIndent(stream, indent); + stream << " // set work size\n"; + PrintIndent(stream, indent); + int size = arg_stream_types.size(); + std::string arr = "[" + std::to_string(size) + "] = {"; + for (int i = 0; i < size; i++) { + if (i != size -1) arr += "1, "; + else arr += "1};\n"; + } + stream << "int global_size" + arr; + PrintIndent(stream, indent); + stream << "int local_size" + arr; + PrintIndent(stream, indent); + stream << "App.set_global(global_size);\n"; + PrintIndent(stream, indent); + stream << "App.set_local(local_size);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add them to the world\n"; + PrintIndent(stream, indent); + stream << "world.addKernel(App);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// set kernel arguments\n"; + for (size_t i = 0; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << "world.setMemKernelArg(0, "<< i << ", " << i; + stream << ");\n"; + } + + stream << "\n"; + PrintIndent(stream, indent); + stream << "// run\n"; + PrintIndent(stream, indent); + stream << "world.runKernels();\n\n"; + PrintIndent(stream, indent); + stream << "// read the data back\n"; + for (size_t i = args.size() - 1; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << "world.readMemObj(" << i << ");\n"; + } +} + +// generate host code according to platform type +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc lowered_func, + std::string platform, + std::string host_code, + argInfo& arg_info) { + int indent = 0; + std::ofstream stream; + stream.open("__tmp__/host.cpp"); + GenHostHeaders(stream, platform); + + stream << "int main(int argc, char ** argv) {\n"; + indent += 2; + + int cnt = 0; // label the constant value + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + // read from the shared memory + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << "* "; + stream << "arg_" << i << " = "; + stream << "(" << Type2Byte(arg_types[i]) << "*)"; + stream << "shmat(" << shmids[i] << ", nullptr, 0);\n"; + PrintIndent(stream, indent); + + stream << Type2Byte(arg_types[i]) << " "; + stream << std::get<0>(arg_info[i]); + TVMArray* arr = args[i]; + + stream << "["; + for (int j = 0; j < arr->ndim; j++) { + if (j == arr->ndim - 1) { + stream << arr->shape[j]; + } else { + stream << arr->shape[j]; + stream << " * "; + } + } + stream << "];\n"; + PrintCopy(arr, arg_info, stream, indent, i); + + } else { + // directly assign the value to the variable + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << " "; + stream << "arg_" << i << " = "; + stream 
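For an array argument, `GenHostCode` first attaches the shared segment created by `GenSharedMem`, then declares a flat local buffer and fills it with `PrintCopy` (applying the fixed-point shift when `fracs > 0`). The emitted `host.cpp` fragment for a 1-D int32 tensor of length 10 would look roughly like this (the shmid literal is illustrative):

```cpp
int32_t* arg_0 = (int32_t*)shmat(1234, nullptr, 0);  // attach shared segment
int32_t A[10];                                       // flat local buffer
for (size_t i0 = 0; i0 < 10; i0++) {
  A[i0] = (int32_t)(arg_0[i0]);                      // PrintCopy; no fracs shift here
}
```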
<< "(" << Type2Byte(arg_types[i]) << ")"; + if (args[i].type_code() == kDLInt || + args[i].type_code() == kDLUInt) { + stream << int64_t(args[i]); + } + stream << ";\n"; + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << " "; + stream << "arg_top_" << i; + stream << "[1] = { "; + + stream << "arg_" << i << " }"; + if (arg_types[i].fracs > 0) + stream << " >> " << static_cast(arg_types[i].fracs); + stream << ";\n"; + cnt += 1; + } + stream << "\n"; + } + + // allocate mem for stream vars + for (size_t k = args.size(); k < arg_info.size(); k++) { + auto type = std::get<2>(arg_info[k]); + auto shape = std::get<3>(arg_info[k]); + PrintIndent(stream, indent); + stream << Type2Byte(Type2TVMType(type)) << " " << "name["; + if (shape.size() > 0) { + for (size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) + stream << shape[i] << " * "; + else stream << shape[i]; + } + } else { + stream << "1"; + } + stream << "];\n"; + } + + // generate host side (before kernel) + PrintIndent(stream, indent); + stream << "printf(\"Finished setting up shared memory\\n\");\n"; + PrintIndent(stream, indent); + stream << "// compute bofore kernel function\n"; + size_t pos = host_code.find("top("); + std::string pre_kernel = host_code.substr(0, pos -1); + std::string post_kernel = host_code.substr(host_code.find('\n', pos) + 1); + pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of("\n")); + pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of(" ")); + PrintIndent(stream, indent); + + if (platform == "sdaccel") { + // create variable wrapper + stream << pre_kernel << "\n"; + KernelInit(stream, platform, args, + arg_types, arg_info); + } else if (platform == "vivado_hls") { + // init hls stream channels + for (size_t k = 0; k < arg_info.size(); k++) { + auto info = arg_info[k]; + if (std::get<1>(info)) { + PrintIndent(stream, indent); + stream << "hls::stream<" + << PrintHalideType(std::get<2>(info)) + << "> " << "fd_" << std::get<0>(info) << ";\n"; + } + } + PrintIndent(stream, indent); + stream << pre_kernel << "\n"; + PrintIndent(stream, indent); + // create kernel call from host + stream << "top("; + for (size_t i = 0; i < arg_info.size(); i++) { + auto info = arg_info[i]; + auto name = std::get<0>(info); + if (i != 0) stream << ", "; + stream << "fd_" << name; + } + stream << ");\n"; + } + + // generate host (post-kernel) + PrintIndent(stream, indent); + stream << "// compute after kernel function\n"; + stream << post_kernel; + + // copy to shared mem + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + PrintCopyBack(arr, arg_info, stream, indent, i); + PrintIndent(stream, indent); + stream << "shmdt("; + stream << "arg_" << i << ");\n"; + } + } + + stream << "\n\n"; + PrintIndent(stream, indent); + stream << "}\n"; + stream.close(); + +} +} // namespace runtime +} // namespace TVM diff --git a/tvm/src/codegen/build_util.h b/tvm/src/codegen/build_util.h new file mode 100644 index 000000000..ca95364c1 --- /dev/null +++ b/tvm/src/codegen/build_util.h @@ -0,0 +1,70 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * Common build utilities + * \file build_util.h + */ +#ifndef TVM_CODEGEN_BUILD_HELPER_H_ +#define TVM_CODEGEN_BUILD_HELPER_H_ + +#include +#include +#include +#include "../runtime/meta_data.h" + +namespace TVM { +namespace runtime { + +using argInfo = + std::vector>>; + +// get current work directory +std::string getpath(void); +void PrintIndent(std::ofstream& stream, int indent); +inline size_t GetTypeSize(TVMType t); +inline size_t GetDataSize(TVMArray* arr); +inline TVMType Type2TVMType(Type t); +inline std::string PrintHalideType(Type t); +inline std::string Type2Str(TVMType t); +inline std::string Type2ExtStr(TVMType t); +inline std::string Type2WrapStr(TVMType t); +inline std::string Type2Byte(TVMType t); + +void CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types); + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes); + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes); + +void PrintCopy(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr); + +void PrintCopyBack(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr); + +void GenKernelCode(std::string test_file); + +void GenWrapperCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + argInfo& arg_info, + LoweredFunc func); + +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc func, + std::string platform, + std::string host_code, + argInfo& arg_info); +} // namespace runtime +} // namespace TVM +#endif // TVM_CODEGEN_BUILD_HELPER_H_ diff --git a/tvm/src/codegen/codegen_c.cc b/tvm/src/codegen/codegen_c.cc index 7373711f4..006edf933 100644 --- a/tvm/src/codegen/codegen_c.cc +++ b/tvm/src/codegen/codegen_c.cc @@ -2,9 +2,12 @@ * Copyright (c) 2017 by Contributors * \file codegen_c.cc */ +#include +#include #include #include #include "./codegen_c.h" +#include "./merlinc/codeanalys_merlinc.h" #include "../arithmetic/compute_expr.h" namespace TVM { @@ -12,6 +15,123 @@ namespace codegen { using namespace ir; +Type String2Type(std::string& s) { + if (s.front() == '\"' && s.back() == '\"') { + s.erase(0, 1); + s.pop_back(); + } + std::istringstream is(s); + halideir_type_code_t code = Type::Int; + if (s.substr(0, 3) == "int") { + code = Type::Int; s = s.substr(3); + } else if (s.substr(0, 4) == "uint") { + code = Type::UInt; s = s.substr(4); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s == "handle") { + return Handle(); + } else { + LOG(FATAL) << "unknown type " << s; + } + int bits = 32, lanes = 1; + if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) { + LOG(FATAL) << "unknown type " << s; + } + return Type(code, bits, lanes); +} + +// generate row major index +std::string getIndex(std::vector shape) { + std::string str; + int mul = 1; + for (size_t i = shape.size(); i > 0; i--) { + mul = mul * shape[i-1]; + str += "i" + std::to_string(i-1) + + "*" + std::to_string(mul); + if (i != 1) str += "+ "; + } + return str; +} + +// collect type info for vars +void TypeCollector::Visit_(const Allocate *op) { + auto v = op->buffer_var.get(); + if (top_args_.count(v)) { + std::vector shape; + for (size_t i = 0; i < op->extents.size(); i++) + shape.push_back(op->extents[i].as()->value); + top_args_[v] = std::make_tuple(std::get<0>(top_args_[v]), 
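`String2Type` reverses `PrintType`: it strips optional surrounding quotes, matches the type-code prefix, then parses the bit width (and an optional `x<lanes>` suffix) with `sscanf("%dx%d")`. Expected results for a few inputs, assuming the `Type(code, bits, lanes)` constructor shown above:

```cpp
// "int32"    -> Type(Int,   32, 1)
// "uint1"    -> Type(UInt,   1, 1)
// "float32"  -> Type(Float, 32, 1)
// "int32x4"  -> Type(Int,   32, 4)   // lanes picked up by sscanf("%dx%d")
// "\"int8\"" -> quotes removed first, then Type(Int, 8, 1)
// "handle"   -> Handle()
```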
op->type, shape); + } + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const Allocate *op) { + this->HandleDef(op->buffer_var.get()); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const Load *op) { + this->HandleUse(op->buffer_var); + IRVisitor::Visit_(op); +} + +// update placeholder status +void StreamCollector::Visit_(const Store* op) { + if (auto val = op->value.as()) { + this->HandleDef(op->buffer_var.get()); + } + this->HandleUse(op->buffer_var); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const StreamStmt* op) { + this->HandleDef(op->buffer_var.get()); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const AttrStmt* op) { + if (op->attr_key == attr::device_scope) { + if (op->value.as()->value != scope_) + switch_on = true; + else switch_on = false; + } + IRVisitor::Visit_(op); +} + +// additional data saved into stream table +void StreamCollector::HandleDef(const Variable* v) { + if (!switch_on) { // def on host scope + CHECK(!host_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!host_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + host_use_count_[v] = 0; + host_def_count_[v] = 1; + } +} + +void StreamCollector::HandleUse(const Expr& v) { + CHECK(v.as()); + Var var(v.node_); + auto it = host_use_count_.find(var.get()); + if (!switch_on) { // def on host scope + if (it != host_use_count_.end()) { + if (it->second >= 0) { + ++it->second; + } + } else { + if (!stream_table_.count(var.get())) { + host_undefined_.push_back(var); + host_use_count_[var.get()] = -1; + } + } + } +} + void CodeGenC::Init(bool output_ssa) { print_ssa_form_ = output_ssa; } @@ -19,44 +139,50 @@ void CodeGenC::Init(bool output_ssa) { void CodeGenC::InitFuncState(LoweredFunc f) { alloc_storage_scope_.clear(); handle_data_type_.clear(); + var_shape_map_.clear(); + range_.clear(); CodeGenSourceBase::ClearFuncState(); } -void CodeGenC::AddFunction(LoweredFunc f) { + +void CodeGenC::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { // clear previous generated state. this->InitFuncState(f); - // skip the first underscore, so SSA variable starts from _1 - GetUniqueName("_"); + map_arg_type_ = map_arg_type; // add to alloc buffer type. 
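The collector only tracks definitions and uses while on the host side: the `device_scope` attribute flips `switch_on`, and any variable used on the host without a prior host definition (and absent from the stream table) lands in `host_undefined_`, which later becomes the set of streamed channels. A toy version of the same bookkeeping over a flat event list, in place of the real IR visitor:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (event, name): "def"/"use" on variables, "scope" switches cpu/fpga.
  std::vector<std::pair<std::string, std::string>> events = {
    {"def", "a"}, {"scope", "fpga"}, {"def", "b"},
    {"scope", "cpu"}, {"use", "a"}, {"use", "b"}};

  std::set<std::string> host_defs, host_undefined;
  bool switch_on = false;  // true while inside the device scope
  for (auto& e : events) {
    if (e.first == "scope") switch_on = (e.second == "fpga");
    else if (e.first == "def" && !switch_on) host_defs.insert(e.second);
    else if (e.first == "use" && !switch_on && !host_defs.count(e.second))
      host_undefined.insert(e.second);  // needs a stream channel
  }
  for (auto& v : host_undefined) std::cout << v << "\n";  // prints "b"
}
```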
for (const auto & kv : f->handle_data_type) { RegisterHandleType(kv.first.get(), kv.second.type()); } + // generate function signature this->stream << "void " << f->name << "("; for (size_t i = 0; i < f->args.size(); ++i) { Var v = f->args[i]; std::string vid = AllocVarID(v.get()); if (i != 0) stream << ", "; - if (v.type().is_handle()) { - auto it = alloc_storage_scope_.find(v.get()); - if (it != alloc_storage_scope_.end()) - PrintStorageScope(it->second, stream); - stream << ' '; - - if (handle_data_type_.count(v.get())) { - PrintType(handle_data_type_.at(v.get()), stream); - } else { - stream << "void"; - } - stream << "*"; - - if (f->is_restricted && restrict_keyword_.length() != 0) { - stream << ' ' << restrict_keyword_; - } + // check type in the arg map + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; } else { - PrintType(v.type(), stream); + auto arg = map_arg_type[vid]; + PrintType(std::get<1>(arg), this->stream); + this->stream << "* " << std::get<0>(arg); + const BufferNode* buf = f->api_args[i].as(); + if (v.type().is_handle() && buf) { + std::vector shape; + for (size_t i = 0; i < buf->shape.size(); i++) + shape.push_back(buf->shape[i].as()->value); + arg_shapes.push_back(shape); + var_shape_map_[buf->data.get()] = buf->shape; + auto it = alloc_storage_scope_.find(v.get()); + if (it != alloc_storage_scope_.end()) + PrintStorageScope(it->second, stream); + } } - stream << ' ' << vid; } + stream << ") {\n"; int func_scope = this->BeginScope(); this->PrintStmt(f->body); @@ -65,8 +191,49 @@ void CodeGenC::AddFunction(LoweredFunc f) { this->stream << "}\n\n"; } +std::string CodeGenC::GetHost() { + if (!fpga_scope_) + host_stream << stream.str(); + std::string postproc = host_stream.str(); + postproc.erase(postproc.rfind("}") - 1, + postproc.length() - 1); + postproc.erase(0, postproc.find("{") + 1); + return postproc + "\n\n"; +} + +std::string CodeGenC::GetDevice() { + std::ostringstream device; + device << "void top(" << arg_stream.str() << "){\n"; + + // process device code + PreProcess(device); + // remove the kernel name alloc + auto text = device_stream.str(); + for (auto const& m : stream_arg_pos) { + std::string alloc = m.first + ";"; + size_t nFPos = text.find(alloc); + size_t secondNL = text.find('\n', nFPos); + size_t firstNL = text.rfind('\n', nFPos); + text.erase(firstNL, secondNL - firstNL); + } + device << text; + PostProcess(device); + + if (fpga_scope_) device << stream.str(); + return decl_stream.str() + module_stream.str() + + device.str() + "}\n\n"; +} + std::string CodeGenC::Finish() { - return decl_stream.str() + stream.str(); + std::ostringstream device; + device << "void top(" << arg_stream.str() + << "){\n" << device_stream.str(); + if (fpga_scope_) device << stream.str(); + else host_stream << stream.str(); + device << "}\n"; + return decl_stream.str() + "\n{device}\n" + + module_stream.str() + device.str() + "\n{device}\n" + + "\n{host}\n" + host_stream.str() + "\n{host}\n"; } void CodeGenC::PrintExpr(const Expr& n, std::ostream& os) { // NOLINT(*) @@ -286,7 +453,7 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { / void CodeGenC::PrintType(Type t, std::ostream& os) { // NOLINT(*) CHECK_EQ(t.lanes(), 1) - << "do not yet support vector types"; + << "do not yet support vector types"; if (t.is_handle()) { os << "void*"; return; } @@ -314,7 +481,6 @@ void CodeGenC::PrintType(Type t, std::ostream& os) { // 
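After the scope split, `Finish` no longer returns one flat stream: declarations, module-level kernels, the device `top` body, and the host statements are concatenated with `{device}` / `{host}` sentinel strings bracketing the two halves, so the simulation builder can carve them apart later (`GetHost` / `GetDevice` do the same extraction directly). The returned string is shaped roughly like:

```cpp
// <decl_stream: headers, typedefs>
//
// {device}
// <module_stream: emitted KernelDef functions>
// void top(<arg_stream parameter list>){
//   <device-scope statements>
// }
// {device}
//
// {host}
// <host-scope statements, including the generated top(...) call>
// {host}
```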
NOLINT(*) LOG(FATAL) << "Cannot convert type " << t << " to C type"; } - inline void PrintConst(const IntImm* op, std::ostream& os, CodeGenC* p) { // NOLINT(*) if (op->type == Int(32)) { std::ostringstream temp; @@ -619,7 +785,7 @@ void CodeGenC::VisitStmt_(const Store* op) { Type t = op->value.type(); if (t.lanes() == 1) { std::string value = this->PrintExpr(op->value); - std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index); + std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index); this->PrintIndent(); stream << ref << " = " << value << ";\n"; } else { @@ -714,49 +880,92 @@ void CodeGenC::VisitExpr_(const GetSlice *op, std::ostream& os) { // NOLINT(*) } void CodeGenC::VisitExpr_(const SetBit *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "SetBit is not implemented yet"; + LOG(FATAL) << "SetBit is not implemented yet in C"; } void CodeGenC::VisitExpr_(const SetSlice *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "SetSlice is not implemented yet"; + LOG(FATAL) << "SetSlice is not implemented yet in C"; } void CodeGenC::VisitExpr_(const Quantize *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "Quantize is not yet support"; + LOG(FATAL) << "Quantize is not yet support in C"; +} + +void CodeGenC::VisitExpr_(const StreamExpr *op, std::ostream& os) { // NOLINT(*) + auto v = op->buffer_var.get(); + auto it = var_idmap_.find(v); + CHECK(it != var_idmap_.end()) + << "variable " << v->name_hint << " not decalred"; } void CodeGenC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "KernelExpr is not yet support"; + os << op->name << "("; + for (size_t i = 0; i < op->args.size(); ++i) { + PrintExpr(op->args[i], os); + if (i != op->args.size() - 1) os << ", "; + } + os << ")"; +} + +void CodeGenC::VisitStmt_(const StreamStmt *op) { // NOLINT(*) + CHECK(!var_idmap_.count(op->buffer_var.get())); + std::string vid = AllocVarID(op->buffer_var.get()); + vid = GetVarID(op->value.as()->buffer_var.get()); + PrintIndent(); + auto load_op = op->value.as(); + auto v = load_op->buffer_var.as(); + // placeholder args using recv name + if (stream_table.count(v)) { + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(vid, std::get<1>(tuple), + std::get<2>(tuple)); + stream_table[v] = true; + } // else: streamed externop defined in analysis + // PrintExpr(op->value, stream); + // stream << vid << ".write()\n"; } void CodeGenC::VisitStmt_(const LetStmt* op) { std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); if (print_ssa_form_) { CHECK(!var_idmap_.count(op->var.get())); var_idmap_[op->var.get()] = value; } else { PrintIndent(); - if (op->var.type() == Handle() && - handle_data_type_.count(op->var.get())) { - PrintType(handle_data_type_.at(op->var.get()), stream); - stream << "* " - << AllocVarID(op->var.get()) - << " = ("; - PrintType(handle_data_type_.at(op->var.get()), stream); - stream << "*)" << value << ";\n"; - } else { + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); PrintType(op->var.type(), this->stream); this->stream << ' ' - << AllocVarID(op->var.get()) + << vid << " = " << value << ";\n"; + // modify var idmap for passed in args + } else if (value.find("data") != std::string::npos || + value.substr(0, 3) == "arg") { + auto v = op->var.get(); + arg_vars.push_back(v); + stream_table[v] = false; + std::string api_name = "arg" + 
std::to_string(arg_count); + auto arg = map_arg_type_[api_name]; + // PrintType(std::get<1>(arg), arg_stream); + CHECK(arg_count < arg_shapes.size()); + auto shape = arg_shapes[arg_count]; + arg_top_vars[v] = std::make_tuple(vid, std::get<1>(arg), shape); + arg_count += 1; } + PrintStmt(op->body); } - PrintStmt(op->body); } void CodeGenC::VisitStmt_(const Allocate* op) { CHECK(!is_zero(op->condition)); - std::string vid = AllocVarID(op->buffer_var.get()); + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); if (op->new_expr.defined()) { // Prefer global static allocation for the program CHECK_EQ(op->free_function, "nop"); @@ -799,6 +1008,64 @@ void CodeGenC::VisitStmt_(const AttrStmt* op) { const Variable* v = op->node.as(); CHECK(v); volatile_buf_.insert(v); + } else if (op->attr_key == ir::attr::device_scope) { + // print top( ... in host and enter fpga scope + if (op->value.as()->value == "fpga" && !fpga_scope_) { + fpga_scope_ = true; + PrintIndent(); + + // track the stream usage + StreamCollector collector(stream_table, "cpu"); + collector.Visit(op->body); + + // update data type and name + for (auto k : collector.host_undefined_) { + auto v = k.get(); + arg_vars.push_back(v); + stream_table[v] = true; + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(v->name_hint, + std::get<1>(tuple), + std::get<2>(tuple)); + } + TypeCollector visitor(arg_top_vars); + visitor.Visit(op->body); + + // generte function calls + stream << "top("; + int index = 0; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + if (index !=0) stream << ", "; + stream << arg_name; + // print kernel func signature + if (index != 0) arg_stream << ", "; + PrintType(std::get<1>(arg_top_vars[v]), arg_stream); + auto shape = std::get<2>(arg_top_vars[v]); + arg_stream << " " << arg_name; + for (size_t k = 0; k < shape.size(); k++) + arg_stream << "[" << shape[k] << "]"; + index++; + } + stream << ");\n"; + + // switch context to device scope + host_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // swtich from device to host + } else if (op->value.as()->value == "cpu" && + fpga_scope_) { + fpga_scope_ = false; + device_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + } } this->PrintStmt(op->body); } @@ -889,17 +1156,75 @@ void CodeGenC::VisitStmt_(const ProducerConsumer *op) { PrintStmt(op->body); } -void CodeGenC::VisitStmt_(const KernelDef *op) { - LOG(FATAL) << "KernelDef is not yet support"; +void CodeGenC::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + // save func states + SaveFuncState(f); + InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. 
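On entering the `fpga` scope, the host side prints the `top(...)` invocation while `arg_stream` accumulates the matching parameter list, so the device signature is derived from the call site itself. For two 10x10 arguments the two halves would come out roughly as below (the type spelling depends on the backend's `PrintType`; plain `int` is shown here as an assumption for the base C printer):

```cpp
// host side (stream):
top(arg_top_0, arg_top_1);

// device side (arg_stream spliced in by GetDevice):
void top(int arg_top_0[10][10], int arg_top_1[10][10]) {
  // ... device statements ...
}
```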
+ for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + // print function signature + PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + for (size_t k = 0; k < op->channels.size(); k+=2) { + int pos = op->channels[k].as()->value; + stream_arg_pos[op->name].insert(pos); + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) stream << ", "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + PrintType(type, stream); + this->stream << " " << vid << "["; + if (v.type().is_handle()) { + for (size_t j = 0; j < op->api_args[i].size(); j++) { + if (j != 0) stream << "* "; + auto dim = op->api_args[i][j].as()->value; + this->stream << dim; + } + this->stream << ']'; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); } void CodeGenC::VisitStmt_(const KernelStmt *op) { - LOG(FATAL) << "KernelStmt is not yet support"; + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + PrintExpr(op->args[i], stream); + if (i < op->args.size() -1) stream << ", "; + } + stream << ");\n"; } void CodeGenC::VisitStmt_(const Return *op) { this->stream << "return "; - PrintExpr(op->value); + PrintExpr(op->value, stream); this->stream << ";\n"; } @@ -922,5 +1247,28 @@ void CodeGenC::VisitStmt_(const While *op) { void CodeGenC::VisitStmt_(const Partition* op) { } +void CodeGenC::SaveFuncState(LoweredFunc f) { + // clear save info copy + alloc_storage_scope_save.clear(); + handle_data_type_save.clear(); + var_shape_map_save.clear(); + range_save.clear(); + // backup func info and clear + alloc_storage_scope_save = alloc_storage_scope_; + handle_data_type_save = handle_data_type_; + var_shape_map_save = var_shape_map_; + range_save = range_; + CodeGenSourceBase::SaveFuncState(); +} + +void CodeGenC::RestoreFuncState(LoweredFunc f) { + this->InitFuncState(f); + alloc_storage_scope_ = alloc_storage_scope_save; + handle_data_type_ = handle_data_type_save; + var_shape_map_ = var_shape_map_save; + range_ = range_save; + CodeGenSourceBase::RestoreFuncState(); +} + } // namespace codegen } // namespace TVM diff --git a/tvm/src/codegen/codegen_c.h b/tvm/src/codegen/codegen_c.h index f579ca579..d7292b38f 100644 --- a/tvm/src/codegen/codegen_c.h +++ b/tvm/src/codegen/codegen_c.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -15,11 +16,64 @@ #include #include #include "./codegen_source_base.h" +#include "./merlinc/codeanalys_merlinc.h" +#include "../runtime/thread_storage_scope.h" namespace TVM { namespace codegen { using namespace ir; +template +using str2tupleMap = std::unordered_map>; +using var2nameType = std::unordered_map>>; + +Type String2Type(std::string& s); +std::string getIndex(std::vector shape); + +/*! + * \brief A data type collector + * + * CodeGenC TypeCollector gathers information + * of different types of each variable + * + */ +class TypeCollector final : public IRVisitor { + public: + var2nameType& top_args_; + TypeCollector(var2nameType& top_args) : top_args_(top_args) {}; + void Visit_(const Allocate *op); +}; + +/*! 
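A `KernelDef` carries its streamed-argument positions in `channels` as flattened (position, channel-id) pairs; stepping by two recovers just the positions, which is what later lets `KernelStmt` prefix those arguments with `fd_`. A small sketch of that decoding:

```cpp
#include <iostream>
#include <set>
#include <vector>

int main() {
  // Flattened (position, channel id) pairs, as stored in op->channels.
  std::vector<int> channels = {1, 7, 3, 8};

  std::set<int> stream_arg_pos;
  for (size_t k = 0; k < channels.size(); k += 2)
    stream_arg_pos.insert(channels[k]);  // keep every even entry: 1, 3

  for (int pos : stream_arg_pos) std::cout << pos << " ";  // "1 3"
}
```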
+ * \brief An undefined variable collector + * + * CodeGenC stream data collector detects undefined + * variable and create channels for them + * + * */ +class StreamCollector final : public IRVisitor { + public: + Array host_undefined_; + std::unordered_map host_use_count_; + std::unordered_map host_def_count_; + StreamCollector(std::unordered_map& stream_table, + std::string initial_scope) + : stream_table_(stream_table), + scope_(initial_scope) {}; + void Visit_(const Allocate *op); + void Visit_(const Load *op); + void Visit_(const Store *op); + void Visit_(const StreamStmt *op); + void Visit_(const AttrStmt *op); + void HandleDef(const Variable* v); + void HandleUse(const Expr& v); + private: + std::unordered_map& stream_table_; + std::string scope_; + bool switch_on{true}; +}; + /*! * \brief A base class to generate C code. * @@ -44,12 +98,22 @@ class CodeGenC : * \brief Add the function to the generated module. * \param f The function to be compiled. */ - void AddFunction(LoweredFunc f); + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); /*! * \brief Finalize the compilation and return the code. * \return The code. */ std::string Finish(); + /*! + * \brief Finalize the compilation and return the code. + * \return The host code. + */ + std::string GetHost(); + /*! + * \brief Finalize the compilation and return the code. + * \return The device code. + */ + std::string GetDevice(); /*! * \brief Print the Stmt n to CodeGenC->stream * \param n The statement to be printed. @@ -113,6 +177,7 @@ class CodeGenC : void VisitExpr_(const SetSlice* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const Quantize* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const KernelExpr* op, std::ostream& os) override; // NOLINT(*) + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; // NOLINT(*) // statment void VisitStmt_(const LetStmt* op) override; void VisitStmt_(const Store* op) override; @@ -126,6 +191,7 @@ class CodeGenC : void VisitStmt_(const ProducerConsumer* op) override; void VisitStmt_(const KernelDef* op) override; void VisitStmt_(const KernelStmt* op) override; + void VisitStmt_(const StreamStmt* op) override; void VisitStmt_(const Return* op) override; void VisitStmt_(const Break* op) override; void VisitStmt_(const While* op) override; @@ -159,10 +225,38 @@ class CodeGenC : // print store of single element. 
virtual void PrintVecElemStore( const std::string& vec, Type t, int i, const std::string& value); - // Get a cast type from to + // get a cast type from to virtual std::string CastFromTo(std::string value, Type from, Type target); + // map from var to shape, range and type + std::map > var_shape_map_; + std::unordered_map range_; + str2tupleMap map_arg_type_; + + // save for kernel + std::map > var_shape_map_save; + std::unordered_map range_save; + + // index into ap_arg_type + size_t arg_count{0}; + // map {var : (vid, Type, shape)} + var2nameType arg_top_vars; + // vector {vars} in top function + std::vector arg_vars; + // vector of top function arg dimension + std::vector> arg_shapes; + // whether the function arg is streamed + std::unordered_map stream_table; + // map from kernel name to set of streamed arg position index + std::unordered_map> stream_arg_pos; + // pre and post processing device code + virtual void PreProcess(std::ostringstream& os) {}; + virtual void PostProcess(std::ostringstream& os) {}; + protected: + void SaveFuncState(LoweredFunc f); + void RestoreFuncState(LoweredFunc f); + // Print reference to struct location std::string GetStructRef( Type t, const Expr& buffer, const Expr& index, int kind); @@ -186,12 +280,22 @@ class CodeGenC : const std::string& target, const std::string& src, Type t) final; /*! \brief restrict keyword */ std::string restrict_keyword_{""}; + /*! \brief the func arg decl stream */ + std::ostringstream arg_stream; /*! \brief the storage scope of allocation */ std::unordered_map alloc_storage_scope_; /*! \brief the data type of allocated buffers */ std::unordered_map handle_data_type_; std::unordered_map buf_length_map_; + // save for kernel gen + std::unordered_map alloc_storage_scope_save; + std::unordered_map handle_data_type_save; + std::unordered_map var_idmap_save; + std::unordered_map name_alloc_map_save; + std::unordered_map ssa_assign_map_save; + std::vector scope_mark_save; + private: /*! \brief whether to print in SSA form */ bool print_ssa_form_{false}; diff --git a/tvm/src/codegen/codegen_cuda.cc b/tvm/src/codegen/codegen_cuda.cc index badbf2849..3c675ad06 100644 --- a/tvm/src/codegen/codegen_cuda.cc +++ b/tvm/src/codegen/codegen_cuda.cc @@ -25,9 +25,10 @@ void CodeGenCUDA::Init(bool output_ssa) { CHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state); } -void CodeGenCUDA::AddFunction(LoweredFunc f) { +void CodeGenCUDA::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { this->stream << "extern \"C\" __global__ "; - CodeGenC::AddFunction(f); + CodeGenC::AddFunction(f, map_arg_type); } void CodeGenCUDA::VisitStmt_(const ir::For* op) { diff --git a/tvm/src/codegen/codegen_cuda.h b/tvm/src/codegen/codegen_cuda.h index e49a47ae3..e0c4f1a41 100644 --- a/tvm/src/codegen/codegen_cuda.h +++ b/tvm/src/codegen/codegen_cuda.h @@ -10,6 +10,7 @@ #include #include #include "./codegen_c.h" +#include "./merlinc/codeanalys_merlinc.h" namespace TVM { namespace codegen { @@ -18,7 +19,8 @@ class CodeGenCUDA final : public CodeGenC { public: CodeGenCUDA(); void Init(bool output_ssa); - void AddFunction(LoweredFunc f); + void AddFunction(LoweredFunc f, + str2tupleMap map_arg_type); // override behavior void VisitStmt_(const ir::For* op) final; void PrintStorageSync(const Call* op) final; diff --git a/tvm/src/codegen/codegen_opencl.h b/tvm/src/codegen/codegen_opencl.h deleted file mode 100644 index 088ab089a..000000000 --- a/tvm/src/codegen/codegen_opencl.h +++ /dev/null @@ -1,51 +0,0 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file codegen_opencl.h - * \brief Generate OpenCL device code. - */ -#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_ -#define TVM_CODEGEN_CODEGEN_OPENCL_H_ - -#include -#include -#include -#include "./codegen_c.h" - -namespace TVM { -namespace codegen { - -class CodeGenOpenCL final : public CodeGenC { - public: - CodeGenOpenCL(); - void AddFunction(LoweredFunc f); - std::string Finish(); - - // override print thread tag. - void InitFuncState(LoweredFunc f) final; - void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) - void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) - void PrintStorageSync(const Call* op) final; // NOLINT(*) - void PrintType(Type t, std::ostream& os) final; // NOLINT(*) - std::string GetVecLoad(Type t, const Variable* buffer, - Expr base) final; - void PrintVecStore(const Variable* buffer, - Type t, Expr base, - const std::string& value) final; // NOLINT(*) - // the address of load/store - void PrintVecAddr(const Variable* buffer, Type t, - Expr base, std::ostream& os); // NOLINT(*) - std::string CastFromTo(std::string value, Type from, Type target); // NOLINT(*) - - // overload visitor - void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) - - private: - // whether enable fp16 and fp64 extension - bool enable_fp16_{false}; - bool enable_fp64_{false}; -}; - -} // namespace codegen -} // namespace TVM - -#endif // TVM_CODEGEN_CODEGEN_OPENCL_H_ diff --git a/tvm/src/codegen/codegen_source_base.cc b/tvm/src/codegen/codegen_source_base.cc index 0df1ad276..9fc6fc706 100644 --- a/tvm/src/codegen/codegen_source_base.cc +++ b/tvm/src/codegen/codegen_source_base.cc @@ -8,34 +8,79 @@ namespace TVM { namespace codegen { void CodeGenSourceBase::ClearFuncState() { - name_alloc_map_.clear(); + host_name_alloc_map_.clear(); + device_name_alloc_map_.clear(); ssa_assign_map_.clear(); var_idmap_.clear(); scope_mark_.clear(); } +void CodeGenSourceBase::SaveFuncState() { + host_name_alloc_map_save.clear(); + device_name_alloc_map_save.clear(); + ssa_assign_map_save.clear(); + var_idmap_save.clear(); + scope_mark_save.clear(); + // save state into private member + host_name_alloc_map_save = host_name_alloc_map_; + device_name_alloc_map_save = device_name_alloc_map_; + ssa_assign_map_save = ssa_assign_map_; + var_idmap_save = var_idmap_; + scope_mark_save = scope_mark_; +} + +void CodeGenSourceBase::RestoreFuncState() { + this->ClearFuncState(); + host_name_alloc_map_ = host_name_alloc_map_save; + device_name_alloc_map_ = device_name_alloc_map_save; + ssa_assign_map_ = ssa_assign_map_save; + var_idmap_ = var_idmap_save; + scope_mark_ = scope_mark_save; +} + std::string CodeGenSourceBase::GetUniqueName(std::string prefix) { for (size_t i = 0; i < prefix.size(); ++i) { if (prefix[i] == '.') prefix[i] = '_'; } - auto it = name_alloc_map_.find(prefix); - if (it != name_alloc_map_.end()) { - while (true) { - std::ostringstream os; - os << prefix << (++it->second); - std::string name = os.str(); - if (name_alloc_map_.count(name) == 0) { - prefix = name; - break; + if (fpga_scope_) { + auto it = device_name_alloc_map_.find(prefix); + if (it != device_name_alloc_map_.end()) { + while (true) { + std::ostringstream os; + os << prefix << (++it->second); + std::string name = os.str(); + if (device_name_alloc_map_.count(name) == 0) { + prefix = name; + break; + } } } + device_name_alloc_map_[prefix] = 0; + return prefix; + } else { + auto it = host_name_alloc_map_.find(prefix); + if (it != 
host_name_alloc_map_.end()) { + while (true) { + std::ostringstream os; + os << prefix << (++it->second); + std::string name = os.str(); + if (host_name_alloc_map_.count(name) == 0) { + prefix = name; + break; + } + } + } + host_name_alloc_map_[prefix] = 0; + return prefix; } - name_alloc_map_[prefix] = 0; - return prefix; } std::string CodeGenSourceBase::SSAGetID(std::string src, Type t) { - if (name_alloc_map_.count(src)) return src; + if (fpga_scope_) { + if (device_name_alloc_map_.count(src)) return src; + } else { + if (host_name_alloc_map_.count(src)) return src; + } auto it = ssa_assign_map_.find(src); if (it != ssa_assign_map_.end()) { if (scope_mark_.at(it->second.scope_id)) { diff --git a/tvm/src/codegen/codegen_source_base.h b/tvm/src/codegen/codegen_source_base.h index e140662c1..befc3f8ec 100644 --- a/tvm/src/codegen/codegen_source_base.h +++ b/tvm/src/codegen/codegen_source_base.h @@ -39,6 +39,10 @@ class CodeGenSourceBase { }; /*! \brief Clear the states that might relates to function generation */ void ClearFuncState(); + /*! \brief Save the states that might relates to function generation */ + void SaveFuncState(); + /*! \brief Restore the states that might relates to function generation */ + void RestoreFuncState(); /*! \brief print the current indented value */ void PrintIndent(); /*! @@ -89,18 +93,36 @@ class CodeGenSourceBase { std::ostringstream decl_stream; /*! \brief the stream to be printed */ std::ostringstream stream; + /*! \brief the stream for mocule */ + std::ostringstream module_stream; + /*! \brief the stream host */ + std::ostringstream host_stream; + /*! \brief the stream device */ + std::ostringstream device_stream; /*! \brief name of each variable */ std::unordered_map var_idmap_; + /*! \brief save states as copy */ + std::unordered_map var_idmap_save; + /*! \brief whether generate code for fpga */ + bool fpga_scope_{false}; + /*! \brief name allocation map for host */ + std::unordered_map host_name_alloc_map_; + /*! \brief name allocation map for device */ + std::unordered_map device_name_alloc_map_; private: /*! \brief assignment map of ssa */ std::unordered_map ssa_assign_map_; - /*! \brief name allocation map */ - std::unordered_map name_alloc_map_; /*! \brief array to check whether we are inside certain scope */ std::vector scope_mark_; /*! \brief The current indentation value */ int indent_{0}; + /*! \brief Save states as copy */ + std::unordered_map ssa_assign_map_save; + std::unordered_map host_name_alloc_map_save; + std::unordered_map device_name_alloc_map_save; + std::vector scope_mark_save; + }; /*! 
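Splitting the name-allocation map by scope lets the host and device halves each own a prefix without colliding, since they are emitted into separate function bodies of the merged source. A simplified toy of the two-map lookup (the real version loops until it finds an unused suffix):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, int> host_names, device_names;
bool fpga_scope = false;

// Simplified GetUniqueName: suffix a counter only on repeat allocations.
std::string GetUniqueName(std::string prefix) {
  auto& m = fpga_scope ? device_names : host_names;
  auto it = m.find(prefix);
  if (it != m.end()) return prefix + std::to_string(++it->second);
  m[prefix] = 0;
  return prefix;
}

int main() {
  std::cout << GetUniqueName("buf") << "\n";  // host:   "buf"
  fpga_scope = true;
  std::cout << GetUniqueName("buf") << "\n";  // device: "buf" again, separate map
  std::cout << GetUniqueName("buf") << "\n";  // device: "buf1"
}
```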
diff --git a/tvm/src/codegen/hlsc/build_hlsc.cc b/tvm/src/codegen/hlsc/build_hlsc.cc index 42fb68089..2494ee66f 100644 --- a/tvm/src/codegen/hlsc/build_hlsc.cc +++ b/tvm/src/codegen/hlsc/build_hlsc.cc @@ -24,7 +24,6 @@ runtime::Module BuildVivadoHLSCSim(Array funcs) { cg.AddFunction(f, map_arg_type); } std::string code = cg.Finish(); - return runtime::CreateVivadoHLSModule(funcs[0], code); } @@ -47,7 +46,6 @@ std::string BuildHLSC(Array funcs) { cg.AddFunction(f, map_arg_type); } std::string code = cg.Finish(); - LOG(WARNING) << "HLS C doesn't have runtime, return kernel code"; return code; } diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.cc b/tvm/src/codegen/hlsc/codegen_hlsc.cc index 3e8696fba..d7fc610d7 100644 --- a/tvm/src/codegen/hlsc/codegen_hlsc.cc +++ b/tvm/src/codegen/hlsc/codegen_hlsc.cc @@ -15,49 +15,50 @@ namespace codegen { void CodeGenHLSC::AddFunction(LoweredFunc f, str2tupleMap map_arg_type) { - // Write header files - // TODO: Insert header files here - // Clear previous generated state - this->InitFuncState(f); - // Register alloc buffer type - for (const auto & kv : f->handle_data_type) { - RegisterHandleType(kv.first.get(), kv.second.type()); - } - // Write entry function name - this->stream << "void " << f->name << "("; - // Write arguments - for (size_t i = 0; i < f->args.size(); ++i) { - Var v = f->args[i]; - std::string vid = AllocVarID(v.get()); - if (i != 0) this->stream << ", "; - if (map_arg_type.find(vid) == map_arg_type.end()) { - LOG(WARNING) << vid << " type not found\n"; - PrintType(v.type(), this->stream); - this->stream << ' ' << vid; - } - else { - auto arg = map_arg_type[vid]; - PrintType(std::get<1>(arg), this->stream); - this->stream << ' ' << std::get<0>(arg); - const BufferNode* buf = f->api_args[i].as(); - if (v.type().is_handle() && buf) { - var_shape_map_[buf->data.get()] = buf->shape; - for (size_t i = 0; i < buf->shape.size(); i++) { - this->stream << '['; - this->PrintExpr(buf->shape[i], this->stream); - this->stream << ']'; - } - } - // this->stream << "*"; TODO: create an option for this - } - } - stream << ") {\n"; - int func_scope = this->BeginScope(); - range_ = CollectIterRange(f->body); - this->PrintStmt(f->body); - this->EndScope(func_scope); - this->PrintIndent(); - this->stream << "}\n\n"; + CodeGenC::AddFunction(f, map_arg_type); + // // Write header files + // // TODO: Insert header files here + // // Clear previous generated state + // this->InitFuncState(f); + // // Register alloc buffer type + // for (const auto & kv : f->handle_data_type) { + // RegisterHandleType(kv.first.get(), kv.second.type()); + // } + // // Write entry function name + // this->stream << "void " << f->name << "("; + // // Write arguments + // for (size_t i = 0; i < f->args.size(); ++i) { + // Var v = f->args[i]; + // std::string vid = AllocVarID(v.get()); + // if (i != 0) this->stream << ", "; + // if (map_arg_type.find(vid) == map_arg_type.end()) { + // LOG(WARNING) << vid << " type not found\n"; + // PrintType(v.type(), this->stream); + // this->stream << ' ' << vid; + // } + // else { + // auto arg = map_arg_type[vid]; + // PrintType(std::get<1>(arg), this->stream); + // this->stream << ' ' << std::get<0>(arg); + // const BufferNode* buf = f->api_args[i].as(); + // if (v.type().is_handle() && buf) { + // var_shape_map_[buf->data.get()] = buf->shape; + // for (size_t i = 0; i < buf->shape.size(); i++) { + // this->stream << '['; + // this->PrintExpr(buf->shape[i], this->stream); + // this->stream << ']'; + // } + // } + // // this->stream << "*"; TODO: 
create an option for this + // } + // } + // stream << ") {\n"; + // int func_scope = this->BeginScope(); + // range_ = CollectIterRange(f->body); + // this->PrintStmt(f->body); + // this->EndScope(func_scope); + // this->PrintIndent(); + // this->stream << "}\n\n"; } std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index) { @@ -68,14 +69,16 @@ std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index buf_length_map_[buffer] == 1); if (is_scalar) { os << vid; - } else { - os << vid; - std::vector indices = ExtractIndices(index, var_shape_map_[buffer], range_); - for (size_t i = 0; i < indices.size(); i++) { - os << '['; - PrintExpr(indices[i], os); - os << ']'; - } + } else { + os << vid << "["; + PrintExpr(index, os); + os << "]"; + // std::vector indices = ExtractIndices(index, var_shape_map_[buffer], range_); + // for (size_t i = 0; i < indices.size(); i++) { + // os << '['; + // PrintExpr(indices[i], os); + // os << ']'; + // } } } return os.str(); @@ -88,6 +91,7 @@ void CodeGenHLSC::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) PrintExpr(op->b, os); os << ")"; } + void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) os << "std::max("; PrintExpr(op->a, os); @@ -97,19 +101,20 @@ void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) } void CodeGenHLSC::VisitStmt_(const LetStmt* op) { - std::string value = PrintExpr(op->value); - // Skip the argument retrieving assign statement - std::string vid = AllocVarID(op->var.get()); - if (op->var.type() != Handle() && - value.find("TVMArray") == std::string::npos && - value.find("arg") != 0) { - PrintIndent(); - PrintType(op->var.type(), this->stream); - this->stream << ' ' - << vid - << " = " << value << ";\n"; - } - PrintStmt(op->body); + CodeGenC::VisitStmt_(op); + // std::string value = PrintExpr(op->value); + // // Skip the argument retrieving assign statement + // std::string vid = AllocVarID(op->var.get()); + // if (op->var.type() != Handle() && + // value.find("TVMArray") == std::string::npos && + // value.find("arg") != 0) { + // PrintIndent(); + // PrintType(op->var.type(), this->stream); + // this->stream << ' ' + // << vid + // << " = " << value << ";\n"; + // } + // PrintStmt(op->body); } void CodeGenHLSC::GenForStmt(const For* op, std::string pragma, bool before) { @@ -164,7 +169,10 @@ void CodeGenHLSC::VisitStmt_(const IfThenElse* op) { void CodeGenHLSC::VisitStmt_(const Allocate* op) { CHECK(!is_zero(op->condition)); - std::string vid = AllocVarID(op->buffer_var.get()); + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); this->PrintIndent(); int32_t constant_size = op->constant_allocation_size(); CHECK_GT(constant_size, 0) @@ -173,16 +181,22 @@ void CodeGenHLSC::VisitStmt_(const Allocate* op) { var_shape_map_[buffer] = op->extents; std::string scope = alloc_storage_scope_.at(buffer); PrintStorageScope(scope, stream); - PrintType(op->type, stream); - stream << ' '<< vid; - if (constant_size > 1) {// Transfer length one array to scalar - for (size_t i = 0; i < op->extents.size(); i++) { - stream << '['; - PrintExpr(op->extents[i], stream); + + if (vid.find("stream_") != std::string::npos) { + void(0); // alloc stream channel in pre-processing + } else { + PrintType(op->type, stream); + stream << ' '<< vid; + if (constant_size > 1) {// Transfer length one array to scalar + stream << "["; + for (size_t i = 0; i < 
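`GetBufferRef` now prints the lowered one-dimensional index as-is instead of re-expanding it with `ExtractIndices`, matching the HISTORY note about switching to row-major single-dimension access. For an illustrative 4x4 buffer, the generated access changes shape like this:

```cpp
// before: multi-dimensional reference reconstructed from the flat index
B[i][j] = ...;
// after: the row-major flat index is printed directly
B[(i * 4) + j] = ...;
```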
op->extents.size(); i++) { + PrintExpr(op->extents[i], stream); + if (i != op->extents.size()-1) stream << "*"; + } stream << "]"; } + stream << ";\n"; } - stream << ";\n"; buf_length_map_[buffer] = constant_size; RegisterHandleType(op->buffer_var.get(), op->type); for (size_t i = 0; i < op->attrs.size(); i++) { diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.h b/tvm/src/codegen/hlsc/codegen_hlsc.h index c85cbc699..fdd1747fa 100644 --- a/tvm/src/codegen/hlsc/codegen_hlsc.h +++ b/tvm/src/codegen/hlsc/codegen_hlsc.h @@ -27,9 +27,7 @@ class CodeGenHLSC : public CodeGenC { void VisitStmt_(const Allocate* op) override; void GenForStmt(const For* op, std::string pragma, bool before); - - std::map > var_shape_map_; - std::unordered_map range_; + protected: std::string GetBufferRef(Type t, const Variable* buffer, Expr index); }; diff --git a/tvm/src/codegen/hlsc/codegen_vhls.cc b/tvm/src/codegen/hlsc/codegen_vhls.cc index 6a0977e40..f944bef83 100644 --- a/tvm/src/codegen/hlsc/codegen_vhls.cc +++ b/tvm/src/codegen/hlsc/codegen_vhls.cc @@ -21,12 +21,83 @@ namespace TVM { namespace codegen { +void CodeGenVivadoHLS::PreProcess(std::ostringstream& os) { + os << "\n"; + int indent = 2; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + + // create local buffer saving result + auto shape = std::get<2>(arg_top_vars[v]); + auto dtype = std::get<1>(arg_top_vars[v]); + if (!stream_table[v]) { // unstreamed args + // allocate local buffer + for (int k = 0; k < indent; k++) os << ' '; + PrintType(dtype, os); + os << " " << arg_name << "["; + for (size_t n = 0; n < shape.size(); n++) { + os << shape[n]; + if (n != shape.size() - 1) os << "* "; + } + os << "];\n"; + + for (size_t j = 0; j < shape.size(); j++) { + for (int k = 0; k < indent; k++) os << ' '; + os << "for (int i" << j << " = 0; i" + << j << "< " << shape[j] << "; i" + << j << "++) {\n"; + // pass stream reference + if (j == shape.size() - 1) { + for (int k = 0; k < indent; k++) os << ' '; + os << " " << arg_name << "[" + << getIndex(shape) << "] = " + << "fd_" << arg_name << ".read();\n"; + } + indent += 2; + } + for (size_t m = 0; m < shape.size(); m++) { + indent -= 2; + for (int k = 0; k < indent; k++) os << ' '; + os << "}\n"; + } + } else if (i == arg_vars.size() - 1 || true) { + // allocate for return variable + for (int k = 0; k < indent; k++) os << ' '; + PrintType(dtype, os); + os << " " << arg_name << "["; + for (size_t n = 0; n < shape.size(); n++) { + os << shape[n]; + if (n != shape.size() - 1) os << "* "; + } + os << "];\n"; + } + } +} + +void CodeGenVivadoHLS::PostProcess(std::ostringstream& os) { +// os << "\n"; +// int indent = 2; +// for (size_t i = 0; i < arg_vars.size(); i++) { +// auto v = arg_vars[i]; +// std::string arg_name; +// if (stream_table[v]) +// arg_name = std::get<0>(arg_top_vars[v]); +// else arg_name = GetVarID(v); +// os << arg_name << " = " << "fd_" +// << arg_name << ".write();\n"; +} + void CodeGenVivadoHLS::AddFunction(LoweredFunc f, str2tupleMap map_arg_type) { // Write header files - this->stream << "#include \n"; - this->stream << "#include \n"; - this->stream << "#include \n\n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n\n"; CodeGenHLSC::AddFunction(f, map_arg_type); if (soda_header_.is_open()) soda_header_.close(); @@ -77,6 +148,13 @@ void 
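// A hedged sketch (tensor name "a" and index "i" are illustrative, not taken
// from a real run) of the line the Store handler below prints when a store's
// value is a StreamExpr: the "_stream_send" suffix is stripped from the SSA
// name and the element is pulled from the matching stream handle, e.g.
//   a[i] = fd_a.read();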
CodeGenVivadoHLS::VisitStmt_(const Store* op) { this->stream << ref << "[" << PrintExpr(sb->index) << "] = " << PrintExpr(sb->value) << ";\n"; + } else if (const StreamExpr* se = op->value.as()) { + std::string vid = GetVarID(se->buffer_var.get()); + vid = vid.substr(0, vid.find("_stream_send")); + PrintIndent(); + this->stream << vid << "[" + << op->index << "] = " + << "fd_" << vid << ".read();\n"; } else { CodeGenC::VisitStmt_(op); } @@ -143,6 +221,30 @@ void CodeGenVivadoHLS::VisitStmt_(const Partition* op) { stream << "\n"; } +void CodeGenVivadoHLS::VisitExpr_(const StreamExpr* op, std::ostream& os) { + CodeGenC::VisitExpr_(op, os); + std::string vid = GetVarID(op->buffer_var.get()); + vid = vid.substr(0, vid.find("_stream_send")); + os << vid << ".read()"; +} + +void CodeGenVivadoHLS::VisitStmt_(const StreamStmt* op) { + CodeGenC::VisitStmt_(op); + std::string vid = GetVarID(op->buffer_var.get()); + switch (op->stream_type) { + case StreamType::Channel: + break; + case StreamType::FIFO: + break; + case StreamType::Pipe: + break; + } + vid = vid.substr(0, vid.find("_stream_send")); + auto load = op->value.as(); + stream << "fd_" << vid << ".write(" + << vid << "["<< load->index << "]);\n"; +} + class AllocateCollector final : public IRVisitor { public: AllocateCollector(std::vector& alloc_list, @@ -160,6 +262,144 @@ class AllocateCollector final : public IRVisitor { VarExprUnorderedSet& outputs_; }; +void CodeGenVivadoHLS::VisitStmt_(const AttrStmt* op) { + if (op->attr_key == ir::attr::device_scope) { + // print top( ... in host and enter fpga scope + if (op->value.as()->value == "fpga" && !fpga_scope_) { + fpga_scope_ = true; + PrintIndent(); + + // track the stream usage + StreamCollector collector(stream_table, "cpu"); + collector.Visit(op->body); + + // update data type and name + for (auto k : collector.host_undefined_) { + auto v = k.get(); + arg_vars.push_back(v); + stream_table[v] = true; + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(v->name_hint, + std::get<1>(tuple), + std::get<2>(tuple)); + } + TypeCollector visitor(arg_top_vars); + visitor.Visit(op->body); + + // generate function calls + stream << "top("; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + if (i != 0) stream << ", "; + stream << "fd_" << arg_name; + + // generate kernel func definition + if (i != 0) arg_stream << ", "; + arg_stream << "hls::stream<"; + PrintType(std::get<1>(arg_top_vars[v]), arg_stream); + auto shape = std::get<2>(arg_top_vars[v]); + arg_stream << ">& fd_" << arg_name; + } + stream << ");\n"; + + // switch context to device scope + host_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // switch from device to host + } else if (op->value.as()->value == "cpu" && + fpga_scope_) { + fpga_scope_ = false; + device_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + } + this->PrintStmt(op->body); + } else { + CodeGenC::VisitStmt_(op); + } +} + +void CodeGenVivadoHLS::VisitStmt_(const KernelStmt *op) { + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + if (stream_arg_pos[op->name].count(i)) + stream << "fd_"; + PrintExpr(op->args[i], stream); + if (i < op->args.size() -1) stream << ", "; + } + stream << ");\n"; +}
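// A minimal sketch (kernel name, element type, and extents are assumed for
// illustration, not taken from a real run) of the signature the KernelDef
// handler below prints: argument positions listed in op->channels become
// hls::stream references, while ordinary buffers collapse into row-major
// one-dimensional arrays, e.g.
//   void my_kernel(hls::stream<int>& a, int b[256]) { ... }
+ +void CodeGenVivadoHLS::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + // save func states + 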
CodeGenC::SaveFuncState(f); + CodeGenC::InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. + for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + // print function signature + PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + for (size_t k = 0; k < op->channels.size(); k+=2) { + int pos = op->channels[k].as()->value; + stream_arg_pos[op->name].insert(pos); + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) stream << ", "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + + // pass the stream channel reference + // TODO: broadcast in hlsc (one wr multi read) + if (stream_arg_pos[op->name].count(i)) { + stream << "hls::stream<"; + PrintType(type, stream); + stream << ">& " << vid; + } else { + PrintType(type, stream); + this->stream << " " << vid << "["; + int mul = 1; + for (size_t j = 0; j < op->api_args[i].size(); j++) { + auto dim = op->api_args[i][j].as()->value; + mul = mul * dim; + } + this->stream << mul << "]"; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); +} + void CodeGenVivadoHLS::VisitStmt_(const Stencil* op) { // Use SODA codegen for stencil analysis CodeGenSODA cg_soda; diff --git a/tvm/src/codegen/hlsc/codegen_vhls.h b/tvm/src/codegen/hlsc/codegen_vhls.h index 5486be1dc..6462251db 100644 --- a/tvm/src/codegen/hlsc/codegen_vhls.h +++ b/tvm/src/codegen/hlsc/codegen_vhls.h @@ -23,11 +23,19 @@ class CodeGenVivadoHLS final : public CodeGenHLSC { void VisitExpr_(const GetBit* op, std::ostream& os) override; void VisitExpr_(const GetSlice* op, std::ostream& os) override; + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; void VisitStmt_(const Store* op) override; void VisitStmt_(const For* op) override; void VisitStmt_(const Partition* op) override; void VisitStmt_(const Stencil* op) override; + void VisitStmt_(const StreamStmt* op) override; + void VisitStmt_(const AttrStmt* op) override; + void VisitStmt_(const KernelDef* op) override; + void VisitStmt_(const KernelStmt* op) override; + + void PreProcess(std::ostringstream& os); + void PostProcess(std::ostringstream& os); private: std::ofstream soda_header_; }; diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc index 56b4e1d97..d6fa1c6ba 100644 --- a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc +++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc @@ -652,6 +652,9 @@ void CodeAnalysMerlinC::VisitExpr_(const Broadcast* op, std::ostream& os) { // LOG(FATAL) << "Broadcast: not supported "; } +void CodeAnalysMerlinC::VisitExpr_(const StreamExpr* op, std::ostream& os) { // NOLINT(*) +} + void CodeAnalysMerlinC::VisitExpr_(const Select* op, std::ostream& os) { // NOLINT(*) os << "("; PrintExpr(op->condition, os); @@ -716,10 +719,8 @@ void CodeAnalysMerlinC::VisitExpr_(const Quantize *op, std::ostream& os) { // NO } void CodeAnalysMerlinC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) - 
LOG(FATAL) << "KernelExpr is not yet support"; } - void CodeAnalysMerlinC::VisitStmt_(const LetStmt* op) { // TODO comaniac //std::vector vec_var = GetNodesByType(op->value); @@ -882,11 +883,9 @@ void CodeAnalysMerlinC::VisitStmt_(const ProducerConsumer *op) { } void CodeAnalysMerlinC::VisitStmt_(const KernelDef *op) { - LOG(FATAL) << "KernelDef is not yet support"; } void CodeAnalysMerlinC::VisitStmt_(const KernelStmt *op) { - LOG(FATAL) << "KernelStmt is not yet support"; } void CodeAnalysMerlinC::VisitStmt_(const Return *op) { @@ -917,6 +916,8 @@ void CodeAnalysMerlinC::VisitStmt_(const Reuse *op) { void CodeAnalysMerlinC::VisitStmt_(const Partition *op) {} +void CodeAnalysMerlinC::VisitStmt_(const StreamStmt *op) {} + void CodeAnalysMerlinC::VisitStmt_(const Stencil *op) { PrintStmt(op->body); } diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.h b/tvm/src/codegen/merlinc/codeanalys_merlinc.h index 6ba082f09..421f0d96f 100644 --- a/tvm/src/codegen/merlinc/codeanalys_merlinc.h +++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.h @@ -112,6 +112,7 @@ class CodeAnalysMerlinC : void VisitExpr_(const SetSlice* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const Quantize* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const KernelExpr* op, std::ostream& os) override; // NOLINT(*) + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; // NOLINT(*) // statment void VisitStmt_(const LetStmt* op) override; void VisitStmt_(const Store* op) override; @@ -131,6 +132,7 @@ class CodeAnalysMerlinC : void VisitStmt_(const Reuse* op) override; void VisitStmt_(const Partition* op) override; void VisitStmt_(const Stencil* op) override; + void VisitStmt_(const StreamStmt* op) override; /*! * Print Type represetnation of type t. * \param t The type representation. 
diff --git a/tvm/src/codegen/opencl/build_opencl.cc b/tvm/src/codegen/opencl/build_opencl.cc new file mode 100755 index 000000000..f5b1352a7 --- /dev/null +++ b/tvm/src/codegen/opencl/build_opencl.cc @@ -0,0 +1,61 @@ +#include "./codegen_aocl.h" +#include "./codegen_sdaccel.h" +#include "../build_common.h" +#include "./sdaccel_module.h" +#include "../merlinc/codeanalys_merlinc.h" + +namespace TVM { +namespace codegen { + +#if HCL_SDACCEL_RUNTIME +runtime::Module BuildSDAccelSim(Array funcs) { + CodeAnalysMerlinC ca; + CodeGenSDACCEL cg; + for (LoweredFunc f : funcs) { + // 1st pass: Analyze AST and collect necessary information + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + // 2nd pass: Generate kernel code + cg.AddFunction(f, map_arg_type); + } + std::string code = cg.Finish(); + return runtime::CreateSDAccelModule(funcs[0], code); +} + +TVM_REGISTER_API("codegen.build_sdaccel_csim") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildSDAccelSim(args[0]); + }); +#endif + + +template <class CodeGen> +std::string BuildOpenCL(Array funcs){ + using TVM::runtime::Registry; + CodeAnalysMerlinC ca; + CodeGen cg; + for(LoweredFunc f: funcs){ + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + cg.AddFunction(f, map_arg_type); + } + std::string code = cg.Finish(); + + LOG(WARNING) << "OpenCL doesn't have runtime, return kernel code"; + return code; +} + + +TVM_REGISTER_API("codegen.build_sdaccel") +.set_body([]( TVMArgs args, TVMRetValue * rv ) { + * rv = BuildOpenCL<CodeGenSDACCEL>(args[0]); + }); + +TVM_REGISTER_API("codegen.build_aocl") +.set_body([]( TVMArgs args, TVMRetValue * rv ) { + * rv = BuildOpenCL<CodeGenAOCL>(args[0]); + }); +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_aocl.cc b/tvm/src/codegen/opencl/codegen_aocl.cc new file mode 100644 index 000000000..6d3247d02 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_aocl.cc @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include +#include "./codegen_aocl.h" +#include "../../runtime/thread_storage_scope.h" + +namespace TVM { +namespace codegen { + +inline Type String2Type(std::string& s) { + if (s.front() == '\"' && s.back() == '\"') { + s.erase(0, 1); + s.pop_back(); + } + std::istringstream is(s); + halideir_type_code_t code = Type::Int; + if (s.substr(0, 3) == "int") { + code = Type::Int; s = s.substr(3); + } else if (s.substr(0, 4) == "uint") { + code = Type::UInt; s = s.substr(4); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s == "handle") { + return Handle(); + } else { + LOG(FATAL) << "unknown type " << s; + } + int bits = 32, lanes = 1; + if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) { + LOG(FATAL) << "unknown type " << s; + } + return Type(code, bits, lanes); +} + +void CodeGenAOCL::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { + // Clear previous generated state + this->InitFuncState(f); + for (Var arg: f->args) { + if (arg.type().is_handle()) { + alloc_storage_scope_[arg.get()] = "global"; + } + } + + // Skip the first underscore, so SSA variable starts from _1 + GetUniqueName("_"); + + // Register alloc buffer type + for (const auto & kv : f->handle_data_type) { + RegisterHandleType(kv.first.get(), kv.second.type()); + } + + this->decl_stream << "#include \"ihc_apint.h\"" << "\n"; + this->decl_stream << "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable\n"; + this->stream << 
"__kernel " << "void " << f->name << "("; + + // Write arguments + for (size_t i = 0; i < f->args.size(); ++i) { + // alloc or get var name + Var v = f->args[i]; + std::string vid; + if (!var_idmap_.count(v.get())) + vid = AllocVarID(v.get()); + else vid = GetVarID(v.get()); + + if (i != 0) this->stream << ", "; + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; + } + else { + auto arg = map_arg_type[vid]; + this->stream << "__global "; + PrintType(std::get<1>(arg), this->stream); + if (v.type().is_handle()) + this->stream << "*"; + this->stream << ' ' << "restrict "; + this->stream << std::get<0>(arg); + } + } + stream << ") {\n"; + int func_scope = this->BeginScope(); + this->PrintStmt(f->body); + this->EndScope(func_scope); + this->PrintIndent(); + // this->stream << ' '<< ' ' << "return;\n"; + this->stream << "}\n\n"; +} + +void CodeGenAOCL::PrintType(Type t, std::ostream &os) +{ + int lanes = t.lanes(); + if(t.is_handle()) { + os << "void*";return; + } + if(t == Bool()) { + os <<"bool"; return; + } + CHECK_EQ(lanes, 1) + << "do not yet support vector types"; + + bool fail = false; + if(t.is_float()) { + switch(t.bits()) + { + case 16: + os<<"half"; + // enable_fp16_ = true; + break; + case 32: + os<<"float"; + break; + case 64: + os<< "double"; + // enable_fp64_ = true; + break; + default: + fail = true; + break; + } + if(!fail && lanes ==1) return; + if(!fail&&(lanes >= 2 && lanes <=16)) + { + os<=2 && lanes <= 16)) { + os << lanes; return; + } + if(fail && lanes==1) { + if(t.is_uint()) { + if (t.bits() > 64) { + os << "uint" << "64" << "_t"; return; + } else { + os<< "ap_uint<"<< t.bits() <<"> uintd_t"; return; + } + } + if(t.is_int()) { + if (t.bits() > 64) { + os << "int" << "64" << "_t"; return; + } else { + os << "ap_int<" << t.bits() << "> intd_t"; return; + } + } + } + } + + LOG(FATAL) << "Cannot convert type"<for_type == ForType::Unrolled) { + int unroll_factor = 0, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto factor = op->annotate_values[i].as(); + if (str->value == "factor" && factor != nullptr && factor->value > 1) { + unroll_factor = factor->value; + break; + } + } + i++; + } + os << "#pragma unroll"; + if (unroll_factor > 0) os << " " << unroll_factor << "\n"; + else os << "\n"; + } + else if (op->for_type == ForType::Pipelined) { + int II = 1, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto initiation_interval = op->annotate_values[i].as(); + if (str->value == "initiation_interval" && + initiation_interval != nullptr && + initiation_interval->value > 1) { + II = initiation_interval->value; + break; + } + } + i++; + } + os << "#pragma"; + os << " ii " << II << "\n"; + } + CodeGenAOCL::GenForStmt(op, os.str(), true); +} + +void CodeGenAOCL::VisitExpr_(const StreamExpr* op, std::ostream& os) { + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); + int i = 0; + for (auto key : op->annotate_keys) { + auto str = key.as(); + auto val = op->annotate_values[i].as(); + if (str->value == "name" && val != nullptr) { + vid = val->value; + decl_stream << "channel "; + PrintType(op->type, decl_stream); + decl_stream << " " << vid << ";\n"; + } + i++; + } + switch (op->stream_type) { + case StreamType::Channel: + os << "read_channel_intel("; + os << vid << ")"; + break; + case StreamType::Pipe: + os << 
"read_pipe("; + break; + case StreamType::FIFO: + // buffered channel + os << "fifo"; + break; + } +} + +void CodeGenAOCL::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + SaveFuncState(f); + InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. + for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + stream << "__kernel "; + const UIntImm* is_void = op->ret_void.as(); + if (is_void) stream << "void"; + else PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + + // streamed arg position to channel index + std::unordered_map stream_args; + for (size_t j = 0; j < op->channels.size(); j=j+2) { + int pos = op->channels[j].as()->value; + int idx = op->channels[j+1].as()->value; + stream_args[pos] = idx; + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (stream_args.count(i)) { + stream_arg_pos[op->name].insert(i); + if (!stream_pragma) { + decl_stream << "#pragma OPENCL EXTENSION cl_intel_channels : enable\n"; + stream_pragma = true; + } + } else { + if (i != 0) { + if (stream_args.count(i-1)) void(0); + else stream << ", "; + } // un-streamed argument + this->stream << "__global "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + PrintType(type, stream); + this->stream << "* restrict " << vid; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); +} + +void CodeGenAOCL::VisitStmt_(const KernelStmt *op) { + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + std::string str = op->name + "." 
+ PrintExpr(op->args[i]); + if (!stream_arg_pos[op->name].count(i)) { + if (i != 0) { + if (stream_arg_pos[op->name].count(i-1)) void(0); + else stream << ", "; + } + PrintExpr(op->args[i], stream); + } + } + stream << ");\n"; +} + +void CodeGenAOCL::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) + os << op->name << "("; + for (size_t i = 0; i < op->args.size(); ++i) { + if (!stream_arg_pos[op->name].count(i)) { + if (i != 0) { + if (stream_arg_pos[op->name].count(i-1)) void(0); + else os << ", "; + } + PrintExpr(op->args[i], os); + } + } + os << ")"; +} + +void CodeGenAOCL::VisitStmt_(const StreamStmt* op) { + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); + PrintIndent(); + int i = 0; + for (auto key : op->annotate_keys) { + auto str = key.as(); + auto val = op->annotate_values[i].as(); + if (str->value == "name" && val != nullptr) vid = val->value; + i++; + } + switch (op->stream_type) { + case StreamType::Channel: + stream << "write_channel_intel("; + stream << vid << ", "; + break; + case StreamType::Pipe: + stream << "write_pipe("; + stream << vid << ", "; + break; + case StreamType::FIFO: + stream << "fifo("; + break; + } + PrintExpr(op->value, stream); + stream << ");\n"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_aocl.h b/tvm/src/codegen/opencl/codegen_aocl.h new file mode 100755 index 000000000..5778b70ec --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_aocl.h @@ -0,0 +1,34 @@ +#ifndef TVM_CODEGEN_CODEGEN_AOCL_H_ +#define TVM_CODEGEN_CODEGEN_AOCL_H_ + +# include +# include +# include "./codegen_opencl.h" + +namespace TVM { +namespace codegen { + +class CodeGenAOCL : public CodeGenOpenCL { + public: + CodeGenAOCL(){} + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + + void VisitStmt_(const For* op) override; //NOLINT(*) + void VisitStmt_(const StreamStmt* op) override; //NOLINT(*) + void VisitStmt_(const KernelDef* op) override; //NOLINT(*) + void VisitStmt_(const KernelStmt* op) override; //NOLINT(*) + + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const KernelExpr* op, std::ostream& os) override; //NOLINT(*) + + private: + // whether to enable streaming + bool stream_pragma{false}; + // map from kernel name to set of streamed arg position index + std::unordered_map> stream_arg_pos; +}; +} // namespace codegen +} // namespace TVM + +#endif // TVM_CODEGEN_CODEGEN_AOCL_H_ diff --git a/tvm/src/codegen/codegen_opencl.cc b/tvm/src/codegen/opencl/codegen_opencl.cc old mode 100644 new mode 100755 similarity index 53% rename from tvm/src/codegen/codegen_opencl.cc rename to tvm/src/codegen/opencl/codegen_opencl.cc index d0297a1d9..979a19e0f --- a/tvm/src/codegen/codegen_opencl.cc +++ b/tvm/src/codegen/opencl/codegen_opencl.cc @@ -1,206 +1,239 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file codegen_opencl.cc - */ -#include -#include -#include -#include -#include "./codegen_opencl.h" -#include "../runtime/thread_storage_scope.h" - -namespace TVM { -namespace codegen { - -CodeGenOpenCL::CodeGenOpenCL() { - restrict_keyword_ = "restrict"; -} - -void CodeGenOpenCL::InitFuncState(LoweredFunc f) { - CodeGenC::InitFuncState(f); - for (Var arg : f->args) { - if (arg.type().is_handle()) { - alloc_storage_scope_[arg.get()] = "global"; - } - } -} - -void CodeGenOpenCL::AddFunction(LoweredFunc f) { - this->stream << "__kernel "; - CodeGenC::AddFunction(f); -} - -std::string CodeGenOpenCL::Finish() { - // inject extension enable pragma for fp16 and fp64 - if (enable_fp16_) { - decl_stream - << "#ifdef cl_khr_fp16\n" - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#elif defined(cl_amd_fp16)\n" - "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" - "#else\n" - "#error \"Half precision floating point not supported" - "by OpenCL implementation on your device.\" \n" - "#endif\n\n"; - } - - if (enable_fp64_) { - decl_stream - << "#ifdef cl_khr_fp64\n" - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "#elif defined(cl_amd_fp64)\n" - "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" - "#else\n" - "#error \"Double precision floating point not supported" - "by OpenCL implementation on your device.\" \n" - "#endif\n\n"; - } - - return CodeGenC::Finish(); -} - -void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { - CHECK(!var_idmap_.count(iv->var.get())); - runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); - std::ostringstream os; - if (ts.rank == 1) { - os << "get_local_id(" << ts.dim_index << ")"; - } else { - os << "get_group_id(" << ts.dim_index << ")"; - } - var_idmap_[iv->var.get()] = - CastFromTo(os.str(), UInt(64), iv->var.type()); -} - -void CodeGenOpenCL::PrintType(Type t, std::ostream& os) { // NOLINT(*) - int lanes = t.lanes(); - if (t.is_handle()) { - CHECK_EQ(lanes, 1) - << "do not yet support vector types"; - os << "void*"; return; - } - bool fail = false; - if (t.is_float()) { - switch (t.bits()) { - case 16: - os << "half"; - enable_fp16_ = true; - break; - case 32: os << "float"; break; - case 64: - os << "double"; - enable_fp64_ = true; - break; - default: fail = true; break; - } - if (!fail && lanes == 1) return; - if (!fail && (lanes >= 2 && lanes <= 16)) { - os << lanes; return; - } - } else if (t.is_uint() || t.is_int()) { - if (t.is_uint()) { - os << 'u'; - } - if (t.bits() == 8 && t.lanes() == 4) { - // directly 4 8 bit int in integer. 
- os << "int"; return; - } - switch (t.bits()) { - case 8: os << "char"; break; - case 16: os << "short"; break; - case 32: os << "int"; break; - case 64: os << "long"; break; - case 1: os << "int"; break; - default: fail = true; break; - } - if (!fail && lanes == 1) return; - if (!fail && (lanes >= 2 && lanes <= 16)) { - os << lanes; return; - } - } - LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type"; -} - -void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t, - Expr base, std::ostream& os) { // NOLINT(*) - if (!HandleTypeMatch(buffer, t.element_of())) { - os << '('; - auto it = alloc_storage_scope_.find(buffer); - if (it != alloc_storage_scope_.end()) { - PrintStorageScope(it->second, os); - } - os << ' '; - PrintType(t.element_of(), os); - os << "*)"; - } - os << GetVarID(buffer) << " + "; - PrintExpr(base, os); -} -std::string CodeGenOpenCL::GetVecLoad( - Type t, const Variable* buffer, Expr base) { - std::ostringstream os; - os << "vload" << t.lanes() << "(0, "; - PrintVecAddr(buffer, t, base, os); - os << ")"; - return os.str(); -} - -void CodeGenOpenCL::PrintVecStore(const Variable* buffer, - Type t, Expr base, - const std::string& value) { - this->PrintIndent(); - stream << "vstore" << t.lanes() << "(" << value << ", 0, "; - PrintVecAddr(buffer, t, base, stream); - stream << ");\n"; -} - -void CodeGenOpenCL::PrintStorageSync(const Call* op) { - const std::string& sync = op->args[0].as()->value; - if (sync == "warp") { - LOG(FATAL) << "warp sync not supported in opencl"; - } else if (sync == "shared") { - this->PrintIndent(); - this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n"; - } else if (sync == "global") { - LOG(FATAL) << "not supported"; - } -} - -void CodeGenOpenCL::PrintStorageScope( - const std::string& scope, std::ostream& os) { // NOLINT(*) - if (scope == "global") { - os << "__global"; - } else if (scope == "shared") { - os << "__local"; - } -} - -std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) { - if (from == target) return value; - std::ostringstream os; - if (target.lanes() == 1) { - os << "(("; - this->PrintType(target, os); - os << ")" << value << ")"; - } else { // convert vector type - os << "("; - os << "convert_"; - this->PrintType(target, os); - os << "(" << value << "))"; - } - return os.str(); -} - -void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) - std::string v = PrintExpr(op->value); - os << "(("; - PrintType(op->type, os); - os << ")("; - for (int i = 0; i < op->lanes; ++i) { - if (i != 0) os << ", "; - os << v; - } - os << "))"; -} -} // namespace codegen -} // namespace TVM +# include +# include +# include +# include +# include +# include +# include "./codegen_opencl.h" +# include "../../runtime/thread_storage_scope.h" + +namespace TVM{ +namespace codegen{ + +CodeGenOpenCL::CodeGenOpenCL(){ + restrict_keyword_ = "restrict"; +} + +std::string CodeGenOpenCL::Finish() { + // inject extension enable pragma for fp16 and fp64 + if (enable_fp16_) { + decl_stream + << "#ifdef cl_khr_fp16\n" + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#elif defined(cl_amd_fp16)\n" + "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" + "#else\n" + "#error \"Half precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + if (enable_fp64_) { + decl_stream + << "#ifdef cl_khr_fp64\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#elif defined(cl_amd_fp64)\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + 
"#else\n" + "#error \"Double precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + return CodeGenC::Finish(); +} + +void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { + CHECK(!var_idmap_.count(iv->var.get())); + runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); + std::ostringstream os; + if (ts.rank == 1) { + os << "get_local_id(" << ts.dim_index << ")"; + } else { + os << "get_group_id(" << ts.dim_index << ")"; + } + var_idmap_[iv->var.get()] = + CastFromTo(os.str(), UInt(64), iv->var.type()); +} + + +void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t, + Expr base, std::ostream& os) { // NOLINT(*) + if (!HandleTypeMatch(buffer, t.element_of())) { + os << '('; + auto it = alloc_storage_scope_.find(buffer); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, os); + } + os << ' '; + PrintType(t.element_of(), os); + os << "*)"; + } + os << GetVarID(buffer) << " + "; + PrintExpr(base, os); +} +std::string CodeGenOpenCL::GetVecLoad( + Type t, const Variable* buffer, Expr base) { + std::ostringstream os; + os << "vload" << t.lanes() << "(0, "; + PrintVecAddr(buffer, t, base, os); + os << ")"; + return os.str(); +} + +void CodeGenOpenCL::PrintVecStore(const Variable* buffer, + Type t, Expr base, + const std::string& value) { + this->PrintIndent(); + stream << "vstore" << t.lanes() << "(" << value << ", 0, "; + PrintVecAddr(buffer, t, base, stream); + stream << ");\n"; +} + +void CodeGenOpenCL::PrintStorageSync(const Call* op) { + const std::string& sync = op->args[0].as()->value; + if (sync == "warp") { + LOG(FATAL) << "warp sync not supported in opencl"; + } else if (sync == "shared") { + this->PrintIndent(); + this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n"; + } else if (sync == "global") { + LOG(FATAL) << "not supported"; + } +} + +void CodeGenOpenCL::PrintStorageScope( + const std::string& scope, std::ostream& os) { // NOLINT(*) + if (scope == "global") { + // os << "global "; + } else if (scope == "shared") { + // os << "local "; + } +} + +std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) { + if (from == target) return value; + std::ostringstream os; + if (target.lanes() == 1) { + os << "(("; + this->PrintType(target, os); + os << ")" << value << ")"; + } else { // convert vector type + os << "("; + os << "convert_"; + this->PrintType(target, os); + os << "(" << value << "))"; + } + return os.str(); +} + +void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) + std::string v = PrintExpr(op->value); + os << "(("; + PrintType(op->type, os); + os << ")("; + for (int i = 0; i < op->lanes; ++i) { + if (i != 0) os << ", "; + os << v; + } + os << "))"; +} + +void CodeGenOpenCL::VisitExpr_(const Call * op, std::ostream& os) { // NOLINT(*) + if (op->is_intrinsic(intrinsic::tvm_if_then_else)) { + os << "("; + PrintType(op->args[2].type(), os); + os << ")"; + } + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenCL::VisitStmt_(const LetStmt* op) { + std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); + PrintType(op->var.type(), this->stream); + this->stream << ' ' + << vid + << " = " << value << ";\n"; + } + PrintStmt(op->body); +} + + +void CodeGenOpenCL::VisitExpr_(const FloatImm * op, std::ostream& os) 
{ // NOLINT(*) + if (std::isinf(op->value)) { + if ( op->value < 0) { + os << "-"; + } + os << "INFINITY"; + } else if (std::isnan(op->value)) { + os << "NAN"; + } else { + CodeGenC::VisitExpr_(op, os); + } +} + +void CodeGenOpenCL::VisitExpr_(const Select * op, std::ostream& os) { // NOLINT(*) + os << "("; + PrintType(op->true_value.type(), os); + os << ")"; + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenCL::VisitStmt_(const IfThenElse* op) { + std::string cond = PrintExpr(op->condition); + // Skip the buffer data checking + if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)"))) + return; + PrintIndent(); + if (cond[0] == '(' && cond[cond.length() - 1] == ')') { + stream << "if " << cond << " {\n"; + } else { + stream << "if (" << cond << ") {\n"; + } + int then_scope = BeginScope(); + PrintStmt(op->then_case); + this->EndScope(then_scope); + if (op->else_case.defined()) { + PrintIndent(); + stream << "} else {\n"; + int else_scope = BeginScope(); + PrintStmt(op->else_case); + this->EndScope(else_scope); + } + PrintIndent(); + stream << "}\n"; +} + +void CodeGenOpenCL::GenForStmt(const For* op, std::string pragma, bool before) { + std::string extent = PrintExpr(op->extent); + std::string vid = AllocVarID(op->loop_var.get()); + CHECK(is_zero(op->min)); + if (before && pragma.length() > 0) { + PrintIndent(); + stream << pragma; + } + PrintIndent(); + stream << "for ("; + PrintType(op->loop_var.type(), stream); + stream << ' ' << vid << " = 0; " + << vid << " < " << extent + << "; ++" << vid << ") {\n"; + if (!before && pragma.length() > 0) { + PrintIndent(); + stream << pragma; + } + int for_scope = BeginScope(); + PrintStmt(op->body); + this->EndScope(for_scope); + PrintIndent(); + stream << "}\n"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_opencl.h b/tvm/src/codegen/opencl/codegen_opencl.h new file mode 100755 index 000000000..4f9a15fe5 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_opencl.h @@ -0,0 +1,50 @@ +#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_ +#define TVM_CODEGEN_CODEGEN_OPENCL_H_ + +# include +# include +# include +# include "../codegen_c.h" + +namespace TVM{ +namespace codegen{ + +class CodeGenOpenCL : public CodeGenC{ + public: + // void AddFunction(LoweredFunc f); + CodeGenOpenCL(); + virtual void AddFunction(LoweredFunc f, str2tupleMap map_arg_type) = 0; + std::string Finish(); + void BindThreadIndex(const IterVar& iv) override; // NOLINT(*) + void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*) + void PrintStorageSync(const Call* op) override; //NOLINT(*) + // void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + virtual void PrintType(Type t, std::ostream& os) = 0; //NOLINT + std::string GetVecLoad(Type t, const Variable * buffer, + Expr base) override; // NOLINT(*) + void PrintVecStore(const Variable * buffer, Type t, + Expr base, const std::string& value) override; //NOLINT(*) + void PrintVecAddr(const Variable * buffer, Type t, + Expr base, std::ostream& os); //NOLINT(*) + std::string CastFromTo(std::string value, Type from, Type target) override; //NOLINT(*) + + //overload visitor + void VisitExpr_(const Broadcast * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const Call * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const Select * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const FloatImm * op, std::ostream& os) override; //NOLINT(*) + void VisitStmt_(const IfThenElse* op) override; //NOLINT(*) + void 
VisitStmt_(const LetStmt* op) override; // NOLINT + void GenForStmt(const For* op, std::string pragma, bool before); + virtual void VisitStmt_(const For* op) = 0; + +protected: + // fp16 and fp64 extension + bool enable_fp16_{false}; + bool enable_fp64_{false}; +}; + +} // namespace codegen +} // namespace TVM + +#endif diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.cc b/tvm/src/codegen/opencl/codegen_sdaccel.cc new file mode 100644 index 000000000..cba08fa2d --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_sdaccel.cc @@ -0,0 +1,219 @@ +# include +# include +# include +# include +# include "./codegen_sdaccel.h" +# include "../../runtime/thread_storage_scope.h" + +namespace TVM { +namespace codegen { + +void CodeGenSDACCEL::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { + // Clear previous generated state + this->InitFuncState(f); + for (Var arg: f->args) { + if (arg.type().is_handle()) { + alloc_storage_scope_[arg.get()] = "global"; + } + } + + // Skip the first underscore, so SSA variable starts from _1 + GetUniqueName("_"); + + // Register alloc buffer type + for (const auto & kv : f->handle_data_type) { + RegisterHandleType(kv.first.get(), kv.second.type()); + } + + this->stream << "__kernel " << "void " << f->name << "("; + + // Write arguments + for (size_t i = 0; i < f->args.size(); ++i) { + Var v = f->args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) this->stream << ", "; + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; + } + else { + auto arg = map_arg_type[vid]; + this->stream << "__global "; + // this->stream << "global "; + PrintType(std::get<1>(arg), this->stream); + if (v.type().is_handle()) + this->stream << "*"; + this->stream << ' ' << std::get<0>(arg); + } + } + stream << ") {\n"; + int func_scope = this->BeginScope(); + this->PrintStmt(f->body); + this->EndScope(func_scope); + this->PrintIndent(); + // this->stream << ' '<< ' ' << "return;\n"; + this->stream << "}\n\n"; +} + +void CodeGenSDACCEL::PrintType(Type t, std::ostream& os) { // NOLINT(*) + int lanes = t.lanes(); + if (t.is_handle()) { + //LOG(FATAL) << "The buffer shouldn't call PrintType for printing type"; + os << "void*"; + return ; + } + bool fail = false; + if (t.is_float()) { + switch (t.bits()) { + case 16: os << "half"; break; + case 32: os << "float"; break; + case 64: os << "double"; break; + // case 128: os << "double double"; break; + default: fail = true; break; + } + if (!fail && lanes == 1) return; + if (!fail && (lanes >= 2 && lanes <= 16)) { + os << lanes; return; + } + } else if (t.is_uint() || t.is_int()) { + if (t.is_uint()) { + os << "unsigned "; + } + if (t.bits() == 8 && t.lanes() == 4) { + // directly 4 8 bit int in integer. 
+ os << "int"; return; + } + + int target_bit = 1; + while (target_bit < t.bits()) + target_bit <<= 1; + + switch (target_bit) { + case 1: os << "int"; break; + case 2: os << "char"; break; + case 4: os << "char"; break; + case 8: os << "char"; break; + case 16: os << "short"; break; + case 32: os << "int"; break; + case 64: os << "long"; break; + case 128: os << "long"; break; // FIXME: Should use long long + default: fail = true; break; + } + if (!fail && lanes == 1) return; + // FIXME: Not yet support multiple lanes + //if (!fail && (lanes >= 2 && lanes <= 16)) { + // os << lanes; return; + //} + } + os << t; + LOG(WARNING) << "Cannot convert type " << t ; + return ; +} + +void CodeGenSDACCEL::PrintStorageScope( + const std::string& scope, std::ostream& os) { // NOLINT(*) + if (scope == "global" || scope == "shared") { + os << "__local "; + } +} + +void CodeGenSDACCEL::VisitStmt_(const For* op) { + std::ostringstream os; + if (op->for_type == ForType::Unrolled) { + int unroll_factor = 0, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto factor = op->annotate_values[i].as(); + if (str->value == "factor" && factor != nullptr && factor->value > 1) { + unroll_factor = factor->value; + break; + } + } + i++; + } + if (unroll_factor > 0) { + os << "__attribute__((opencl_unroll_hint("; + os << unroll_factor << ")))\n"; + } else { + os << "\n"; + } + } + else if (op->for_type == ForType::Pipelined) { + int II = 1, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto initiation_interval = op->annotate_values[i].as(); + if (str->value == "initiation_interval" && + initiation_interval != nullptr && + initiation_interval->value > 1) { + II = initiation_interval->value; + break; + } + } + i++; + } + os << "__attribute__((xcl_pipeline_loop("; + os << II << ")))\n"; + } + CodeGenSDACCEL::GenForStmt(op, os.str(), true); +} + +void CodeGenSDACCEL::VisitStmt_(const Partition* op) { + std::string vid = GetVarID(op->buffer_var.get()); + stream << vid << " "; + if (op->partition_type != PartitionType::Complete) { + stream << "__attribute__((xcl_array_partition("; + switch (op->partition_type) { + // case PartitionType::Complete: + // stream << "complete,"; + // break; + case PartitionType::Block: + stream << "block,"; + break; + case PartitionType::Cyclic: + stream << "cyclic,"; + break; + } + stream << op->factor << ","; + stream << op->dim << ")))\n"; + } else { + if (op->dim == 0) { + stream << "__attribute__((xcl_array_partition))\n"; + } else { + stream << "__attribute__((xcl_array_partition("; + stream << "complete,"; + stream << op->factor << ","; + stream << op->dim << ")))\n"; + } + } +} + +void CodeGenSDACCEL::VisitStmt_(const StreamStmt* op) { + std::string vid = GetVarID(op->buffer_var.get()); + PrintIndent(); + stream << vid; + switch (op->stream_type) { + case StreamType::Channel: + stream << "[channel]"; + break; + case StreamType::FIFO: + stream << "[fifo]"; + break; + case StreamType::Pipe: + stream << "[pipe]"; + break; + } + stream << ".write"; + PrintExpr(op->value, stream); + stream << ";\n"; +} + +void CodeGenSDACCEL::VisitExpr_(const StreamExpr* op, std::ostream& os) { + std::string vid = GetVarID(op->buffer_var.get()); + os << vid << ".read()"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.h b/tvm/src/codegen/opencl/codegen_sdaccel.h new file mode 100755 index 000000000..4f1cfa053 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_sdaccel.h @@ -0,0 +1,29 @@ +#ifndef 
TVM_CODEGEN_CODEGEN_SDACCEL_H_ +#define TVM_CODEGEN_CODEGEN_SDACCEL_H_ + +# include +# include +# include "./codegen_opencl.h" + +namespace TVM { +namespace codegen { + +class CodeGenSDACCEL : public CodeGenOpenCL { + public: + CodeGenSDACCEL(){} + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + + void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*) + + void VisitStmt_(const For* op) override; //NOLINT(*) + void VisitStmt_(const Partition* op) override; //NOLINT(*) + void VisitStmt_(const StreamStmt* op) override; //NOLINT(*) + + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*) + +}; +} // namespace codegen +} // namespace TVM + +#endif // TVM_CODEGEN_CODEGEN_SDACCEL_H_ diff --git a/tvm/src/codegen/opencl/sdaccel_module.cc b/tvm/src/codegen/opencl/sdaccel_module.cc new file mode 100644 index 000000000..63f12e86b --- /dev/null +++ b/tvm/src/codegen/opencl/sdaccel_module.cc @@ -0,0 +1,645 @@ +#include "./sdaccel_module.h" +#include +#include +#include +#include +#include +#include +#include + +namespace TVM { +namespace runtime { + +namespace { + +void PrintIndent(std::ofstream& stream, int indent) { + for (int i = 0; i < indent; i++) + stream << ' '; +} + +inline size_t GetTypeSize(TVMType t) { + size_t byte = (t.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + return byte; +} + +inline size_t GetDataSize(TVMArray* arr) { + size_t size = 1; + for (tvm_index_t i = 0; i < arr->ndim; ++i) { + size *= arr->shape[i]; + } + size_t byte = (arr->dtype.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + size *= (byte * 8 * arr->dtype.lanes + 7) / 8; + return size; +} + +inline TVMType Type2TVMType(Type t) { + TVMType tt; + if (t.is_int()) tt.code = kDLInt; + else if (t.is_uint()) tt.code = kDLUInt; + else if (t.is_float()) tt.code = kDLFloat; + else LOG(FATAL) << "Unacceptable type: " << t; + tt.bits = static_cast(t.bits()); + tt.fracs = static_cast(t.fracs()); + return tt; +} + +inline std::string Type2Str(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + str += "int"; + } else if (t.code == kDLUInt) { + str += "unsigned int"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2ExtStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += "ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2Byte(TVMType t) { + std::string str = ""; + if (t.code == kDLFloat) { + str += "float"; + } else if (t.code == kDLInt || t.code == kDLUInt) { + if (t.code == kDLUInt) str += "unsigned"; + str += "int"; + if (t.bits <= 8) str += "8"; + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + // str += "_t"; + } + return str; +} + +void 
CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + arg_sizes.push_back(GetDataSize(arr)); + arg_types.push_back(arr->dtype); + } else { + const Variable* var = func->api_args[i].as(); + TVMType t = Type2TVMType(var->type); + arg_sizes.push_back(GetTypeSize(t)); + arg_types.push_back(t); + } + } +} + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + // generate shared memory key and id + // TODO: maybe get the current path?? + key_t key = ftok("/", i+1); + int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT); + shmids.push_back(shmid); + // copy mem from TVM args to the shared memory + void* mem = shmat(shmid, nullptr, 0); + memcpy(mem, arr->data, arg_sizes[i]); + } else { + shmids.push_back(0); + } + } +} + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes) { + for (size_t i = 0; i < shmids.size(); i++) { + TVMArray* arr = args[i]; + int shmid = shmids[i]; + void* mem = shmat(shmid, nullptr, 0); + memcpy(arr->data, mem, arg_sizes[i]); + shmdt(mem); + shmctl(shmid, IPC_RMID, nullptr); + } +} + +// copy values from the shared mem to local mem +void PrintCopy(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "source_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2;j >= 0;j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = "; + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim - 1; + + int mul2 = 1; + for (int j = arr->ndim-2;j >= 0;j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + if (arr->dtype.fracs > 0) + stream << " >> " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +// copy values from local mem back to shared mem +void PrintCopyBack(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = "; + // stream << Type2ExtStr(arr->dtype); + stream << "source_" << nth_arr; + stream << "[i" << arr->ndim - 1; + int mul2 = 1; + for (int j = arr->ndim-2;j >=0;j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + if (arr->dtype.fracs > 0) + stream << " << " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +void 
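// Taken together, GenSharedMem/FreeSharedMem above implement a System V
// shared-memory round trip with the generated host binary; a condensed
// sketch of the protocol (the size variable and ftok key seed are
// illustrative):
//   int shmid = shmget(ftok("/", 1), size, 0666 | IPC_CREAT);
//   void* mem = shmat(shmid, nullptr, 0);
//   memcpy(mem, arr->data, size);              // stage the input
//   // ... generated host executable runs and overwrites mem ...
//   memcpy(arr->data, mem, size);              // collect the result
//   shmdt(mem); shmctl(shmid, IPC_RMID, nullptr);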
GenMakFile() { + int indent = 0; + std::ofstream stream; + stream.open("sdaccel.mk"); + indent += 4; + + stream << "ifndef XILINX_SDX\n"; + stream << "$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)\n"; + stream << "endif\n"; + + stream << "SDA_FLOW = cpu_emu\n"; + stream << "HOST_SRCS = host.cpp\n"; + stream << "HOST_EXE_DIR=.\n"; + stream << "HOST_EXE = host\n"; + stream << "HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL\n"; + stream << "HOST_LFLAGS = \n"; + stream << "KERNEL_SRCS = default_function.cl\n"; + stream << "KERNEL_NAME = default_function\n"; + stream << "KERNEL_DEFS =\n"; + stream << "KERNEL_INCS =\n"; + stream << "XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0\n"; + stream << "XDEVICE_REPO_PATH=\n"; + stream << "KEEP_TEMP=1\n"; + stream << "KERNEL_DEBUG=\n"; + stream << "XCLBIN_NAME=bin_krnl\n"; + stream << "HOST_CFLAGS+=-DTARGET_DEVICE=\\\"${XDEVICE}\\\"\n"; + stream << "BOARD_SETUP_FILE=setup.sh\n"; + stream << "ifeq (${SDA_FLOW},cpu_emu)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -t sw_emu\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin\n"; + stream << "else ifeq (${SDA_FLOW},hw_emu)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -t hw_emu\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin\n"; + stream << "else ifeq (${SDA_FLOW},hw)\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_hw.xclbin\n"; + stream << "CLCC_OPT += -t hw\n"; + stream << "endif\n"; + + stream << "HOST_ARGS = ${XCLBIN}\n"; + stream << "COMMON_DIR = ./common\n"; + stream << "include ${COMMON_DIR}/common.mk\n"; + + stream.close(); +} + +void GenCommonFile() { + int indent = 0; + std::ofstream stream; + stream.open("./common/common.mk"); + indent += 4; + stream << "SHELL = /bin/bash\n"; + stream << "VPATH = ./\n"; + stream << "CC = xcpp\n"; + stream << "CLCC = xocc\n"; + stream << "ifeq ($(XDEVICE_REPO_PATH),)\n"; + PrintIndent(stream, indent); + stream << "DEVICE_REPO_OPT = \n"; + stream << "else\n"; + stream << "DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}\n"; + stream << "endif\n"; + stream << "HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2\n"; + stream << "HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread\n"; + stream << "CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}\n"; + stream << "ifeq (${KEEP_TEMP},1)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -s\n"; + stream << "endif\n"; + stream << "ifeq (${KERNEL_DEBUG},1)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -g\n"; + stream << "endif\n"; + stream << "CLCC_OPT += --kernel ${KERNEL_NAME}\n"; + stream << "OBJECTS := $(HOST_SRCS:.cpp=.o)\n"; + stream << ".PHONY: all\n"; + stream << "all: run\n"; + + stream << "host: ${HOST_EXE_DIR}/${HOST_EXE}\n"; + stream << "xbin_cpu_em:\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=cpu_emu xbin -f sdaccel.mk\n"; + stream << "xbin_hw_em:\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw_emu xbin -f sdaccel.mk\n"; + stream << "xbin_hw :\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw xbin -f sdaccel.mk\n"; + stream << "xbin: ${XCLBIN}\n"; + stream << "run_cpu_em: \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=cpu_emu run_em -f sdaccel.mk\n"; + stream << "run_hw_em: \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw_emu 
run_em -f sdaccel.mk\n"; + stream << "run_hw : \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw run_hw_int -f sdaccel.mk\n"; + stream << "run_em: xconfig host xbin\n"; + PrintIndent(stream, indent); + stream << "XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n"; + stream << "run_hw_int : host xbin_hw\n"; + PrintIndent(stream, indent); + stream << "source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n"; + stream << "estimate : \n"; + PrintIndent(stream, indent); + stream << "${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}\n"; + stream << "xconfig : emconfig.json\n"; + stream << "emconfig.json :\n"; + PrintIndent(stream, indent); + stream << "emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .\n"; + stream << "${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}\n"; + PrintIndent(stream, indent); + stream << "${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@\n"; + stream << "${XCLBIN}:\n"; + PrintIndent(stream, indent); + stream << "${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}\n"; + stream << "%.o: %.cpp\n"; + PrintIndent(stream, indent); + stream << "${CC} ${HOST_CFLAGS} -c $< -o $@\n"; + stream << "clean:\n"; + PrintIndent(stream, indent); + stream << "${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil\n"; + stream << "cleanall: clean\n"; + PrintIndent(stream, indent); + stream << "${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou\n"; + + stream.close(); +} + +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc func, + std::string test_file) { + int indent = 0; + std::ofstream stream; + stream.open("host.cpp"); + indent += 2; + + stream << "#define CL_HPP_CL_1_2_DEFAULT_BUILD\n"; + stream << "#define CL_HPP_TARGET_OPENCL_VERSION 120\n"; + stream << "#define CL_HPP_MINIMUM_OPENCL_VERSION 120\n"; + stream << "#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + // stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#pragma once\n"; + stream << "\n\n"; + + // stream << test_file; + stream << "\n\n"; + + stream << "int main(int argc, char** argv) { \n"; + + stream << "#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)\n"; + indent += 2; + stream << " #define STR_VALUE(arg) #arg\n"; + stream << " #define GET_STRING(name) STR_VALUE(name)\n"; + stream << " #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)\n"; + stream << "#endif\n"; + + // get the krnl code + PrintIndent(stream, indent); + stream << "char* xclbinFilename = argv[1];\n"; + stream << "\n"; + + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "std::vector<" << Type2Str(arg_types[i]); + stream << "> "; + stream << "source_" << i << "("; + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + if (j == arr->ndim-1) { + stream << arr->shape[j] << ")"; + } else { + // stream << " * " << arr->shape[j] << ")"; + stream << arr->shape[j] << " * "; + } + } + stream << ";\n"; + } + stream << "\n"; + + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "size_t vector_size_bytes_" << i; + stream << " = sizeof(" << Type2Str(arg_types[i]); + 
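// For a 64x64 float32 argument, the loop around this point emits a host-side
// line of the form (shape and index are illustrative):
//   size_t vector_size_bytes_0 = sizeof(float) * 64 * 64;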
stream << ")"; + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + stream << " * " << arr->shape[j]; + } + stream << ";\n"; + } + stream << "\n"; + + for (int i = 0;i < args.size();i++ ) { + // read from the shared memory + PrintIndent(stream, indent); + stream << Type2Str(arg_types[i]) << "* "; + stream << "arg_" << i << " = "; + stream << "(" << Type2Str(arg_types[i]) << "*)"; + stream << "shmat(" << shmids[i] << ", nullptr, 0);\n"; + TVMArray* arr = args[i]; + // copy from shared mem + PrintCopy(arr, stream, indent, i); + } + + // Getting First Platform + PrintIndent(stream, indent); + stream << "std::vector platforms;\n"; + PrintIndent(stream, indent); + stream << "cl::Platform::get(&platforms);\n"; + PrintIndent(stream, indent); + stream << "cl::Platform platform = platforms[0];\n"; + stream << "\n"; + + // Getting ACCELERATOR Devices and selecting 1st such device + PrintIndent(stream, indent); + stream << "std::vector devices;\n"; + PrintIndent(stream, indent); + stream << "platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);\n"; + PrintIndent(stream, indent); + stream << "cl::Device device = devices[0];\n"; + stream << "\n"; + + // Creating Context and Command Queue for selected Device + PrintIndent(stream, indent); + stream << "cl::Context context(device);\n"; + PrintIndent(stream, indent); + stream << "cl::CommandQueue q(context, device);\n"; + stream << "\n"; + + // Loading XCL Bin into char buffer + PrintIndent(stream, indent); + stream << "std::ifstream bin_file(xclbinFilename, std::ifstream::binary);\n"; + PrintIndent(stream, indent); + stream << "bin_file.seekg (0, bin_file.end);\n"; + PrintIndent(stream, indent); + stream << "unsigned nb = bin_file.tellg();\n"; + PrintIndent(stream, indent); + stream << "bin_file.seekg (0, bin_file.beg);\n"; + PrintIndent(stream, indent); + stream << "char *buf = new char [nb];\n"; + PrintIndent(stream, indent); + stream << "bin_file.read(buf, nb);\n"; + stream << "\n"; + + // Creating Program from Binary File + PrintIndent(stream, indent); + stream << "cl::Program::Binaries bins;\n"; + PrintIndent(stream, indent); + stream << "bins.push_back({buf,nb});\n"; + PrintIndent(stream, indent); + stream << "devices.resize(1);\n"; + PrintIndent(stream, indent); + stream << "cl::Program program(context, devices, bins);\n"; + stream << "\n"; + + // Creating Kernel and Functor of Kernel + PrintIndent(stream, indent); + stream << "int err1;\n"; + PrintIndent(stream, indent); + stream << "cl::Kernel kernel(program, \"default_function\", &err1);\n"; + PrintIndent(stream, indent); + stream << "auto default_function = cl::KernelFunctor<"; + for (int i = 0;i < args.size();i++) { + if (i == args.size() - 1) { + stream << "cl::Buffer&>(kernel);\n"; + } else { + stream << "cl::Buffer&, "; + } + } + stream << "\n"; + + // Creating Buffers inside Device + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "cl::Buffer buffer_" << i; + stream << "(context, CL_MEM_READ_WRITE, vector_size_bytes_" << i << ");\n"; + } + stream << "\n"; + + // Copying input data to Device buffer from host memory + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "q.enqueueWriteBuffer(buffer_" << i; + stream << ", CL_TRUE, 0, vector_size_bytes_" << i; + stream << ", source_" << i << ".data());\n"; + } + stream << "\n"; + + // Running Kernel + PrintIndent(stream, indent); + stream << func->name << "("; + stream << "cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),"; + for (int i = 0; i < 
args.size(); i++) { + stream << "buffer_" << i; + if (i != args.size()-1) + stream << ", "; + } + stream << ");\n"; + PrintIndent(stream, indent); + stream << "q.finish();\n"; + stream << "\n"; + + // Copying Device result data to Host memory + for (int i = 0;i < args.size(); i++) { + PrintIndent(stream, indent); + stream << "q.enqueueReadBuffer(buffer_" << i; + stream << ", CL_TRUE, 0, vector_size_bytes_" << i; + stream << ", source_" << i << ".data());\n"; + } + stream << "\n"; + + // copy to shared mem + for (int i = 0;i < args.size();i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + PrintCopyBack(arr, stream, indent, i); + PrintIndent(stream, indent); + stream << "shmdt("; + stream << "arg_" << i << ");\n"; + } + } + + stream << "}\n"; + stream.close(); +} +} // namespace + + +class SDAccelModuleNode final : public ModuleNode { + public: + SDAccelModuleNode(LoweredFunc func, std::string test_file) + : func_(func), test_file_(test_file) {} + + const char* type_key() const { + return "sdaccel_sw_emu"; + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv){ + + if (args.size() != (int)func_->args.size()) + LOG(FATAL) << "The function should take in " << func_->args.size() + << " inputs but get " << args.size(); + std::vector arg_sizes; + std::vector arg_types; + std::vector shmids; + CollectArgInfo(args, func_, arg_sizes, arg_types); + GenSharedMem(args, shmids, arg_sizes); + LOG(CLEAN) << "Creating a Host file for SDAccel Runtime ..."; + GenHostCode(args, shmids, arg_types, func_, test_file_); + + LOG(CLEAN) << "Creating a Common folder for common.mk ..."; + system("mkdir common"); + GenCommonFile(); + + LOG(CLEAN) << "Creating a Makfile for compling the SDAccel OpenCL Code ..."; + GenMakFile(); + // TODO: find a better way to do the following + LOG(CLEAN) << "Compiling the generated SDAccel OpenCL Code ..."; + // system("make -f ./sdaccel.mk run_cpu_em"); + LOG(CLEAN) << "Running SDAccel OpenCL Software Simulation ..."; + LOG(CLEAN) << "Finished SDAccel OpenCL Software Simulation ..."; + // system("make -f sdaccel.mk cleanall"); + FreeSharedMem(args, shmids, arg_sizes); + }); + } + + private: + LoweredFunc func_; + std::string test_file_; +}; + +Module CreateSDAccelModule(LoweredFunc func, + std::string code) { + std::shared_ptr n = + std::make_shared(func, code); + + return Module(n); +} + +} // namespace runtime +} // namespace TVM diff --git a/tvm/src/codegen/opencl/sdaccel_module.h b/tvm/src/codegen/opencl/sdaccel_module.h new file mode 100644 index 000000000..01f361dba --- /dev/null +++ b/tvm/src/codegen/opencl/sdaccel_module.h @@ -0,0 +1,18 @@ +#ifndef SDACCEL_MODULE_H +#define SDACCEL_MODULE_H + +#include +#include +#include "../build_common.h" + +namespace TVM { +namespace runtime { + +Module CreateSDAccelModule( + LoweredFunc func, + std::string code); + +} // namespace runtime +} // namespace TVM + +#endif diff --git a/tvm/src/codegen/ppac/build_rv64_ppac.cc b/tvm/src/codegen/ppac/build_rv64_ppac.cc new file mode 100644 index 000000000..c14a1cdf3 --- /dev/null +++ b/tvm/src/codegen/ppac/build_rv64_ppac.cc @@ -0,0 +1,32 @@ +/* + * \file build_rv64_ppac.cc + */ + +#include "./codegen_rv64_ppac.h" +#include "../build_common.h" + +namespace TVM{ +namespace codegen{ + +std::string BuildRV64PPAC(Array funcs) { + CodeAnalysMerlinC ca; + CodeGenRV64PPAC cg; + for (LoweredFunc f: funcs) { + ca.AddFunction(f); + str2tupleMap map_arg_type; + 
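+    // ca.Finish() is assumed to hand back the argument name/type tuples
+    // gathered by the MerlinC analysis pass; these drive the pointer-typed
+    // argument printing in CodeGenRV64PPAC::AddFunction below.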
map_arg_type = ca.Finish();
+    cg.AddFunction(f, map_arg_type);
+  }
+  std::string code = cg.Finish();
+
+  LOG(WARNING) << "RV64_PPAC backend has no runtime; returning kernel code";
+  return code;
+}
+
+TVM_REGISTER_API("codegen.build_rv64_ppac")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildRV64PPAC(args[0]);
+  });
+
+} // namespace codegen
+} // namespace TVM
\ No newline at end of file
diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.cc b/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
new file mode 100644
index 000000000..1fd5e2b6e
--- /dev/null
+++ b/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
@@ -0,0 +1,202 @@
+/*
+ * \file codegen_rv64_ppac.cc
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "./codegen_rv64_ppac.h"
+#include "../build_common.h"
+
+namespace TVM {
+namespace codegen {
+
+void CodeGenRV64PPAC::AddFunction(LoweredFunc f,
+                                  str2tupleMap<std::string, Type> map_arg_type) {
+  // Clear previous generated state
+  this->InitFuncState(f);
+  // Register alloc buffer type
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+  // Write entry function name
+  this->stream << "void " << f->name << "(";
+  // Write arguments
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) this->stream << ", ";
+    if (map_arg_type.find(vid) == map_arg_type.end()) {
+      LOG(WARNING) << vid << " type not found\n";
+      PrintType(v.type(), this->stream);
+      this->stream << ' ' << vid;
+    }
+    else {
+      auto arg = map_arg_type[vid];
+      PrintType(std::get<1>(arg), this->stream);
+      this->stream << "*";
+      this->stream << ' ' << std::get<0>(arg);
+    }
+  }
+  stream << ") {\n";
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+}
+
+void CodeGenRV64PPAC::VisitStmt_(const For* op) {
+  std::string func_name;
+  bool is_ppac_func = false;
+  uint8_t i = 0;
+  for (auto key : op->annotate_keys) {
+    if (auto str = key.as<StringImm>()) {
+      if (str->value == "_ppac_func_name") {
+        auto name = op->annotate_values[i].as<StringImm>();
+        func_name = name->value;
+        is_ppac_func = true;
+        break;
+      }
+    }
+    ++i;
+  }
+  if (is_ppac_func) {
+    // scan along the annotate list to find parameters
+    std::string ret, arg0, arg1;
+    int batch_num, in_block_num, out_channel_num;
+    i = 0;
+    uint8_t param_num = 0;
+    for (auto key : op->annotate_keys) {
+      if (auto str = key.as<StringImm>()) {
+        if (str->value == "_ret") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          ret = v->value;
+          ++param_num;
+        } else if (str->value == "_arg0") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          arg0 = v->value;
+          ++param_num;
+        } else if (str->value == "_arg1") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          arg1 = v->value;
+          ++param_num;
+        } else if (str->value == "_batch_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          batch_num = v->value;
+          ++param_num;
+        } else if (str->value == "_in_block_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          in_block_num = v->value;
+          ++param_num;
+        } else if (str->value == "_out_channel_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          out_channel_num = v->value;
+          ++param_num;
+        }
+      }
+      ++i;
+    }
+    if (param_num != 6) {
+      LOG(FATAL) << "PPAC function call needs exactly 6 parameters but found " << param_num;
+    }
+    // print ppac function call
+    PrintIndent();
+    stream << func_name << "("
+           << ret << ", "
+           << arg0 << ", "
+           << arg1 << ", "
+           << batch_num << ", "
+           << in_block_num << ", "
+           << out_channel_num
+           << ");\n";
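+    // For reference, the statement above prints one runtime call of the
+    // form (names and values illustrative, not taken from the patch):
+    //   ppac_gemm(ret_buf, in_buf0, in_buf1, 8, 4, 16);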
return; + } + CodeGenC::VisitStmt_(op); +} + +void CodeGenRV64PPAC::VisitStmt_(const LetStmt* op) { + std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); + PrintType(op->var.type(), this->stream); + this->stream << ' ' + << vid + << " = " << value << ";\n"; + } + PrintStmt(op->body); +} + +void CodeGenRV64PPAC::VisitStmt_(const IfThenElse* op) { + std::string cond = PrintExpr(op->condition); + // Skip the buffer data checking + if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)"))) + return ; + PrintIndent(); + if (cond[0] == '(' && cond[cond.length() - 1] == ')') { + stream << "if " << cond << " {\n"; + } else { + stream << "if (" << cond << ") {\n"; + } + int then_scope = BeginScope(); + PrintStmt(op->then_case); + this->EndScope(then_scope); + if (op->else_case.defined()) { + PrintIndent(); + stream << "} else {\n"; + int else_scope = BeginScope(); + PrintStmt(op->else_case); + this->EndScope(else_scope); + } + PrintIndent(); + stream << "}\n"; +} + +void CodeGenRV64PPAC::PrintType(Type t, std::ostream& os) { + CHECK_EQ(t.lanes(), 1) + << "do not support vector types"; + if (t.is_uint() || t.is_int()) { + if (t.is_uint()) { + if (t.bits() <= 8) { + os << "uint8_t"; return; + } else if (t.bits() <= 16) { + os << "uint16_t"; return; + } else if (t.bits() <= 32) { + os << "uint32_t"; return; + } else if (t.bits() <= 64) { + os << "uint64_t"; return; + } else { + LOG(WARNING) << "Casting type " << t << " to uint64_t"; + os << "uint64_t"; + return; + } + } + else if (t.is_int()) { + if (t.bits() <= 8) { + os << "int8_t"; return; + } else if (t.bits() <= 16) { + os << "int16_t"; return; + } else if (t.bits() <= 32) { + os << "int32_t"; return; + } else if (t.bits() <= 64) { + os << "int64_t"; return; + } else { + LOG(WARNING) << "Casting type " << t << " to int64_t"; + os << "int64_t"; + return; + } + } + } + os << t; +} + +} //namespace codegen +} //namespace TVM \ No newline at end of file diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.h b/tvm/src/codegen/ppac/codegen_rv64_ppac.h new file mode 100644 index 000000000..881bdea05 --- /dev/null +++ b/tvm/src/codegen/ppac/codegen_rv64_ppac.h @@ -0,0 +1,28 @@ +/* + * \file codegen_rv64_ppac.h + */ + +#ifndef TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ +#define TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ + +#include +#include +#include "../codegen_c.h" +#include "../merlinc/codeanalys_merlinc.h" + +namespace TVM { +namespace codegen { + +class CodeGenRV64PPAC : public CodeGenC { + public: + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + void PrintType(Type t, std::ostream& os) override; + void VisitStmt_(const LetStmt* op) override; + void VisitStmt_(const IfThenElse* op) override; + void VisitStmt_(const For* op) override; +}; + +} // namespace codegen +} // namespace TVM + +#endif //TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ \ No newline at end of file diff --git a/tvm/src/lang/ir.cc b/tvm/src/lang/ir.cc index 3589de195..c88f8ea94 100644 --- a/tvm/src/lang/ir.cc +++ b/tvm/src/lang/ir.cc @@ -149,6 +149,8 @@ TVM_REGISTER_NODE_TYPE(Quantize); TVM_REGISTER_NODE_TYPE(KernelDef); TVM_REGISTER_NODE_TYPE(KernelExpr); TVM_REGISTER_NODE_TYPE(KernelStmt); +TVM_REGISTER_NODE_TYPE(StreamStmt); +TVM_REGISTER_NODE_TYPE(StreamExpr); TVM_REGISTER_NODE_TYPE(Return); TVM_REGISTER_NODE_TYPE(Break); TVM_REGISTER_NODE_TYPE(While); diff --git 
a/tvm/src/pass/ir_mutator.cc b/tvm/src/pass/ir_mutator.cc index ec67aa314..89485e723 100644 --- a/tvm/src/pass/ir_mutator.cc +++ b/tvm/src/pass/ir_mutator.cc @@ -202,6 +202,15 @@ Stmt IRMutator::Mutate_(const Store *op, const Stmt& s) { } } +Stmt IRMutator::Mutate_(const StreamStmt *op, const Stmt& s) { + Expr value = this->Mutate(op->value); + if (value.same_as(op->value)) { + return s; + } else { + return StreamStmt::make(op->buffer_var, value, op->stream_type, op->depth); + } +} + Stmt IRMutator::Mutate_(const Provide* op, const Stmt& s) { auto new_args = MutateArray(op->args, this); auto new_value = this->Mutate(op->value); @@ -321,7 +330,8 @@ Stmt IRMutator::Mutate_(const KernelDef *op, const Stmt &s) { if (body.same_as(op->body) && ret_void.same_as(op->ret_void)) { return s; } else { - return KernelDef::make(op->args, body, ret_void, op->ret_type, op->name); + return KernelDef::make(op->args, op->api_args, op->api_types, + body, ret_void, op->ret_type, op->name, op->channels); } } @@ -402,6 +412,7 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt) .DISPATCH_TO_MUTATE_STMT(Prefetch) .DISPATCH_TO_MUTATE_STMT(KernelDef) .DISPATCH_TO_MUTATE_STMT(KernelStmt) +.DISPATCH_TO_MUTATE_STMT(StreamStmt) .DISPATCH_TO_MUTATE_STMT(Return) .DISPATCH_TO_MUTATE_STMT(Break) .DISPATCH_TO_MUTATE_STMT(While) @@ -430,6 +441,10 @@ Expr IRMutator::Mutate_(const Load *op, const Expr& e) { } } +Expr IRMutator::Mutate_(const StreamExpr *op, const Expr& e) { + return e; +} + Expr IRMutator::Mutate_(const Let *op, const Expr& e) { Expr value = this->Mutate(op->value); Expr body = this->Mutate(op->body); @@ -665,6 +680,7 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_expr) .DISPATCH_TO_MUTATE_EXPR(SetBit) .DISPATCH_TO_MUTATE_EXPR(SetSlice) .DISPATCH_TO_MUTATE_EXPR(Quantize) +.DISPATCH_TO_MUTATE_EXPR(StreamExpr) .DISPATCH_TO_MUTATE_EXPR(KernelExpr); } // namespace ir diff --git a/tvm/src/pass/ir_visitor.cc b/tvm/src/pass/ir_visitor.cc index 160cb906e..6346c6262 100644 --- a/tvm/src/pass/ir_visitor.cc +++ b/tvm/src/pass/ir_visitor.cc @@ -252,6 +252,13 @@ void IRVisitor::Visit_(const KernelStmt *op) { } } +void IRVisitor::Visit_(const StreamStmt *op) { + this->Visit(op->value); +} + +void IRVisitor::Visit_(const StreamExpr *op) { +} + void IRVisitor::Visit_(const Return *op) { this->Visit(op->value); } @@ -338,6 +345,8 @@ TVM_STATIC_IR_FUNCTOR(IRVisitor, vtable) .DISPATCH_TO_VISIT(KernelDef) .DISPATCH_TO_VISIT(KernelExpr) .DISPATCH_TO_VISIT(KernelStmt) +.DISPATCH_TO_VISIT(StreamStmt) +.DISPATCH_TO_VISIT(StreamExpr) .DISPATCH_TO_VISIT(Return) .DISPATCH_TO_VISIT(Break) .DISPATCH_TO_VISIT(While) diff --git a/tvm/src/pass/split_host_device.cc b/tvm/src/pass/split_host_device.cc index 534e0b695..fdcd0c56f 100644 --- a/tvm/src/pass/split_host_device.cc +++ b/tvm/src/pass/split_host_device.cc @@ -81,6 +81,14 @@ class IRUseDefAnalysis : public IRMutator { return IRMutator::Mutate_(op, s); } + Stmt Mutate_(const StreamStmt *op, const Stmt& s) final { + if (!def_count_.count(op->buffer_var.get())) { + def_count_[op->buffer_var.get()] = 0; + use_count_[op->buffer_var.get()] = 0; + } + return IRMutator::Mutate_(op, s); + } + Expr Mutate_(const Let *op, const Expr& e) final { this->HandleDef(op->var.get()); Expr body = this->Mutate(op->body); @@ -109,6 +117,14 @@ class IRUseDefAnalysis : public IRMutator { return IRMutator::Mutate_(op, e); } + Expr Mutate_(const StreamExpr *op, const Expr& e) final { + if (!def_count_.count(op->buffer_var.get())) { + def_count_[op->buffer_var.get()] = 0; + use_count_[op->buffer_var.get()] = 0; + } + 
return IRMutator::Mutate_(op, e); + } + Stmt Mutate_(const KernelDef *op, const Stmt& s) { for (auto arg : op->args) { this->HandleDef(arg.get()); diff --git a/tvm/src/pass/stream_inference.cc b/tvm/src/pass/stream_inference.cc new file mode 100644 index 000000000..ec18b1871 --- /dev/null +++ b/tvm/src/pass/stream_inference.cc @@ -0,0 +1,345 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file remove_no_op.cc + * \brief Remove no op from the stmt + */ +#include +#include +#include +#include + +namespace TVM { +namespace ir { + +// use/def analysis to capture host xcel deps +class StreamUseDefAnalysis : public IRMutator { + public: + Stmt Mutate_(const AttrStmt *op, const Stmt& s) final { + if (op->attr_key == attr::device_scope) { + if (op->value.as()->value == "fpga") + host_scope_ = false; + return IRMutator::Mutate_(op, s); + } else { + return IRMutator::Mutate_(op, s); + } + } + + Stmt Mutate_(const LetStmt *op, const Stmt& s) final { + this->HandleDef(op->var.get()); + Stmt body = this->Mutate(op->body); + Expr value = this->Mutate(op->value); + if (body.same_as(op->body) && + value.same_as(op->value)) { + return s; + } else { + return LetStmt::make(op->var, value, body); + } + } + + Stmt Mutate_(const For *op, const Stmt& s) final { + this->HandleDef(op->loop_var.get()); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const Allocate *op, const Stmt& s) final { + this->HandleDef(op->buffer_var.get()); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const Store *op, const Stmt& s) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const StreamStmt *op, const Stmt& s) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, s); + } + + Expr Mutate_(const Let *op, const Expr& e) final { + this->HandleDef(op->var.get()); + Expr body = this->Mutate(op->body); + Expr value = this->Mutate(op->value); + if (body.same_as(op->body) && + value.same_as(op->value)) { + return e; + } else { + return Let::make(op->var, value, body); + } + } + + Expr Mutate_(const Variable *op, const Expr& e) final { + this->HandleUse(e); + return IRMutator::Mutate_(op, e); + } + + Expr Mutate_(const Load *op, const Expr& e) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, e); + } + + Expr Mutate_(const StreamExpr *op, const Expr& e) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, e); + } + + Stmt Mutate_(const KernelDef *op, const Stmt& s) { + for (auto arg : op->args) { + this->HandleDef(arg.get()); + } + Stmt body = this->Mutate(op->body); + for (auto arg : op->args) { + xcel_def_count_[arg.get()] = 0; + } + return s; + } + + void HandleDef(const Variable* v) { + if (host_scope_) { + CHECK(!host_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!host_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + host_use_count_[v] = 0; + host_def_count_[v] = 1; + } else { + CHECK(!xcel_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!xcel_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + xcel_use_count_[v] = 0; + xcel_def_count_[v] = 1; + } + } + + void HandleUse(const Expr& v) { + CHECK(v.as()); + Var var(v.node_); + if (host_scope_) { + auto it = host_use_count_.find(var.get()); + if (it != host_use_count_.end()) { + if (it->second >= 0) { + ++it->second; + 
}
+      } else {
+        host_undefined_.push_back(var);
+        host_use_count_[var.get()] = -1;
+      }
+    } else {
+      auto it = xcel_use_count_.find(var.get());
+      if (it != xcel_use_count_.end()) {
+        if (it->second >= 0) {
+          ++it->second;
+        }
+      } else {
+        xcel_undefined_.push_back(var);
+        xcel_use_count_[var.get()] = -1;
+      }
+    }
+  }
+
+  bool host_scope_{true};
+  Array<Var> host_undefined_;
+  Array<Var> xcel_undefined_;
+  std::unordered_map<const Variable*, int> host_use_count_;
+  std::unordered_map<const Variable*, int> host_def_count_;
+  std::unordered_map<const Variable*, int> xcel_use_count_;
+  std::unordered_map<const Variable*, int> xcel_def_count_;
+};
+
+
+class StreamMutator : public IRMutator {
+ public:
+  explicit StreamMutator(int bus_bandwidth) {
+    bus_bandwidth_ = bus_bandwidth;
+  }
+  // move device attr to allocate level
+  Stmt Mutate_(const AttrStmt* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    // if (op->attr_key == attr::device_scope)
+    //   return stmt.as<AttrStmt>()->body;
+    return stmt;
+  }
+
+  Stmt Mutate_(const For* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<For>();
+    auto extent = op->extent.as<IntImm>()->value;
+    auto min = op->min.as<IntImm>()->value;
+    // mutate sender: split and block inner loop
+    if (auto stream_op = op->body.as<StreamStmt>()) {
+      if (extent - min > bus_bandwidth_) {
+        LOG(WARNING) << "loop extent exceeds the bus bandwidth";
+      } else {
+      }
+    // mutate receiver: (StreamExpr + For(Store = GetSlice))
+    } else if (auto store_op = op->body.as<Store>()) {
+      if (store_op->value.as<StreamExpr>() == nullptr) return stmt;
+      if (extent - min > bus_bandwidth_) {
+        LOG(WARNING) << "loop extent exceeds the bus bandwidth";
+      } else {
+        return stmt;
+        // allocate intermediate buffer
+        VarExpr new_var(store_op->buffer_var.get()->name_hint + "_save");
+        Expr new_load = Load::make(store_op->buffer_var.type(), new_var, 0, const_true());
+        Stmt new_store = Store::make(store_op->buffer_var, new_load,
+                                     store_op->index, store_op->predicate);
+        Stmt new_for = For::make(op->loop_var, op->min, op->extent, op->for_type,
+                                 op->device_api, new_store);
+        // save stream data into intermediate buffer
+        Stmt read_in = Store::make(new_var, store_op->value,
+                                   Expr(0), const_true());
+        // allocate intermediate buffer
+        return Allocate::make(new_var,
+                              store_op->value.type(),
+                              {make_const(Int(bus_bandwidth_), 1)},
+                              const_true(), Block::make(read_in, new_for));
+      }
+    }
+    return stmt;
+  }
+
+  Stmt Mutate_(const StreamStmt* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<StreamStmt>();
+    const Variable* v = op->buffer_var.get();
+    stream_type_map_[v] = op->buffer_var.type();
+    return stmt;
+  }
+
+  Expr Mutate_(const StreamExpr* op, const Expr& e) final {
+    Expr expr = IRMutator::Mutate_(op, e);
+    op = expr.as<StreamExpr>();
+    const Variable* v = op->buffer_var.get();
+    stream_type_map_[v] = op->buffer_var.type();
+    return expr;
+  }
+ private:
+  int bus_bandwidth_;
+  bool is_host_{true};
+  std::unordered_map<const Variable*, Type> stream_type_map_;
+};
+
+// Mark the statement scope of each stage.
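+// StreamInferer (below) pattern-matches an Allocate whose producer block
+// carries a device_scope attribute; it then runs StreamMutator over the
+// allocate body and re-attaches the attribute above the rebuilt Allocate.
+// Roughly (IR sketch, shapes illustrative):
+//   allocate buf { produce { attr device_scope ... } }
+//     becomes  attr device_scope { allocate buf { ... } }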
+class StreamInferer : public IRMutator { + public: + explicit StreamInferer(int bus_bandwidth) { + bus_bandwidth_ = bus_bandwidth; + } + + Stmt Mutate_(const Allocate* op, const Stmt& s) final { + Stmt stmt = IRMutator::Mutate_(op, s); + op = stmt.as(); + if (auto block = op->body.as()) { + if (auto producer = block->first.as()){ + if (const AttrStmt* attr_stmt = producer->body.as()) { + if (const AttrStmt* device_attr = attr_stmt->body.as()) { + if (device_attr->attr_key == attr::device_scope) { + // mutate allocate body + StreamMutator mutator(bus_bandwidth_); + // allocate stream for host + Stmt new_body = mutator.Mutate(op->body); + Stmt new_stmt = Allocate::make(op->buffer_var, + op->type, + op->extents, + op->condition, + new_body); + return AttrStmt::make(device_attr->node, + attr::device_scope, + device_attr->value, + new_stmt); + } + } + } + } + } + return stmt; + } + + // Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // return is_no_op(op->body) ? op->body : stmt; + // } + + // Stmt Mutate_(const Store* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // auto it = var_remap_.find(op->buffer_var.get()); + // if (it != var_remap_.end() && + // !it->second.same_as(op->buffer_var)) { + // CHECK(it->second.as()); + // VarExpr buf_var(it->second.node_); + // if (has_stencil_) outputs_.insert(buf_var); + // return Store::make(buf_var, op->value, op->index, op->predicate); + // } else { + // return stmt; + // } + // } + + // Stmt Mutate_(const AttrStmt* op, const Stmt& s) final { + // if (op->attr_key == attr::realize_scope) { + // storage_scope_[op->node.get()] = op->value.as()->value; + // return this->Mutate(op->body); + // } else if (op->attr_key == attr::double_buffer_scope) { + // Operation func(op->node.node_); + // Stmt body = Mutate(op->body); + // for (int i = 0; i < func->num_outputs(); ++i) { + // TensorKey key{func, i}; + // auto it = buf_map_.find(key); + // CHECK(it != buf_map_.end()) + // << "Cannot find allocated buffer for " << key.f; + // body = AttrStmt::make( + // it->second.buffer->data, op->attr_key, op->value, body); + // } + // return body; + // } else if (op->attr_key == attr::thread_extent) { + // IterVar iv(op->node.node_); + // ThreadScope ts = ThreadScope::make(iv->thread_tag); + // curr_thread_scope_.push_back(ts); + // Stmt stmt = IRMutator::Mutate_(op, s); + // curr_thread_scope_.pop_back(); + // return stmt; + // } else if (op->attr_key == attr::buffer_bind_scope) { + + // Stmt Mutate_(const For* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // return is_no_op(op->body) ? MakeEvaluate({op->min, op->extent}) : stmt; + // } + + private: + int bus_bandwidth_; + Stmt MakeEvaluate(Expr value) { + if (HasSideEffect(value)) { + return Evaluate::make(value); + } else { + return Evaluate::make(0); + } + } + Stmt MakeEvaluate(const Array& values) { + Stmt stmt; + for (Expr e : values) { + if (HasSideEffect(e)) { + if (stmt.defined()) { + stmt = Block::make(stmt, Evaluate::make(e)); + } else { + stmt = Evaluate::make(e); + } + } + } + return stmt.defined() ? 
stmt : Evaluate::make(0); + } +}; + +Stmt InferStream(Stmt stmt, + int bus_bandwidth) { + return StreamInferer(bus_bandwidth).Mutate(stmt); +} + +} // namespace ir +} // namespace TVM diff --git a/tvm/src/schedule/compute_primitive.h b/tvm/src/schedule/compute_primitive.h index e65885462..e7167257c 100644 --- a/tvm/src/schedule/compute_primitive.h +++ b/tvm/src/schedule/compute_primitive.h @@ -33,6 +33,14 @@ Stmt PerformComputeAt(Stmt& producer, size_t& attach_level, std::unordered_map& sub); +Stmt StreamFromProducer(Stmt& stmt, + Buffer& producer_buf, + ir::StreamType& type); + +Stmt StreamToConsumer(Stmt& stmt, + Buffer& producer_buf, + ir::StreamType& type); + Stmt UpdateIterVarAttr(Stmt& stmt, const IterVar& var, const IterVarAttrNode* node); diff --git a/tvm/src/schedule/schedule_dataflow_rewrite.cc b/tvm/src/schedule/schedule_dataflow_rewrite.cc index b2bd520e7..a7fc8ee72 100644 --- a/tvm/src/schedule/schedule_dataflow_rewrite.cc +++ b/tvm/src/schedule/schedule_dataflow_rewrite.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "./message_passing.h" #include "../pass/ir_util.h" @@ -27,12 +28,33 @@ size_t FindNodeRef(ArrayNode* array_node, const T& v) { return array_node->data.size(); } +// The replacer of cache. +class LoadReplacer : public ir::IRMutator { + public: + explicit LoadReplacer( + const std::unordered_map& vsub) + : vsub_(vsub) {} + + Expr Mutate_(const Load* op, const Expr& e) { + const Variable* var = op->buffer_var.as(); + auto it = vsub_.find(var); + if (it != vsub_.end()) + return Load::make(op->type, it->second, + op->index, op->predicate); + return e; + } + + private: + const std::unordered_map& vsub_; +}; + // The replacer of cache. class VarReplacer : public ir::IRMutator { public: explicit VarReplacer( const std::unordered_map& vsub) : vsub_(vsub) {} + Expr Mutate_(const Variable* op, const Expr& e) { auto it = vsub_.find(op); if (it != vsub_.end()) return it->second; @@ -43,6 +65,17 @@ class VarReplacer : public ir::IRMutator { const std::unordered_map& vsub_; }; +// create indices for store +Expr getIndex(std::vector indices, const Array shape) { + Expr ret = indices[0]; + Expr mul = 1; + for (size_t i = 1; i < indices.size(); i++) { + ret = Simplify(ret + indices[i] * mul); + mul = Simplify(mul * shape[i]); + } + return ret; +} + Expr InjectPredicate(const Array& predicates, Expr body) { using ir::Reduce; @@ -74,6 +107,120 @@ void ReplaceDataFlow(const Array& stages, } } +class StreamConsumer final : public IRMutator { + public: + StreamConsumer( + const std::string& target, + const ir::StreamType& type, + int channel_index) + : target_(target), type_(type), + channel_index_(channel_index) {} + + Expr Mutate_(const Load* op, const Expr& e) { + Expr index = op->index; + std::string target_name = op->buffer_var.get()->name_hint; + if (target_ == target_name) { + Array keys, values; + keys.push_back(StringImm::make("index")); + values.push_back(IntImm::make(Int(32), channel_index_)); + return StreamExpr::make(op->type, op->buffer_var, + type_, 10, keys, values); + } else { + return Load::make(op->type, op->buffer_var, + index, op->predicate); + } + } + + private: + const std::string target_; + const ir::StreamType type_; + const int channel_index_; +}; + +class StreamProducer final : public IRMutator { + public: + StreamProducer( + const std::string& target, + const ir::StreamType& type, + int channel_index) + : target_(target), type_(type), + channel_index_(channel_index) {} + + Stmt Mutate_(const Store* op, const Stmt& s) { + Expr index 
= op->index; + Expr value = this->Mutate(op->value); + std::string target_name = op->buffer_var.get()->name_hint; + if (target_name == target_) { + Array keys, values; + keys.push_back(StringImm::make("index")); + values.push_back(IntImm::make(Int(32), channel_index_)); + return StreamStmt::make(op->buffer_var, value, + type_, 10, keys, values); + } else { + return Store::make(op->buffer_var, value, + index, op->predicate); + } + } + + private: + const std::string target_; + const ir::StreamType type_; + const int channel_index_; +}; + +class KernelUpdater final : public IRMutator { + public: + static int channelCount; + KernelUpdater( + const int arg_pos, + const ir::StreamType& type, + const bool is_producer, + const bool kernel_channel) + : arg_pos_(arg_pos), type_(type), + is_producer_(is_producer), + kernel_channel_(kernel_channel) { + if (kernel_channel_) channel_index_ = getIndex(); + } + + Stmt Mutate_(const KernelDef* op, const Stmt& s) { + Stmt stmt = op->body; + // arr saves arg_pos and common channel idx + Array arr = op->channels; + CHECK(op->channels.size() % 2 == 0) + << "arg_pos, index pair number mismatch"; + arr.push_back(IntImm::make(Int(32), arg_pos_)); + arr.push_back(IntImm::make(Int(32), channel_index_)); + std::string target_ = op->args[arg_pos_].get()->name_hint; + if (is_producer_) { // mutate target load + StreamProducer mutator(target_, type_, channel_index_); + stmt = mutator.Mutate(stmt); + } else { // replace load consumer + StreamConsumer mutator(target_, type_, channel_index_); + stmt = mutator.Mutate(stmt); + } + // update kernel arg signature + return KernelDef::make(op->args, op->api_args, + op->api_types, stmt, op->ret_void, + op->ret_type, op->name, arr); + } + private: + const int arg_pos_; + const ir::StreamType type_; + const bool is_producer_; + const bool kernel_channel_; + int channel_index_{0}; + int getIndex() { + channelCount += 1; + int channel_num = channelCount; + if (channelCount % 2 == 0) + channel_num = channelCount - 1; + return channel_num; + } +}; + +// Initialize static channel count +int KernelUpdater::channelCount = 0; + class ParentStmtCollector final : public IRMutator { public: ParentStmtCollector( @@ -117,6 +264,369 @@ class ParentStmtCollector final : public IRMutator { const IterVar& axis_; }; +// initialize static split bound +int Schedule::split_bound = 0; + +// stream buffer data to kernel stage +void Schedule::to_stage(const Tensor& target, + /*kernel def stage*/ Stage dest, + /*position index*/int arg_pos, + StreamType stream_type, + int channel_depth, + std::string name) { + Stage target_stage = (*this)[target]; + Buffer target_buffer; + + // target stage as kernel def operator + if (const ExternOpNode* op = target_stage->op.as()) { + target_buffer = op->output_placeholders[0]; + // remove the receiver buffer (keep the device scope) + const AttrStmt* attr = op->body.as(); + Stmt scope_attr = AttrStmt::make(attr->node, attr->attr_key, + attr->value, Evaluate::make(0)); + target_stage->op = ExternOpNode::make(op->name, + "", + Array(), + op->inputs, + op->input_placeholders, + op->output_placeholders, + scope_attr); + // update dest stage body for data stream in + const ExternOpNode* destOp = dest->op.as(); + KernelUpdater mutator(arg_pos, stream_type, + /*is producer*/false, + /*inter module channel*/false); + auto new_body = mutator.Mutate(destOp->body); + dest->op = ExternOpNode::make(destOp->name, destOp->tag, + destOp->axis, destOp->inputs, + destOp->input_placeholders, + Array(), + new_body); + } +} + +// stream 
data between hardware modules
+void Schedule::stream_to(const Tensor& target,
+                         Stage dest,
+                         Stage source,
+                         StreamType stream_type,
+                         int channel_depth,
+                         std::string new_name) {
+  Stage target_stage = (*this)[target];
+  std::vector<Stage> consumers;
+  size_t num_stage = (*this)->stages.size();
+  Buffer target_buffer;
+  std::unordered_map<Stage, size_t> pos;
+  const ExternOpNode* destOp = dest->op.as<ExternOpNode>();
+  const ExternOpNode* srcOp = source->op.as<ExternOpNode>();
+
+  // update kernel def and scope
+  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
+  bool is_placeholder = op ? true : false;
+  if (is_placeholder) {
+    for (size_t i = 0; i < num_stage; i++) {
+      Stage s = (*this)->stages[i];
+      // name matching to locate kernels
+      if (const ExternOpNode* op = s->op.as<ExternOpNode>()) {
+        for (size_t j = 0; j < op->inputs.size(); j++) {
+          if (target == op->inputs[j]) {
+            target_buffer = op->input_placeholders[j];
+            consumers.push_back(s);
+            // record streamed data pos in kernel call
+            if (std::regex_match(op->name,
+                    std::regex(destOp->name + "(\\d)")))
+              pos[dest] = j;
+            else if (std::regex_match(op->name,
+                         std::regex(srcOp->name + "(\\d)")))
+              pos[source] = j;
+            break;
+          }
+        }
+      }
+    }
+  } else { // only consumed by self stage
+    const ExternOpNode* op = target_stage->op.as<ExternOpNode>();
+    target_buffer = op->output_placeholders[0];
+    consumers.push_back(target_stage);
+  }
+  // mutator (is_producer false, kernel_channel true)
+  KernelUpdater destMutator(0, //target_buffer->name,
+                            stream_type, false, true);
+  // mutate kernel def and replace loads/stores on the target
+  dest->op = ExternOpNode::make(destOp->name,
+                                destOp->tag,
+                                destOp->axis,
+                                destOp->inputs,
+                                destOp->input_placeholders,
+                                Array<Buffer>(),
+                                destMutator.Mutate(destOp->body));
+  // mutator (is_producer true, kernel_channel true)
+  KernelUpdater srcMutator(0, //target_buffer->name,
+                           stream_type, true, true);
+  source->op = ExternOpNode::make(srcOp->name,
+                                  srcOp->tag,
+                                  srcOp->axis,
+                                  srcOp->inputs,
+                                  srcOp->input_placeholders,
+                                  Array<Buffer>(),
+                                  srcMutator.Mutate(srcOp->body));
+  // update kernel call ops
+  for (auto s : consumers) {
+    const ExternOpNode* op = s->op.as<ExternOpNode>();
+    Stmt body = AttrStmt::make(VarExpr(),
+                               "device_scope",
+                               StringImm::make("fpga"),
+                               op->body);
+    // do not allocate a buffer for the kernel call
+    s->op = ExternOpNode::make(op->name,
+                               op->tag,
+                               op->axis,
+                               op->inputs,
+                               op->input_placeholders,
+                               Array<Buffer>(),
+                               body);
+  }
+}
+
+// move data to device
+Tensor Schedule::move_to(const Tensor& target,
+                         DeviceType device_type,
+                         StreamType stream_type,
+                         int channel_depth,
+                         std::string new_name) {
+  Stage target_stage = (*this)[target];
+  std::vector<Stage> consumers;
+  size_t num_stage = (*this)->stages.size();
+  size_t min_pos = num_stage;
+  ArrayNode* stages = (*this)->stages.CopyOnWrite();
+  Buffer target_buffer;
+
+  // create producer and consumer stages for placeholder
+  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
+  bool is_placeholder = op ?
true : false; + if (is_placeholder) { + min_pos = 0; + for (size_t i = 0; i < num_stage; i++) { + Stage s = (*this)->stages[i]; + if (const ExternOpNode* op = s->op.as()) { + for (size_t j = 0; j < op->inputs.size(); j++) { + if (target == op->inputs[j]) { + target_buffer = op->input_placeholders[j]; + consumers.push_back(s); + break; + } + } + } + } + } else { // move data generated by extern op + min_pos = FindNodeRef(stages, target_stage) + 1; + const ExternOpNode* op = target_stage->op.as(); + target_buffer = op->output_placeholders[0]; + for (size_t i = 0; i < num_stage; i++) { + Stage s = (*this)->stages[i]; + if (const ExternOpNode* stage_op = s->op.as()) { + for (size_t j = 0; j < stage_op->inputs.size(); j++) { + if (op->output_placeholders[0] == stage_op->input_placeholders[j]) { + consumers.push_back(s); + break; + } + } + } + } + } + + // create sender and write into streaming channel + Array consumer_inputs; + Array consumer_input_placeholders; + Array consumer_output_placeholders; + std::string consumer_name = target_buffer->name + ".stream_send"; + Buffer consumer_buffer = BufferNode::make(Var(consumer_name, Handle()), + target->dtype, + target->shape, + Array(), + Expr(), + consumer_name, + "", 0, 0); + consumer_inputs.push_back(target); + consumer_input_placeholders.push_back(target_buffer); + consumer_output_placeholders.push_back(consumer_buffer); + + // create statement index + std::vector csm_indices; + std::vector csm_loop_vars; + for (size_t i = 0; i < target->shape.size(); i++) { + VarExpr iter(target_buffer->name + std::to_string(i)); + csm_indices.push_back(iter); + csm_loop_vars.push_back(iter); + } + Expr csm_index = getIndex(csm_indices, target->shape); + Expr load_expr = Load::make(target->dtype, + target_buffer->data, + csm_index, + UIntImm::make(UInt(1), 1)); + Stmt consumer_body = StreamStmt::make(consumer_buffer->data, + load_expr, + stream_type, + channel_depth); + + Expr sender_scope, receiver_scope; + size_t consumer_pos = min_pos; + switch (device_type) { + case DeviceType::CPU: + consumer_pos = num_stage; + sender_scope = StringImm::make("fpga"); + receiver_scope = StringImm::make("cpu"); + break; + case DeviceType::FPGA: + sender_scope = StringImm::make("cpu"); + receiver_scope = StringImm::make("fpga"); + break; + case DeviceType::GPU: + sender_scope = StringImm::make("cpu"); + receiver_scope = StringImm::make("gpu"); + break; + } + + for (size_t j = 0; j < target->shape.size(); j++) { + consumer_body = For::make( + VarExpr(csm_loop_vars[j]), + 0, target->shape[j], + ForType::Serial, + DeviceAPI::None, + consumer_body); + } + + consumer_body = AttrStmt::make( + consumer_buffer->data, + "device_scope", sender_scope, consumer_body); + + // create new stage and return stream tensors + // auto n = std::make_shared(); + // n->name = consumer_name; + // n->body = consumer_body; + // n->inputs = consumer_inputs; + // n->input_placeholders = consumer_input_placeholders; + // n->output_placeholders = consumer_output_placeholders; + // Operation consumer_op(n); + + Operation consumer_op = ExternOpNode::make(consumer_name, + "", + Array(), + consumer_inputs, + consumer_input_placeholders, + consumer_output_placeholders, + consumer_body); + Stage consumer_stage = Stage(consumer_op); + // insert sender before bound for (host,xcel <- host) case + if (device_type == DeviceType::FPGA) { + if (split_bound == 0) { + split_bound = consumer_pos + 1; + } else { // insert host sender before bound + consumer_pos = split_bound; + split_bound += 1; + } + } + 
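+  // splice the sender stage into the stage list at consumer_pos;
+  // split_bound (maintained above) is read here as marking the host/xcel
+  // boundary, so later move_to() calls keep host-side senders grouped
+  // before it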
stages->data.insert(stages->data.begin() + consumer_pos, consumer_stage.node_);
+  (*this)->stage_map.Set(consumer_op, consumer_stage);
+
+  // build producer (receiver) stage which takes in data from the streaming
+  // channel and provides data to the original consumers
+  Array<Tensor> producer_inputs;
+  Array<Buffer> producer_input_placeholders;
+  Array<Buffer> producer_output_placeholders;
+  std::string producer_name = target_buffer->name + ".stream_recv";
+  Buffer producer_buffer = BufferNode::make(Var(producer_name, Handle()),
+                                            target->dtype,
+                                            target->shape,
+                                            Array<Expr>(),
+                                            Expr(),
+                                            producer_name,
+                                            "", 0, 0);
+  // producer_inputs.push_back(consumer_op.output(0));
+  // producer_input_placeholders.push_back(consumer_buffer);
+  producer_output_placeholders.push_back(producer_buffer);
+  // streaming producer tensor reading from placeholder
+  Expr stream = StreamExpr::make(target->dtype,
+                                 consumer_buffer->data,
+                                 stream_type,
+                                 channel_depth);
+  // create for loops for tensor init
+  std::vector<Expr> indices;
+  std::vector<VarExpr> loop_vars;
+  for (size_t i = 0; i < target->shape.size(); i++) {
+    VarExpr iter(target_buffer->name + std::to_string(i));
+    indices.push_back(iter);
+    loop_vars.push_back(iter);
+  }
+  Expr index = getIndex(indices, target->shape);
+  // store op initialized with variable node
+  Stmt for_stmt = Store::make(producer_buffer->data,
+                              stream, index,
+                              UIntImm::make(UInt(1), 1));
+  for (size_t j = 0; j < target->shape.size(); j++) {
+    for_stmt = For::make(
+        VarExpr(loop_vars[j]),
+        0, target->shape[j],
+        ForType::Serial,
+        DeviceAPI::None,
+        for_stmt);
+  }
+
+  // attr annotates new scope
+  Stmt body = AttrStmt::make(
+      target_buffer->data,
+      "device_scope", receiver_scope, for_stmt);
+  Tensor producer = ExternOpNode::make(producer_buffer->name,
+                                       "",
+                                       Array<IterVar>(),
+                                       producer_inputs,
+                                       producer_input_placeholders,
+                                       producer_output_placeholders,
+                                       body).output(0);
+
+  // recv stage creation + return tensor
+  Stage producer_stage = Stage(producer->op);
+  size_t pos = FindNodeRef(stages, consumer_stage);
+  if (split_bound == 0 || device_type == DeviceType::CPU)
+    pos = pos + 1;
+  else pos = split_bound + 1;
+  stages->data.insert(stages->data.begin() + pos, producer_stage.node_);
+  (*this)->stage_map.Set(producer->op, producer_stage);
+
+  // update consumer stages with new tensor and buffer
+  std::unordered_map<const Variable*, VarExpr> vsub;
+  vsub[target_buffer->data.as<Variable>()] = producer_buffer->data;
+  for (size_t i = 0; i < consumers.size(); i++) {
+    Stage s = consumers[i];
+    Array<Tensor> new_inputs;
+    Array<Buffer> new_input_placeholders;
+    const ExternOpNode* op = s->op.as<ExternOpNode>();
+    new_inputs.push_back(producer);
+    new_input_placeholders.push_back(producer_buffer);
+    for (size_t j = 0; j < op->inputs.size(); j++) {
+      if (target != op->inputs[j]) {
+        new_inputs.push_back(op->inputs[j]);
+        new_input_placeholders.push_back(op->input_placeholders[j]);
+      }
+    }
+    Stmt body = LoadReplacer(vsub).Mutate(op->body);
+    Stmt new_body = AttrStmt::make(
+        target_buffer->data,
+        "device_scope",
+        receiver_scope,
+        body);
+    s->op = ExternOpNode::make(
+        op->name,
+        op->tag,
+        op->axis,
+        new_inputs,
+        new_input_placeholders,
+        op->output_placeholders,
+        new_body);
+  }
+  return producer;
+}
+
 Tensor Schedule::reuse_at(const Tensor& target,
                           Stage parent,
                           IterVar axis,
diff --git a/tvm/src/schedule/schedule_ops.cc b/tvm/src/schedule/schedule_ops.cc
index b4f8e7468..8156844f5 100644
--- a/tvm/src/schedule/schedule_ops.cc
+++ b/tvm/src/schedule/schedule_ops.cc
@@ -349,7 +349,7 @@ Stmt ScheduleOps(
       << "call schedule.normalize before scheduleops";
   CHECK(s->op.defined());
   // no need to
specify place holder op. - if (s->op.as()) continue; + if (auto op = s->op.as()) continue; // Remove grouping sugar, get the real attach spec. Stage attach_spec = s.GetAttachSpec(); diff --git a/tvm/src/template/sdaccel/CLKernel.cpp b/tvm/src/template/sdaccel/CLKernel.cpp new file mode 100644 index 000000000..84cf29465 --- /dev/null +++ b/tvm/src/template/sdaccel/CLKernel.cpp @@ -0,0 +1,67 @@ +/*===============================================================*/ +/* */ +/* CLKernel.cpp */ +/* */ +/* Defines the object class for an OpenCL kernel */ +/* */ +/*===============================================================*/ + +#include "CLKernel.h" +#include + +namespace rosetta +{ + // initialize the kernel from binary file + CLKernel::CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id) + { + printf("Creating kernel %s ... ", kernel_name.c_str()); + + int err; + + // set the name and device ID + this->device_id = device_id; + this->kernel_name = kernel_name; + + // Create the compute kernel in the program we wish to run + kernel = clCreateKernel(program, kernel_name.c_str(), &err); + if (!kernel || err != CL_SUCCESS) + { + printf("Error: Failed to create compute kernel!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + } + + void CLKernel::set_global(int global_work_size[3]) + { + printf("Set global work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), + global_work_size[0], global_work_size[1], global_work_size[2]); + + for (int i = 0; i < 3; i ++ ) + this->global_size[i] = global_work_size[i]; + } + + void CLKernel::set_local(int local_work_size[3]) + { + printf("Set local work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), + local_work_size[0], local_work_size[1], local_work_size[2]); + + for (int i = 0; i < 3; i ++ ) + this->local_size[i] = local_work_size[i]; + } + + std::string CLKernel::get_name() + { + return this->kernel_name; + } + + void CLKernel::releaseKernel() + { + printf("Release kernel %s ... 
", kernel_name.c_str()); + // release kernel + clReleaseKernel(kernel); + printf("Done!\n"); + } +} diff --git a/tvm/src/template/sdaccel/CLKernel.h b/tvm/src/template/sdaccel/CLKernel.h new file mode 100644 index 000000000..2933913c8 --- /dev/null +++ b/tvm/src/template/sdaccel/CLKernel.h @@ -0,0 +1,96 @@ +/*===============================================================*/ +/* */ +/* CLKernel.h */ +/* */ +/* Defines the object class for an OpenCL kernel */ +/* */ +/*===============================================================*/ + + +#ifndef __CLKernel__Harness__ +#define __CLKernel__Harness__ + +// standard headers +#include +#include +#include +// opencl header +#include +// CLMemObj is a member of this class +#include "CLMemObj.h" + +namespace rosetta +{ + + // wrapper class around an OpenCL kernel + class CLKernel + { + + friend class CLWorld; + + public: + + // constructor + // compiles the kernel + CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id); + + // set global/local work group size + void set_global(int global_work_size[3]); + void set_local(int local_work_size[3]); + + // get kernel name + std::string get_name(); + + protected: + + // set cl_mem argument + int set_mem_arg(int id, cl_mem mem_obj) + { + int err; + err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj); + if (err != CL_SUCCESS) + { + printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str()); + printf("Error Code %d\n", err); + return EXIT_FAILURE; + } + + return err; + } + + // set memory arguments for this kernel + template + int set_const_arg(int id, T& mem_obj) + { + int err; + // printf("%d\n", mem_obj); + err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj); + printf("****************\n"); + printf("%d\n", err); + if (err != CL_SUCCESS) + { + printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str()); + printf("Error Code %d\n", err); + return EXIT_FAILURE; + } + + return err; + } + + void releaseKernel(); + + private: + + // global and local work group size + size_t global_size[3]; + size_t local_size[3]; + + // kernel information and objects + std::string kernel_name; + cl_device_id device_id; // target device id + cl_kernel kernel; // compute kernel + + }; + +} +#endif /* defined(__CLKernel__Harness__) */ diff --git a/tvm/src/template/sdaccel/CLMemObj.cpp b/tvm/src/template/sdaccel/CLMemObj.cpp new file mode 100644 index 000000000..a6fdecf4a --- /dev/null +++ b/tvm/src/template/sdaccel/CLMemObj.cpp @@ -0,0 +1,57 @@ +/*===============================================================*/ +/* */ +/* CLMemObj.cpp */ +/* */ +/* Implements the member functions of CLMemObj class */ +/* */ +/*===============================================================*/ + + +#include "CLMemObj.h" + +namespace rosetta +{ + // default constructor, initializes everything to 0 + CLMemObj::CLMemObj() + { + this->mem_data = nullptr; + this->elt_size = 0; + this->length = 0; + this->flags = 0; + this->bank = nullptr; + } + + // meaningful constructor, initialize data info constants + CLMemObj::CLMemObj(void *mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext ) + { + this->mem_data = mem_data; + this->elt_size = elt_size; + this->length = length; + this->flags = flags; + // can use Xilinx mem extensions to specify DDR bank + if (xil_ext != nullptr) + { + this->bank = new cl_mem_ext_ptr_t; + this->bank->flags = xil_ext->flags; + this->bank->obj = 
xil_ext->obj; + this->bank->param = 0; + } + else + this->bank = nullptr; + } + + // return the pointer to data + void * CLMemObj::get_data() { return mem_data; } + + // get size of each element + int CLMemObj::get_element_size() { return elt_size; } + + // get the number of elements in the buffer + int CLMemObj::get_length() { return length; } + + // get OpenCL memory flags + cl_mem_flags CLMemObj::get_flags() { return flags; } + + // get xilinx memory extension pointer + cl_mem_ext_ptr_t* CLMemObj::get_xil_ext_ptr() { return bank; } +} diff --git a/tvm/src/template/sdaccel/CLMemObj.h b/tvm/src/template/sdaccel/CLMemObj.h new file mode 100644 index 000000000..30e564aff --- /dev/null +++ b/tvm/src/template/sdaccel/CLMemObj.h @@ -0,0 +1,57 @@ +/*===============================================================*/ +/* */ +/* CLMemObj.h */ +/* */ +/* Defines the object class for an OpenCL memory buffer */ +/* */ +/*===============================================================*/ + + +#ifndef __CLMemObj__Harness__ +#define __CLMemObj__Harness__ + +// standard header for command line output +#include +// opencl header +#include +// xilinx opencl extension header +#include + +namespace rosetta +{ + // wrapper class around cl_mem + class CLMemObj + { + + friend class CLWorld; + + public: + + // default constructor + CLMemObj (); + // a meaningful constructor + CLMemObj (void* mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext = nullptr); + + // get information about the buffer + void* get_data(); + int get_element_size(); + int get_length(); + cl_mem_flags get_flags(); + cl_mem_ext_ptr_t* get_xil_ext_ptr(); + + private: + + // pointer to data + void *mem_data; + // size of each element + int elt_size; + // number of elements + int length; + // OpenCL memory flag + cl_mem_flags flags; + // Xilinx extension describing bank assignment + cl_mem_ext_ptr_t* bank; + }; +} + +#endif /* defined(__CLMemObj__Harness__) */ diff --git a/tvm/src/template/sdaccel/CLWorld.cpp b/tvm/src/template/sdaccel/CLWorld.cpp new file mode 100644 index 000000000..7be386df2 --- /dev/null +++ b/tvm/src/template/sdaccel/CLWorld.cpp @@ -0,0 +1,401 @@ +/*===============================================================*/ +/* */ +/* CLWorld.cpp */ +/* */ +/* Implementation of the CLWorld class */ +/* */ +/*===============================================================*/ + +#include "CLWorld.h" + +namespace rosetta +{ + // default constructor + // make sure it does something meaningful + CLWorld::CLWorld() + { + // default: run on alpha data 7v3 board + this->target_device_name = "xilinx:adm-pcie-7v3:1ddr:3.0"; + this->device_type = CL_DEVICE_TYPE_ACCELERATOR; + + // configure the OpenCL runtime + createWorld(); + } + + // meaningful constructor + // user specifies device + CLWorld::CLWorld(std::string target_device_name, cl_device_type device_type) + { + this->target_device_name = target_device_name; + this->device_type = device_type; + createWorld(); + } + + // get the compute device + cl_device_id CLWorld::getDevice() + { + return this->device_id; + } + + // get context + cl_context CLWorld::getContext() + { + return this->context; + } + + // get compute program + cl_program CLWorld::getProgram() + { + return this->program; + } + + // insert a new memory object + int CLWorld::addMemObj(CLMemObj &new_mem_obj) + { + int err; + + printf("Adding memory object into the world ... 
"); + + // first push the CLMemObj object into our vector + mem_objs.push_back(new_mem_obj); + + // then create the actual cl_mem buffer, push it into another vector + cl_mem buf; + + buf = clCreateBuffer(context, new_mem_obj.flags, new_mem_obj.elt_size * new_mem_obj.length, new_mem_obj.bank, &err); + if (err != CL_SUCCESS) + { + printf("Error creating buffer for memory object %d!\n", mem_objs.size()-1); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + cl_mem_buffers.push_back(buf); + + // write the buffer onto the device if needed + if ((new_mem_obj.flags != CL_MEM_WRITE_ONLY) && (new_mem_obj.mem_data != nullptr)) + { + err = clEnqueueWriteBuffer(cmd_queue, buf, true, 0, new_mem_obj.elt_size * new_mem_obj.length, + new_mem_obj.mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error writing buffer %d onto the device!\n", mem_objs.size()-1); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + + printf("Done!\n"); + + return (mem_objs.size() - 1); + } + + int CLWorld::updateMemObj(int mem_idx) + { + printf("Updating mem object %d ... ", mem_idx); + + // write the buffer onto the device if needed + if (mem_objs[mem_idx].flags != CL_MEM_WRITE_ONLY) + { + int err = clEnqueueWriteBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0, + mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, + mem_objs[mem_idx].mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error writing buffer %d onto the device!\n", mem_idx); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + else + printf("Buffer %d is write_only! Not updating it ... \n", mem_idx); + + return EXIT_SUCCESS; + } + + int CLWorld::readMemObj(int mem_idx) + { + printf("Reading mem object %d into host buffers ... ", mem_idx); + + int err = clEnqueueReadBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0, + mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, + mem_objs[mem_idx].mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error reading kernel buffer %d!\n", mem_idx); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + + // create compute program from a file + // return error code + int CLWorld::addProgram(std::string filename) + { + printf("Adding binary program into the world ... "); + + // load the file + size_t code_size = (size_t) load_file_to_memory(filename.c_str()); + + // start to compile + int err; + cl_int create_binary_status; + + // Create the compute program from the source buffer + program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *) &code_size, + (const unsigned char **) &kernel_code, &create_binary_status, &err); + if (!program) + { + printf("Error: Failed to create compute program!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + // Build the program executable + err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + size_t len; + char buffer[2048]; + + printf("Error: Failed to build program executable!\n"); + printf("Error Code %d\n", err); + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); + printf("%s\n", buffer); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // insert a kernel into the world + // return the position of the kernel in the vector + int CLWorld::addKernel(CLKernel &new_kernel) + { + printf("Adding kernel %s into the world ... 
", new_kernel.get_name().c_str()); + + kernels.push_back(new_kernel); + + printf("Done!\n"); + + return (kernels.size() - 1); + } + + // methods to set kernel arguments + // memory argument + int CLWorld::setMemKernelArg(int kernel_id, int pos, int arg_id) + { + printf("Set mem arg %d for kernel %d with mem object %d ... ", pos, kernel_id, arg_id); + + int err = kernels[kernel_id].set_mem_arg(pos, cl_mem_buffers[arg_id]); + if (err != CL_SUCCESS) + { + printf("Error setting kernel argument!\n"); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // run all kernels + // return error code + int CLWorld::runKernels(bool flush) + { + printf("Start kernel execution ... "); + + int err; + + // wait for previous write buffer tasks to finish + printf("Waiting for queue... \n"); + clFinish(cmd_queue); + + // enqueue all the kernels + // temporarily we assume kernels won't have any dependency between them + // or the dependency is handled inside kernels (such as pipes, etc. ) + for (int i = 0; i < kernels.size(); i ++ ) + { + printf("Start kernel %d!\n", i); + err = clEnqueueNDRangeKernel(cmd_queue, kernels[i].kernel, 3, NULL, kernels[i].global_size, kernels[i].local_size, + 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error enqueuing kernel %d!\n", i); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + + // wait for them to finish + printf("Waiting for kernels ... \n"); + clFinish(cmd_queue); + + // remove all of them from the vector + // so that this function can be called multiple times + // at a cost that kernels won't be released automatically + if (flush) + { + int total_size = kernels.size(); + for (int i = 0; i < total_size; i ++ ) + kernels.pop_back(); + } + + printf("Done!\n"); + + return err; + } + + // create runtime environment + int CLWorld::createWorld() + { + printf("Initializing OpenCL runtime environment ... 
"); + + int err; + + // scan the machine for available OpenCL platforms + cl_uint platform_cnt; + cl_platform_id platforms[16]; + err = clGetPlatformIDs(16, platforms, &platform_cnt); + if (err != CL_SUCCESS) + { + printf("Error: Failed to find an OpenCL platform!\n"); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + printf("INFO: Found %d platforms\n", platform_cnt); + + + // find the target device + char device_name[1024]; + cl_device_id devices[16]; + cl_uint device_cnt; + bool found_device = false; + // scan all platforms + for (int p = 0; (p < platform_cnt) & (!found_device); p ++ ) + { + err = clGetDeviceIDs(platforms[p], this->device_type, 16, devices, &device_cnt); + if (err != CL_SUCCESS) + { + printf("Error: Failed to create a device group for platform %d!\n", p); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + // iterate through all devices on the platform + for (int d = 0; (d < device_cnt) & (!found_device); d ++ ) + { + err = clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 1024, device_name, 0); + if (err != CL_SUCCESS) + { + printf("Error: Failed to get device name for device %d on platform %d!\n", d, p); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + + if (std::string(device_name) == this->target_device_name) + { + this->platform = platforms[p]; + this->device_id = devices[d]; + found_device = true; + printf("Selected device %d on platform %d as target device!\n", d, p); + } + } + } + + if (!found_device) + { + printf("Error: Target device %s is not found!\n", (this->target_device_name).c_str()); + exit(EXIT_FAILURE); + } + + // create context and command queue + this->context = clCreateContext(0, 1, &(this->device_id), 0, 0, &err); + if (!(this->context)) + { + printf("Error: Failed to create a compute context!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + this->cmd_queue = clCreateCommandQueue(this->context, this->device_id, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + &err); + if (!(this->cmd_queue)) + { + printf("Error: Failed to create a command queue!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // read kernel binary file into memory + int CLWorld::load_file_to_memory(const char *filename) + { + int size = 0; + FILE *f = fopen(filename, "rb"); + if (f == NULL) + { + kernel_code = NULL; + printf("Can not open kernel file!\n"); + exit(-1); + } + fseek(f, 0, SEEK_END); + size = ftell(f); + printf("Size of the file is %ld\n", size); + fseek(f, 0, SEEK_SET); + kernel_code = new char[size+1]; + if ((unsigned int) size != fread(kernel_code, sizeof(char), size, f)) + { + delete []kernel_code; + printf("Reading kernel failed!\n"); + exit(-2); + } + fclose(f); + (kernel_code)[size] = 0; + return size; + } + + + // release all runtime constructs + void CLWorld::releaseWorld() + { + printf("Cleaning up OpenCL opjects ... 
"); + + // release memory objects + for (int i = 0; i < cl_mem_buffers.size(); i ++ ) + clReleaseMemObject(cl_mem_buffers[i]); + + // release program + delete []kernel_code; + clReleaseProgram(program); + + // release kernels + for (int i = 0; i < kernels.size(); i ++ ) + kernels[i].releaseKernel(); + + // release device and context + clReleaseCommandQueue(cmd_queue); + clReleaseContext(context); + + printf("Done!\n"); + } + +} + + + + diff --git a/tvm/src/template/sdaccel/CLWorld.h b/tvm/src/template/sdaccel/CLWorld.h new file mode 100644 index 000000000..9624687aa --- /dev/null +++ b/tvm/src/template/sdaccel/CLWorld.h @@ -0,0 +1,129 @@ +/*===============================================================*/ +/* */ +/* CLWorld.h */ +/* */ +/* Defines the object class for OpenCL context */ +/* */ +/*===============================================================*/ + + +#ifndef __CLWorld__Harness__ +#define __CLWorld__Harness__ + +// standard headers +#include +#include +#include +// opencl header +#include +// CLKernel and CLMemObj are members of this class +#include "CLKernel.h" +#include "CLMemObj.h" + +namespace rosetta +{ + + class CLWorld + { + + public: + + // default constructor + CLWorld(); + + // meaningful constructor + CLWorld(std::string target_device_name, cl_device_type device_type); + + // get the compute device associated with this world + cl_device_id getDevice(); + + // get the compute context associated with this world + cl_context getContext(); + + // get the binary program + cl_program getProgram(); + + // insert a compute program + int addProgram(std::string filename); + + // insert a kernel + int addKernel(CLKernel &new_kernel); + + // insert a memory object + int addMemObj(CLMemObj &new_mem_obj); + + // update a memory object (write new value) + int updateMemObj(int mem_id); + + // read a memory object + int readMemObj(int mem_id); + + // set memory kernel argument + int setMemKernelArg(int kernel_id, int pos, int mem_id); + + // set constant kernel argument + template + int setConstKernelArg(int kernel_id, int pos, T& arg) + { + // printf("%lu\n", arg); + printf("Set const arg %d for kernel %d ... 
", pos, kernel_id); + + int err = kernels[kernel_id].set_const_arg(pos, arg); + if (err != CL_SUCCESS) + { + printf("Error setting kernel argument!\n"); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // run kernels + int runKernels(bool flush = false); + + // clean up + void releaseWorld(); + + private: + + // OpenCL runtime variables + + // the platform we will use + cl_platform_id platform; + + // the device we will use + std::string target_device_name; // device name + cl_device_type device_type; // device type + cl_device_id device_id; // device id + + // compute context + cl_context context; + + // command queue + cl_command_queue cmd_queue; + + // binary program for the device + char* kernel_code; + cl_program program; + + // kernels + std::vector kernels; + + // memory objects + std::vector mem_objs; + // actual OpenCL memory buffers + std::vector cl_mem_buffers; + + // function to create the OpenCL runtime + int createWorld(); + + // load binary file into memory + int load_file_to_memory(const char *filename); + }; + +} + +#endif diff --git a/tvm/src/template/sdaccel/Makefile b/tvm/src/template/sdaccel/Makefile new file mode 100644 index 000000000..282f67921 --- /dev/null +++ b/tvm/src/template/sdaccel/Makefile @@ -0,0 +1,33 @@ +# Set kernel name +KERNEL_NAME = App + +# Set host source and headers +# HOST_SRC_CPP = ./src/host/digit_recognition.cpp ./src/host/utils.cpp ./src/host/check_result.cpp +HOST_SRC_CPP = host.cpp utils.cpp +# HOST_SRC_H = ./src/host/utils.h ./src/host/check_result.h ./src/host/typedefs.h ./src/host/testing_data.h \ + ./src/host/training_data.h +HOST_SRC_H = utils.h +# DATA = ./data/*.dat + + +# Set host code include paths +HOST_INC = -I/opt/Xilinx/Vivado/2018.2.op2258646/include/ +HOST_LIB = -L/opt/Xilinx/Vivado/2018.2.op2258646/lib/ + +# Set kernel file +OCL_KERNEL_SRC = interface.cpp +# OCL_KERNEL_H = ./src/host/typedefs.h +# SDSOC_KERNEL_SRC = ./src/sdsoc/digitrec.cpp +# SDSOC_KERNEL_H = ./src/host/typedefs.h +# SW_KERNEL_SRC = ./src/sw/digitrec_sw.cpp +# SW_KERNEL_H = ./src/host/typedefs.h ./src/sw/digitrec_sw.h + +# Set opencl kernel arguments +# log: removed --report system +OCL_KERNEL_ARGS = --max_memory_ports all + +#------------------------- +# Leave the rest to harness +#------------------------- +include harness.mk + diff --git a/tvm/src/template/sdaccel/harness.mk b/tvm/src/template/sdaccel/harness.mk new file mode 100644 index 000000000..23856f9c7 --- /dev/null +++ b/tvm/src/template/sdaccel/harness.mk @@ -0,0 +1,196 @@ +# ======================================== Check Xilinx SDX Environment Settings ================================================== # +ifndef XILINX_SDX + $(error Environment variable XILINX_SDX is required and should point to SDx install area) +endif + +# =============================================== Tools Used in Rosetta =========================================================== # + +# sdaccel tools +OCL_CXX = xcpp +XOCC = xocc + +# sdsoc tools +SDSXX = sds++ + +# default sw compiler +SW_CXX = g++ + +# ============================================= SDAccel Platform and Target Settings ============================================== # + +# Set Default OpenCL device and platform +USR_PLATFORM = n +OCL_DEVICE = xilinx:adm-pcie-7v3:1ddr:3.0 +OCL_PLATFORM = one_of_default_platforms + +# Check if the user specified opencl platform +ifneq ($(OCL_PLATFORM), one_of_default_platforms) + USR_PLATFORM=y +endif + +# Check OCL_TARGET value +OCL_TARGET = sw_emu +ifeq 
diff --git a/tvm/src/template/sdaccel/Makefile b/tvm/src/template/sdaccel/Makefile
new file mode 100644
index 000000000..282f67921
--- /dev/null
+++ b/tvm/src/template/sdaccel/Makefile
@@ -0,0 +1,33 @@
+# Set kernel name
+KERNEL_NAME = App
+
+# Set host source and headers
+# HOST_SRC_CPP = ./src/host/digit_recognition.cpp ./src/host/utils.cpp ./src/host/check_result.cpp
+HOST_SRC_CPP = host.cpp utils.cpp
+# HOST_SRC_H = ./src/host/utils.h ./src/host/check_result.h ./src/host/typedefs.h ./src/host/testing_data.h \
+              ./src/host/training_data.h
+HOST_SRC_H = utils.h
+# DATA = ./data/*.dat
+
+
+# Set host code include paths
+HOST_INC = -I/opt/Xilinx/Vivado/2018.2.op2258646/include/
+HOST_LIB = -L/opt/Xilinx/Vivado/2018.2.op2258646/lib/
+
+# Set kernel file
+OCL_KERNEL_SRC = interface.cpp
+# OCL_KERNEL_H = ./src/host/typedefs.h
+# SDSOC_KERNEL_SRC = ./src/sdsoc/digitrec.cpp
+# SDSOC_KERNEL_H = ./src/host/typedefs.h
+# SW_KERNEL_SRC = ./src/sw/digitrec_sw.cpp
+# SW_KERNEL_H = ./src/host/typedefs.h ./src/sw/digitrec_sw.h
+
+# Set opencl kernel arguments
+# log: removed --report system
+OCL_KERNEL_ARGS = --max_memory_ports all
+
+#-------------------------
+# Leave the rest to harness
+#-------------------------
+include harness.mk
+
diff --git a/tvm/src/template/sdaccel/harness.mk b/tvm/src/template/sdaccel/harness.mk
new file mode 100644
index 000000000..23856f9c7
--- /dev/null
+++ b/tvm/src/template/sdaccel/harness.mk
@@ -0,0 +1,196 @@
+# ======================================== Check Xilinx SDX Environment Settings ================================================== #
+ifndef XILINX_SDX
+    $(error Environment variable XILINX_SDX is required and should point to SDx install area)
+endif
+
+# =============================================== Tools Used in Rosetta =========================================================== #
+
+# sdaccel tools
+OCL_CXX = xcpp
+XOCC    = xocc
+
+# sdsoc tools
+SDSXX = sds++
+
+# default sw compiler
+SW_CXX = g++
+
+# ============================================= SDAccel Platform and Target Settings ============================================== #
+
+# Set default OpenCL device and platform
+USR_PLATFORM = n
+OCL_DEVICE   = xilinx:adm-pcie-7v3:1ddr:3.0
+OCL_PLATFORM = one_of_default_platforms
+
+# Check if the user specified an opencl platform
+ifneq ($(OCL_PLATFORM), one_of_default_platforms)
+    USR_PLATFORM = y
+endif
+
+# Check OCL_TARGET value
+OCL_TARGET = sw_emu
+ifeq ($(OCL_TARGET),sw_emu)
+else ifeq ($(OCL_TARGET),hw_emu)
+else ifeq ($(OCL_TARGET),hw)
+else
+    $(error "Unsupported OCL_TARGET value: $(OCL_TARGET). Supported values are: sw_emu, hw_emu, hw")
+endif
+
+# Check opencl kernel file type
+OCL_KERNEL_TYPE = ocl
+
+ifeq ($(suffix $(OCL_KERNEL_SRC)),.cl)
+    OCL_KERNEL_TYPE = ocl
+else
+    OCL_KERNEL_TYPE = c
+endif
+
+# OpenCL runtime libraries
+OPENCL_INC = $(XILINX_SDX)/runtime/include/1_2
+OPENCL_LIB = $(XILINX_SDX)/runtime/lib/x86_64
+
+# opencl harness files
+OCL_HARNESS_DIR = .
+OCL_HARNESS_SRC_CPP = $(OCL_HARNESS_DIR)/CLKernel.cpp $(OCL_HARNESS_DIR)/CLMemObj.cpp $(OCL_HARNESS_DIR)/CLWorld.cpp
+OCL_HARNESS_SRC_H   = $(OCL_HARNESS_DIR)/CLKernel.h $(OCL_HARNESS_DIR)/CLMemObj.h $(OCL_HARNESS_DIR)/CLWorld.h
+
+# host compilation flags
+OCL_HOST_FLAGS = -DOCL -g -lxilinxopencl -I$(OPENCL_INC) $(HOST_INC) -L$(OPENCL_LIB) $(HOST_LIB) -I$(OCL_HARNESS_DIR) -I$(APPLICATION_DIR)
+
+# xclbin compilation flags
+XCLBIN_FLAGS = -s -t $(OCL_TARGET) -g
+
+# change OCL_HOST_FLAGS
+ifdef K_CONST
+    OCL_HOST_FLAGS += -DK_CONST=$(K_CONST)
+endif
+ifdef NUM_ITER
+    OCL_HOST_FLAGS += -DNUM_ITER=$(NUM_ITER)
+endif
+ifdef FIXED_FLAG
+    OCL_HOST_FLAGS += -DFIXED_TYPE
+endif
+
+
+ifneq ($(OCL_KERNEL_TYPE),ocl)
+    XCLBIN_FLAGS += --kernel $(KERNEL_NAME)
+endif
+
+ifeq ($(USR_PLATFORM),n)
+    XCLBIN_FLAGS += --xdevice $(OCL_DEVICE)
+else
+    XCLBIN_FLAGS += --platform $(OCL_PLATFORM)
+endif
+
+
+# change XCLBIN_FLAGS
+ifdef K_CONST
+    XCLBIN_FLAGS += -DK_CONST=$(K_CONST)
+endif
+ifdef NUM_ITER
+    XCLBIN_FLAGS += -DNUM_ITER=$(NUM_ITER)
+endif
+ifdef FIXED_FLAG
+    XCLBIN_FLAGS += -DFIXED_TYPE
+endif
+
+
+XCLBIN_FLAGS += $(OCL_KERNEL_ARGS)
+
+
+# host exe
+OCL_HOST_EXE = $(KERNEL_NAME)_host.exe
+
+# Kernel XCLBIN file
+XCLBIN = $(KERNEL_NAME).$(OCL_TARGET).xclbin
+XO     = $(KERNEL_NAME).$(OCL_TARGET).xo
+
+# =============================================== SDSoC Platform and Target Settings ============================================== #
+
+# platform
+SDSOC_PLATFORM = zc706
+
+# executable
+SDSOC_EXE = $(KERNEL_NAME).elf
+
+# sds++ flags
+SDSFLAGS  = -sds-pf $(SDSOC_PLATFORM) -sds-hw $(KERNEL_NAME) $(SDSOC_KERNEL_SRC) -sds-end -clkid 3 \
+            -poll-mode 1 -verbose
+SDSCFLAGS += -DSDSOC -Wall -O3 -c
+SDSCFLAGS += -MMD -MP -MF"$(@:%.o=%.d)"
+SDSLFLAGS  = -O3
+
+# objects
+ALL_SDSOC_SRC = $(HOST_SRC_CPP) $(SDSOC_KERNEL_SRC)
+OBJECTS := $(ALL_SDSOC_SRC:.cpp=.o)
+DEPS    := $(OBJECTS:.o=.d)
+
+# =============================================== Pure Software Compilation Settings ============================================== #
+
+# compiler flags
+SW_FLAGS = -DSW -O3
+
+# sw executable
+SW_EXE = $(KERNEL_NAME)_sw.exe
+
+# ========================================================= Rules ================================================================= #
+
+# we will have 4 top-level rules: ocl, sdsoc, sw and clean
+# default to sw
+
+.PHONY: all ocl sdsoc sw clean
+
+all: sw
+
+# ocl rules
+ocl: $(OCL_HOST_EXE) $(XCLBIN)
+
+# ocl secondary rule: host executable
+$(OCL_HOST_EXE): $(HOST_SRC_CPP) $(HOST_SRC_H) $(OCL_HARNESS_SRC_CPP) $(OCL_HARNESS_SRC_H) $(DATA)
+	$(OCL_CXX) $(OCL_HOST_FLAGS) -o $@ $(HOST_SRC_CPP) $(OCL_HARNESS_SRC_CPP)
+
+# ocl secondary rule: xclbin
+$(XCLBIN): $(XO)
+	$(XOCC) -l $(XCLBIN_FLAGS) -o $@ $(XO)
+
+# ocl secondary rule: xo
+$(XO): $(OCL_KERNEL_SRC) $(OCL_KERNEL_H)
+	$(XOCC) -c $(XCLBIN_FLAGS) -o $@ $(OCL_KERNEL_SRC)
+
+# sdsoc rules
+sdsoc: $(SDSOC_EXE)
+
+$(SDSOC_EXE): $(OBJECTS)
+	$(SDSXX) $(SDSFLAGS) $(SDSLFLAGS) ${OBJECTS} -o $@
+
+-include $(DEPS)
+
+%.o: %.cpp
+	$(SDSXX) $(SDSFLAGS) $(SDSCFLAGS) $< -o $@
+
+
+# software rules
+sw: $(HOST_SRC_CPP) $(HOST_SRC_H) $(SW_KERNEL_SRC) $(SW_KERNEL_H) $(DATA)
+	$(SW_CXX) $(SW_FLAGS) -o $(SW_EXE) $(HOST_SRC_CPP) $(SW_KERNEL_SRC)
+
+# cleanup
+clean:
+	@echo "Cleaning old files"
+	rm -rf *.exe
+	rm -rf *.elf
+	rm -rf *.xclbin
+	rm -rf *.bit
+	rm -rf *.rpt
+	rm -rf system_estimate.xtxt
+	rm -rf _xocc*
+	rm -rf _sds
+	rm -rf sd_card
+	rm -rf .Xil
+	rm -rf ./src/host/*.d
+	rm -rf ./src/sdsoc/*.o
+	rm -rf ./src/sdsoc/*.d
+	rm -rf ./src/host/*.o
+	rm -rf *.dat
+	rm -rf *.html
+	rm -rf *.csv
+	rm -rf *.json
diff --git a/tvm/src/template/sdaccel/run.tcl b/tvm/src/template/sdaccel/run.tcl
new file mode 100644
index 000000000..0d6dca4b5
--- /dev/null
+++ b/tvm/src/template/sdaccel/run.tcl
@@ -0,0 +1,14 @@
+set hls_prj digitrec.prj
+open_project ${hls_prj} -reset
+set_top default_function
+add_files -tb main.cpp
+add_files -tb data
+
+open_solution "solution1"
+set_part {xc7z020clg484-1}
+create_clock -period 10
+
+csim_design -O
+csynth_design
+#cosim_design
+exit
diff --git a/tvm/src/template/sdaccel/run_hw.sh b/tvm/src/template/sdaccel/run_hw.sh
new file mode 100755
index 000000000..f65d28e6d
--- /dev/null
+++ b/tvm/src/template/sdaccel/run_hw.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#===============================================================#
+#                                                               #
+#                        run_hw.sh                              #
+#                                                               #
+#     A bash script to synthesize and generate bitstream        #
+#                                                               #
+#                                                               #
+#===============================================================#
+
+make clean
+
+# the k value of KNN, default is 3
+k_value=3
+# the directory of this lab
+app_dir=`pwd`
+
+### COMPILATION
+# create some blank-line space for easy readability
+echo ""; echo ""; echo "" ; echo ""
+echo "####################################################"
+echo " Synthesize and Generate Bitstream with K_CONST=$k_value"
+echo "####################################################"
+make ocl OCL_TARGET=hw OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value
+#export XCL_EMULATION_MODE=hw_emu
+#./DigitRec_host.exe -f DigitRec.hw_emu.xclbin
+
diff --git a/tvm/src/template/sdaccel/run_sw.sh b/tvm/src/template/sdaccel/run_sw.sh
new file mode 100755
index 000000000..80ba00495
--- /dev/null
+++ b/tvm/src/template/sdaccel/run_sw.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#===============================================================#
+#                                                               #
+#                        run_sw.sh                              #
+#                                                               #
+#        A bash script to run the software emulation            #
+#                                                               #
+#                                                               #
+#===============================================================#
+
+make clean
+
+# check env variable setup
+if [ -z "$AWS_PLATFORM" ]; then
+  echo "AWS_PLATFORM not set up; use default"
+  export AWS_PLATFORM=xilinx:adm-pcie-7v3:1ddr:3.0
+fi
+
+# set up emulation configuration
+echo "#################################################"
+echo " Setting emulation configuration..."
+echo "#################################################" +export LC_CTYPE=en_US.UTF-8 +export LC_ALL=en_US.UTF-8 +export XCL_EMULATION_MODE=true +emconfigutil --platform=$AWS_PLATFORM + +# the k value of KNN, default is 3 +k_value=3 +# the directory of this lab +app_dir=`pwd` + +### COMPILATION +# create some blank-line space for easy readability +echo ""; echo ""; echo "" ; echo "" +echo "####################################################" +echo " Compiling project with K_CONST=$k_value" +echo "####################################################" +make ocl OCL_TARGET=sw_emu OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value + + +### EXECUTION +echo ""; echo ""; echo "" ; echo "" +echo "####################################################" +echo " Executing DigitRec with K_CONST=$k_value" +echo "####################################################" +export XCL_EMULATION_MODE=sw_emu +#export XCL_EMULATION_MODE=hw_emu +./App_host.exe -f App.sw_emu.xclbin + diff --git a/tvm/src/template/sdaccel/utils.cpp b/tvm/src/template/sdaccel/utils.cpp new file mode 100644 index 000000000..0e6dd632e --- /dev/null +++ b/tvm/src/template/sdaccel/utils.cpp @@ -0,0 +1,46 @@ +/*===============================================================*/ +/* */ +/* utils.cpp */ +/* */ +/* Utility functions */ +/* */ +/*===============================================================*/ + +#include +#include +#include +#include + +#include "utils.h" + +void print_usage(char* filename) +{ + printf("usage: %s \n", filename); + printf(" -f [kernel file]\n"); +} + +void parse_sdaccel_command_line_args( + int argc, + char** argv, + std::string& kernelFile) +{ + + int c = 0; + + while ((c = getopt(argc, argv, "f:")) != -1) + { + switch (c) + { + case 'f': + kernelFile = optarg; + break; + default: + { + print_usage(argv[0]); + exit(-1); + } + } // matching on arguments + } // while args present +} + + diff --git a/tvm/src/template/sdaccel/utils.h b/tvm/src/template/sdaccel/utils.h new file mode 100644 index 000000000..a3ab77437 --- /dev/null +++ b/tvm/src/template/sdaccel/utils.h @@ -0,0 +1,19 @@ +/*===============================================================*/ +/* */ +/* utils.h */ +/* */ +/* Utility functions */ +/* */ +/*===============================================================*/ + +#include +//target device +const std::string TARGET_DEVICE = "xilinx_aws-vu9p-f1-04261818_dynamic_5_0"; + +void print_usage(char* filename); + +void parse_sdaccel_command_line_args( + int argc, + char** argv, + std::string& kernelFile); + diff --git a/tvm/src/template/vivado/Makefile b/tvm/src/template/vivado/Makefile new file mode 100644 index 000000000..1d84baead --- /dev/null +++ b/tvm/src/template/vivado/Makefile @@ -0,0 +1,31 @@ +#========================================================================== +# Makefile +#========================================================================== +# @brief: A makefile the compiles and synthesizes the program +# +# @desc: 1. "make" runs csim by default +# 2. "make csim" compiles & executes the fixed-point implementation +# 3. "make clean" cleans up the directory + + +# Extract Vivado HLS include path +VHLS_PATH := $(dir $(shell which vivado_hls))/.. +VHLS_INC ?= ${VHLS_PATH}/include + +CFLAGS = -g -I${VHLS_INC} + +all: csim + +csim: host.cpp + @echo "Compiling & simulating on amdpool ..." 
diff --git a/tvm/src/template/vivado/Makefile b/tvm/src/template/vivado/Makefile
new file mode 100644
index 000000000..1d84baead
--- /dev/null
+++ b/tvm/src/template/vivado/Makefile
@@ -0,0 +1,31 @@
+#==========================================================================
+# Makefile
+#==========================================================================
+# @brief: A makefile that compiles and synthesizes the program
+#
+# @desc: 1. "make" runs csim by default
+#        2. "make csim" compiles & executes the fixed-point implementation
+#        3. "make clean" cleans up the directory
+
+
+# Extract Vivado HLS include path
+VHLS_PATH := $(dir $(shell which vivado_hls))/..
+VHLS_INC ?= ${VHLS_PATH}/include
+
+CFLAGS = -g -I${VHLS_INC}
+
+all: csim
+
+csim: host.cpp
+	@echo "Compiling & simulating on amdpool ..."
+	g++ ${CFLAGS} $^ -o out -lrt
+	./out
+
+vivado:
+	@echo "Run Vivado csim and HLS"
+	vivado_hls -f run.tcl
+
+clean:
+	rm -rf out *.txt *.dat *.prj *.log
+	rm -rf zedboard_project* xillydemo.bit
+
diff --git a/tvm/src/template/vivado/run.tcl b/tvm/src/template/vivado/run.tcl
new file mode 100644
index 000000000..d80b865df
--- /dev/null
+++ b/tvm/src/template/vivado/run.tcl
@@ -0,0 +1,36 @@
+#=============================================================================
+# run.tcl
+#=============================================================================
+# @brief: A Tcl script for synthesizing the design.
+
+# Project name
+set hls_prj out.prj
+
+# Open/reset the project
+open_project ${hls_prj} -reset
+
+# Top function of the design is "top"
+set_top top
+
+# Add design and testbench files
+add_files kernel.cpp
+add_files -tb host.cpp
+
+open_solution "solution1"
+# Use Zynq device
+set_part {xc7z020clg484-1}
+
+# Target clock period is 10ns
+create_clock -period 10
+
+# Directives
+
+############################################
+
+# Simulate the C++ design
+csim_design -O
+# Synthesize the design
+csynth_design
+# Co-simulate the design
+#cosim_design
+exit
diff --git a/tvm/src/template/vivado/timer.h b/tvm/src/template/vivado/timer.h
new file mode 100644
index 000000000..77c461b00
--- /dev/null
+++ b/tvm/src/template/vivado/timer.h
@@ -0,0 +1,94 @@
+//---------------------------------------------------------
+// Timer.h
+//---------------------------------------------------------
+#ifndef __TIMER_H__
+#define __TIMER_H__
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#define TIMER_ON
+
+//---------------------------------------------------------
+// Timer is an object which helps profile programs using
+// the gettimeofday() function.
+//  - By default, a timer is stopped when you instantiate it
+//    and must be started manually
+//  - Passing true to the constructor starts the timer when
+//    it is constructed
+//  - When the timer is destructed it prints stats to stdout
+//---------------------------------------------------------
+class Timer {
+
+  #ifdef TIMER_ON
+
+  char binName[50];
+  unsigned nCalls;
+  timeval ts_start;
+  float totalTime;
+
+  public:
+  //------------------------------------------------------------------
+  // constructor
+  //------------------------------------------------------------------
+  Timer (const char* Name="", bool On=false) {
+    if (On) {
+      // record the start time
+      gettimeofday(&ts_start, NULL);
+      nCalls = 1;
+    }
+    else {
+      nCalls = 0;
+    }
+    totalTime = 0;
+    strcpy(binName, Name);
+  }
+
+  //------------------------------------------------------------------
+  // destructor
+  //------------------------------------------------------------------
+  ~Timer () {
+    // on being destroyed, print the total time
+    if (nCalls > 0) {
+      printf ("%-20s: ", binName);
+      printf ("%6d calls; ", nCalls);
+      printf ("%7.3f msecs total time\n", 1000*totalTime);
+      //printf ("%7.4f msecs average time;\n", 1000*totalTime/nCalls);
+    }
+  }
+
+  //------------------------------------------------------------------
+  // start timer
+  //------------------------------------------------------------------
+  void start() {
+    // record start time
+    gettimeofday(&ts_start, NULL);
+    nCalls++;
+  }
+
+  //------------------------------------------------------------------
+  // stop timer
+  //------------------------------------------------------------------
+  void stop() {
+    // get current time, add elapsed time to totalTime
+    timeval ts_curr;
+    gettimeofday(&ts_curr, NULL);
+    totalTime += float(ts_curr.tv_sec - ts_start.tv_sec) +
+                 float(ts_curr.tv_usec)*1e-6 - float(ts_start.tv_usec)*1e-6;
+  }
+
+  #else
+
+  //--------------------------------------------------------------------
+  // all methods do nothing if TIMER_ON is not set
+  //--------------------------------------------------------------------
+  public:
+  Timer (const char* Name="", bool On=false) {}
+  void start() {}
+  void stop() {}
+
+  #endif
+};
+
+#endif
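Since the destructor reports the accumulated totals, the timer is naturally scope-based. A short usage sketch (the bin name and loop body are illustrative):

```cpp
#include "timer.h"

void run_workload() {
  Timer t("kernel_exec");    // stopped by default; named for the report
  for (int i = 0; i < 10; i++) {
    t.start();
    // ... the work being profiled ...
    t.stop();
  }
}  // ~Timer prints: "kernel_exec        :     10 calls;  ... msecs total time"
```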