diff --git a/.circleci/config.yml b/.circleci/config.yml index ba119097d..4fa87539c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,7 @@ test: &test key: v1.03-libhcl- - run: make build-python - run: pip install --user pytest + - run: pip install --user future - run: python -m pytest tests - run: pip install --user mxnet - run: python -m pytest samples diff --git a/.gitignore b/.gitignore index a70651d15..65f3dfcf8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,8 +16,6 @@ tags docs/source/samples docs/source/tutorials soda_* -*.cpp -*.h out # Downloaded files diff --git a/HISTORY b/HISTORY new file mode 100644 index 000000000..e08d564bc --- /dev/null +++ b/HISTORY @@ -0,0 +1,11 @@ +### 2019-12-09 + * fixed zc706 simulation issue + * removed kernel-name variable allocation before KernelDef + * changed multi-dimension array access to row-major single-dimension access + * created a local buffer for each on-device variable + * updated the `KernelUpdater` class (using position index instead of name) + * added `stream_arg_pos` map in `CodeGenC` to facilitate codegen with streaming + * fixed test cases + * changed tvm `build` function to support legacy string type target + * fixed opencl aocl data type mismatching issue + * fixed kernel def data type conversion issue diff --git a/Makefile b/Makefile index 88c653d77..9508b9171 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,15 @@ build-tvm: build-pkgs build-hcl: build-tvm cd python; \ - python setup.py install --user; \ + python setup.py develop --user; \ cd ../hlib/python; \ - python setup.py install --user; + python setup.py develop --user; build-python: cd python; \ - python setup.py install --user; \ + python setup.py develop --user; \ cd ../hlib/python; \ - python setup.py install --user; + python setup.py develop --user; clean: rm -rf build diff --git a/Makefile.config b/Makefile.config index 2060d201c..60d1cfd3e 100644 --- a/Makefile.config +++ b/Makefile.config @@ -12,6 +12,9 @@ CMAKE_OK = no # set whether to use vivado hls runtime USE_VIVADO_HLS = 1 +# set whether to use sdaccel opencl runtime +USE_SDACCEL_HLS = 1 + # Specify current directory level with respect to CLAY_ROOT ifndef LEVEL LEVEL := .
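The HISTORY note on row-major single-dimension access refers to flattening an N-D index into one linear offset. A minimal Python sketch of that mapping (the helper name is illustrative, not part of this patch):

```python
import numpy as np

# Row-major flattening: element [n][c][h][w] of an (N, C, H, W) tensor
# lives at a single offset into the underlying 1-D buffer.
def flatten_index(n, c, h, w, C, H, W):
    return ((n * C + c) * H + h) * W + w

# Cross-check against numpy's own row-major ("C order") layout.
a = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
assert a[1, 2, 3, 4] == a.ravel()[flatten_index(1, 2, 3, 4, 3, 4, 5)]
```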
diff --git a/hlib/python/hlib/nn.py b/hlib/python/hlib/nn.py index c8fa146a8..8f1c4d0e8 100644 --- a/hlib/python/hlib/nn.py +++ b/hlib/python/hlib/nn.py @@ -32,6 +32,17 @@ def _pad(*indices): return data[tuple(index_tuple)] return hcl.compute(out_shape, _pad, name='pad') +def conv2d_nchw_imp(Input, Filter, Output, stride=[1,1], padding=[[0,0],[0,0]]): + with hcl.for_(0,Output.shape[0]) as n: + with hcl.for_(0,Output.shape[1]) as c: + with hcl.for_(0,Output.shape[2]) as h: + with hcl.for_(0,Output.shape[3]) as w: + partial = hcl.scalar(0) + with hcl.for_(0,Filter.shape[-2]) as x: + with hcl.for_(0,Filter.shape[-1]) as y: + partial.v += Input[n][c][h+x][w+y] * Filter[0][0][x][y] + Output[n,c,h,w] = partial + def conv2d_nchw(Input, Filter, name="conv2d", stride=[1,1], padding=[[0,0],[0,0]]): out_dtype = Input.dtype batch, in_channel, in_height, in_width = Input.shape diff --git a/hlib/rocc-ppac b/hlib/rocc-ppac new file mode 160000 index 000000000..40d323d0c --- /dev/null +++ b/hlib/rocc-ppac @@ -0,0 +1 @@ +Subproject commit 40d323d0c81e2f64dbfb63afb5eb5d6ccf7c5e48 diff --git a/python/heterocl/__init__.py b/python/heterocl/__init__.py index 588196177..4b90160f0 100644 --- a/python/heterocl/__init__.py +++ b/python/heterocl/__init__.py @@ -3,6 +3,7 @@ from .compute_api import * from .dsl import * from .types import * +from .devices import * from .nparray import * from .debug import hcl_excepthook from .tvm.intrin import * diff --git a/python/heterocl/api.py b/python/heterocl/api.py index 4da52786f..f3e2151c8 100644 --- a/python/heterocl/api.py +++ b/python/heterocl/api.py @@ -53,7 +53,7 @@ def app2(A, B, C): # execute f2 """ # set the configurations - config.init_dtype = init_dtype + config.init_dtype = init_dtype # initialize global variables Schedule.stage_ops = [] Schedule.last_stages = OrderedSet([]) @@ -90,7 +90,7 @@ def placeholder(shape, name=None, dtype=None): """ name = util.get_name("placeholder", name) dtype = util.get_dtype(dtype) - + if shape == (): return Scalar(tvm_api._Var(name, dtype)) tensor = Tensor(shape, dtype, name) diff --git a/python/heterocl/debug.py b/python/heterocl/debug.py index cba313e23..a885d2e0b 100644 --- a/python/heterocl/debug.py +++ b/python/heterocl/debug.py @@ -45,6 +45,11 @@ class TensorError(HCLError): def __init__(self, msg): HCLError.__init__(self, msg, "\33[1;31m[Tensor]\33[0m ") +class DeviceError(HCLError): + """A subclass for specifying device related exception""" + def __init__(self, msg): + HCLError.__init__(self, msg, "\33[1;31m[Device]\33[0m ") + def hcl_excepthook(etype, value, tb): """Customized excepthook diff --git a/python/heterocl/devices.py b/python/heterocl/devices.py new file mode 100644 index 000000000..a5d81df86 --- /dev/null +++ b/python/heterocl/devices.py @@ -0,0 +1,278 @@ +"""Define HeteroCL device types""" +#pylint: disable=too-few-public-methods, too-many-return-statements +from .debug import DeviceError +from .tools import option_table, model_table +from future.utils import with_metaclass + +class tooling(type): + def __getattr__(cls, key): + if key in option_table: + return cls(key, *option_table[key]) + else: # unsupported device + raise DeviceError("not supported") + +class tool(with_metaclass(tooling, object)): + """The base class for all device tooling + + mode (sim/impl) is decided by tool configuration + e.g. 
run sw emulation by passing gcc / vivado_hls arg + and actual impl by passing sdaccel / aocl arg + + Parameters + ---------- + name: str + The name of the tool + mode: str + The execution mode of the tool (e.g. sim or impl) + """ + def __init__(self, name, mode, kwargs): + self.name = name + self.mode = mode + self.options = kwargs + + def __getattr__(self, entry): + return self.options[entry] + + def __call__(self, mode, setting={}): + self.mode = mode + self.options = setting + return self + + def __str__(self): + return str(self.name) + "-" + \ + str(self.mode) + ":\n" + \ + str(self.options) + + def __repr__(self): + return str(self.name) + "-" + \ + str(self.mode) + ":\n" + \ + str(self.options) + +tool_table = { + "aws_f1" : tool("sdaccel", *option_table["sdaccel"]), + "zc706" : tool("vivado_hls", *option_table["vivado_hls"]), + "ppac" : tool("rocket", *option_table["rocket"]), + "stratix10_sx": tool("aocl", *option_table["aocl"]), + "llvm" : tool("llvm", *option_table["llvm"]) +} + +class Device(object): + """The base class for all device types + + The default data placement is on CPU. + + Parameters + ---------- + types: str + The type of the device for data placement + model: str + The model of the device for data placement + """ + def __init__(self, types, vendor, + model, **kwargs): + self.vendor = vendor + self.types = types + self.model = model + self.impls = {"lang": ""} + for key, value in kwargs.items(): + self.impls[key] = value + + def __getattr__(self, key): + """ device hierarchy """ + return self.impls[key] + + def set_lang(self, lang): + assert lang in \ + ["opencl", "hlsc", "c", "opengl", "merlinc", "cuda", "metal"], \ + "unsupported lang spec " + lang + self.impls["lang"] = lang + return self + +class CPU(Device): + """cpu device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["riscv", "arm", "intel", "sparc", "powerpc"]: + raise DeviceError(vendor + " not supported yet") + assert "cpu_" + model in model_table[vendor], \ + model + " not supported yet" + super(CPU, self).__init__("CPU", vendor, model, **kwargs) + def __repr__(self): + return "cpu-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class FPGA(Device): + """fpga device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["xilinx", "intel"]: + raise DeviceError(vendor + " not supported yet") + assert "fpga_" + model in model_table[vendor], \ + model + " not supported yet" + super(FPGA, self).__init__("FPGA", vendor, model, **kwargs) + def __repr__(self): + return "fpga-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class GPU(Device): + """gpu device with different models""" + def __init__(self, vendor, model, **kwargs): + if vendor not in ["nvidia", "amd"]: + raise DeviceError(vendor + " not supported yet") + assert "gpu_" + model in model_table[vendor], \ + model + " not supported yet" + super(GPU, self).__init__("GPU", vendor, model, **kwargs) + def __repr__(self): + return "gpu-" + self.vendor + "-" + str(self.model) + \ + ":" + self.impls["lang"] + +class PIM(Device): + """pim device with different models""" + def __init__(self, vendor, model, **kwargs): + if model not in ["ppac"]: + raise DeviceError(model + " not supported yet") + super(PIM, self).__init__("PIM", vendor, model, **kwargs) + def __repr__(self): + return "pim-" + str(self.model) + +dev_table = { + "aws_f1" : [CPU("intel", "e5"), FPGA("xilinx", "xcvu19p")], + "zc706" : [CPU("arm", "a9"), FPGA("xilinx", "xc7z045")], + "rocc-ppac" :
[CPU("riscv", "riscv"), PIM("ppac", "ppac")], + "stratix10_sx": [CPU("arm", "a53"), FPGA("intel", "stratix10_gx")] +} + +class env(type): + """The platform class for compute environment setups + + serves as meta-class for attr getting + default platform: aws_f1, zynq, ppac + + Parameters + ---------- + host: str + Device of device to place data + model: str + Model of device to place date + """ + def __getattr__(cls, key): + if key == "aws_f1": + devs = dev_table[key] + host = devs[0].set_lang("opencl") + xcel = devs[1].set_lang("hlsc") + elif key == "zc706": + devs = dev_table[key] + host = devs[0].set_lang("hlsc") + xcel = devs[1].set_lang("hlsc") + elif key == "llvm": + devs = None + host = None + xcel = None + elif key == "ppac": + devs = dev_table["rocc-ppac"] + host = devs[0].set_lang("c") + xcel = None + else: # unsupported device + raise DeviceError("not supported") + tool = tool_table[key] + return cls(key, devs, host, xcel, tool) + +class platform(with_metaclass(env, object)): + def __init__(self, name, devs, host, xcel, tool): + self.name = name + self.devs = devs + self.host = host + self.xcel = xcel + self.tool = tool + + if isinstance(host, CPU): + self.cpu = host + if isinstance(xcel, FPGA): + self.fpga = xcel + elif isinstance(xcel, PIM) and \ + xcel.model == "ppac": + self.ppac = xcel + + def __getattr__(self, key): + """ return tool options """ + return self.tool.__getattr__(key) + + def __call__(self, tooling=None): + if tooling: # check and update + assert isinstance(tooling, tool) + self.tool = tooling + return self + + def __str__(self): + return str(self.name) + "(" + \ + str(self.host) + " : " + \ + str(self.xcel) + ")" + + def __repr__(self): + return str(self.name) + "(" + \ + str(self.host) + " : " + \ + str(self.xcel) + ")" + +def device_to_str(dtype): + """Convert a device type to string format. + + Parameters + ---------- + dtype : Device or str + The device type to be converted + + Returns + ------- + str + The converted device type in string format. + """ + if isinstance(dtype, Device): + if isinstance(dtype, CPU): + return "cpu_" + str(dtype.model) + elif isinstance(dtype, FPGA): + return "fpga_" + str(dtype.model) + else: + if not isinstance(dtype, str): + raise DeviceError("Unsupported device type format") + return dtype + +def device_to_hcl(dtype): + """Convert a device type to Heterocl type. + + Parameters + ---------- + dtype : Device or str + The device type to be converted + + Returns + ------- + Device + """ + if isinstance(dtype, Device): + return dtype + elif isinstance(dtype, str): + device, model = dtype.split("_") + if device == "cpu": + return CPU(model) + elif device == "gpu": + return GPU(model) + elif device == "fpga": + return FPGA(model) + else: + raise DeviceError("Unrecognized device type") + else: + raise DeviceError("Unrecognized device type format") + +def get_model(dtype): + """Get the model of a given device type. 
diff --git a/python/heterocl/dsl.py b/python/heterocl/dsl.py index 6d42031f1..b226cb0ab 100644 --- a/python/heterocl/dsl.py +++ b/python/heterocl/dsl.py @@ -405,6 +405,7 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n raise APIError("The number of data types does not match the of arguments") for (name_, dtype_) in zip(new_names, dtypes): dtypes.append(util.get_dtype(dtype_, name_)) + dtypes = dtypes[int(len(dtypes)/2):] else: dtype = util.get_dtype(dtypes) dtypes = [] @@ -414,15 +415,20 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n # prepare inputs for IR generation inputs = [] inputs_tvm = [] + arg_shapes, arg_dtypes = [], [] for shape, name_, dtype in zip(shapes, new_names, dtypes): if shape == (): var_ = placeholder((), name_, dtype) inputs.append(var_) inputs_tvm.append(var_.var) - else: + arg_shapes.append([1]) + arg_dtypes.append(dtype) + else: # tensor inputs (new bufs) placeholder_ = placeholder(shape, name_, dtype) inputs.append(placeholder_) inputs_tvm.append(placeholder_.buf.data) + arg_shapes.append(list(shape)) + arg_dtypes.append(dtype) s.ret_dtype = ret_dtype fmodule(*inputs) @@ -435,7 +441,8 @@ def decorator(fmodule, shapes=shapes, dtypes=dtypes, ret_dtype=ret_dtype, name=n ret_void = _make.UIntImm("uint1", 0) if s.has_return else _make.UIntImm("uint1", 1) body = s.pop_stmt() s.stmt_stack.append([]) - s.emit(_make.KernelDef(inputs_tvm, body, ret_void, ret_dtype, name)) + s.emit(_make.KernelDef(inputs_tvm, arg_shapes, arg_dtypes, + body, ret_void, ret_dtype, name, [])) for name_, i in zip(names, inputs): s.var_dict[name_] = i s.input_stages.clear() diff --git a/python/heterocl/mutator.py b/python/heterocl/mutator.py index 88ca42788..7d49f1e76 100644 --- a/python/heterocl/mutator.py +++ b/python/heterocl/mutator.py @@ -77,6 +77,8 @@ def mutate(self, node): return self.mutate_SetSlice(node) elif isinstance(node, _expr.KernelExpr): return self.mutate_KernelExpr(node) + elif isinstance(node, _expr.StreamExpr): + return self.mutate_StreamExpr(node) else: return node elif isinstance(node, _stmt.Stmt): @@ -112,6 +114,8 @@ def mutate(self, node): return self.mutate_Break(node) elif isinstance(node, _stmt.While): return self.mutate_While(node) + elif isinstance(node, _stmt.StreamStmt): + return self.mutate_StreamStmt(node) else: return node elif isinstance(node, tuple): @@ -248,6 +252,10 @@ def mutate_KernelExpr(self, node): args = self.mutate(node.args) return _make.KernelExpr(node.dtype, args, node.name) + def mutate_StreamExpr(self, node): + args = self.mutate(node.args) + return _make.StreamExpr(node.dtype, args, node.name) + # statements def mutate_LetStmt(self, node): var = self.mutate(node.var) @@ -320,6 +328,10 @@ def mutate_KernelStmt(self, node): args = self.mutate(node.args) return _make.KernelStmt(args, node.name) + def mutate_StreamStmt(self, node): + args = self.mutate(node.args) + return _make.StreamStmt(node.dtype, args, node.name) + def mutate_Return(self, node): value = self.mutate(node.value) return _make.Return(value)
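The dsl.py change threads per-argument shapes and dtypes into the generated KernelDef. On the user side this corresponds to the usual hcl.def_ declaration; a minimal sketch (shapes and names are illustrative, and the module body is just an example):

```python
import heterocl as hcl

hcl.init()
A = hcl.placeholder((10,), "A")
B = hcl.placeholder((10,), "B")

def kernel(A, B):
    # Each declared shape below yields one (arg_shape, arg_dtype)
    # pair carried by the KernelDef node built in the decorator.
    @hcl.def_([A.shape, B.shape], name="vadd")
    def vadd(a, b):
        with hcl.for_(0, 10) as i:
            b[i] = a[i] + 1

    vadd(A, B)

s = hcl.create_schedule([A, B], kernel)
```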
diff --git a/python/heterocl/schedule.py b/python/heterocl/schedule.py index abd74acdc..03af1cf3e 100644 --- a/python/heterocl/schedule.py +++ b/python/heterocl/schedule.py @@ -5,6 +5,7 @@ from ordered_set import OrderedSet from .tvm import make as _make from .tvm import stmt as _stmt +from .tvm import expr as _expr from .tvm import api as tvm_api from .tvm import _api_internal from .tvm._api_internal import _ExternOp @@ -134,6 +135,42 @@ def reuse_at(self, target, parent, axis, name=None): name = target.name + ".reuse" return self.sch.reuse_at(target, parent, axis, name) + def to(self, tensors, dst, src=None, + stream_type=_expr.StreamExpr.Channel, depth=10, name=None): + """Stream a list of Tensors to dst devices + + Parameters + ---------- + tensors : list of Tensor + The tensors to be moved + + dst : device or module + The destination device or module + + stream_type : {Channel, Pipe, FIFO}, optional + The stream type + """ + if stream_type > 2: + raise APIError("Invalid channel type") + rets = [] + if not isinstance(tensors, list): + tensors = [tensors] + for tensor in tensors: + try: + target = tensor.tensor + except (AttributeError, ValueError): + try: + target = tensor._op + except AttributeError: + target = tensor + if name is None: + name = target.name + ".stream" + ret = self.sch.to(target, dst, src, + stream_type, depth, name) + name = None + rets.append(ret) + return rets + def partition(self, target, partition_type=_stmt.Partition.Complete, dim=0, factor=0): """Partition a Tensor into smaller Tensors or even registers @@ -302,7 +339,7 @@ def __exit__(self, ptype, value, trace): # create the output operation input_ops = [i._op for i in self.input_stages] input_bufs = [i._buf for i in self.input_stages] - output_bufs = [self._buf] + output_bufs = [self._buf] body = self.pop_stmt() Stage._current.pop() op = _ExternOp(self.name, "", self.axis_list, input_ops, @@ -331,8 +368,7 @@ def __exit__(self, ptype, value, trace): superstage.var_dict[self.name] = self # update prefix self.name_with_prefix = superstage.name_with_prefix + "." + self.name - # Otherwise update the list of stages globally - else: + else: # otherwise update the list of stages globally Schedule.stage_ops.append(self) Schedule.last_stages.add(self) Schedule.last_stages -= self.input_stages
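A sketch of the new Schedule.to API above, in the style of the samples later in this patch (tensor and platform names are illustrative):

```python
import heterocl as hcl
from heterocl.tvm import expr as _expr

hcl.init()
target = hcl.platform.aws_f1
A = hcl.placeholder((32, 32), "A")

def kernel(A):
    return hcl.compute(A.shape, lambda x, y: A[x, y] + 1, "B")

s = hcl.create_schedule([A], kernel)
# Move the input to the accelerator and the result back to the host;
# stream_type is one of StreamExpr.Channel / Pipe / FIFO.
s.to(A, target.xcel, stream_type=_expr.StreamExpr.Channel, depth=10)
s.to(kernel.B, target.host)
```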
"disable_register_merging_across_hierarchies": ['On', 'Off', 'Auto'], + "mux_restructure" : ['On', 'Off', 'Auto'], + "optimization_technique" : ['Area', 'Speed', 'Balanced'], + "synthesis_effort" : ['Auto', 'Fast'], + "synth_timing_driven_synthesis" : ['On', 'Off'], + "fitter_aggressive_routability_optimization" : ['Always', 'Automatically', 'Never'], + "fitter_effort" : ['Standard Fit', 'Auto Fit'], + "remove_duplicate_registers" : ['On', 'Off'], + "physical_synthesis" : ['On', 'Off'], + "adv_netlist_opt_synth_wysiwyg_remap" : ['On', 'Off'], + "allow_any_ram_size_for_recognition" : ['On', 'Off'], + "allow_any_rom_size_for_recognition" : ['On', 'Off'], + "allow_any_shift_register_size_for_recognition" : ['On', 'Off'], + "allow_power_up_dont_care" : ['On', 'Off'], + "allow_shift_register_merging_across_hierarchies" : ["Always", "Auto", "Off"], + "allow_synch_ctrl_usage" : ['On', 'Off'], + "auto_carry_chains" : ['On', 'Off'], + "auto_clock_enable_recognition" : ['On', 'Off'], + "auto_dsp_recognition" : ['On', 'Off'], + "auto_enable_smart_compile" : ['On', 'Off'], + "auto_open_drain_pins" : ['On', 'Off'], + "auto_ram_recognition" : ['On', 'Off'], + "auto_resource_sharing" : ['On', 'Off'], + "auto_rom_recognition" : ['On', 'Off'], + "auto_shift_register_recognition" : ["Always", "Auto", "Off"], + "disable_register_merging_across_hierarchies" : ["Auto", "On", "Off"], + "enable_state_machine_inference" : ['On', 'Off'], + "force_synch_clear" : ['On', 'Off'], + "ignore_carry_buffers" : ['On', 'Off'], + "ignore_cascade_buffers" : ['On', 'Off'], + "ignore_max_fanout_assignments" : ['On', 'Off'], + "infer_rams_from_raw_logic" : ['On', 'Off'], + "mux_restructure" : ["Auto", "On", "Off"], + "optimization_technique" : ["Area", "Balanced", "Speed"], + "optimize_power_during_synthesis" : ["Extra effort", "Normal compilation", "Off"], + "remove_duplicate_registers" : ['On', 'Off'], + "shift_register_recognition_aclr_signal" : ['On', 'Off'], + "state_machine_processing" : + ["Auto", "Gray", "Johnson, Minimal Bits", "One-Hot", "Sequential", "User-Encoded"], + "strict_ram_recognition" : ['On', 'Off'], + "synthesis_effort" : ["Auto", "Fast"], + "synthesis_keep_synch_clear_preset_behavior_in_unmapper" : ['On', 'Off'], + "synth_resource_aware_inference_for_block_ram" : ['On', 'Off'], + "synth_timing_driven_synthesis" : ['On', 'Off'], + "alm_register_packing_effort" : ["High", "Low", "Medium"], + "auto_delay_chains" : ['On', 'Off'], + "auto_delay_chains_for_high_fanout_input_pins" : ["On", "Off"], + "eco_optimize_timing" : ["On", "Off"], + "final_placement_optimization" : ["Always", "Automatically", "Never"], + "fitter_aggressive_routability_optimization" : ["Always", "Automatically", "Never"], + "fitter_effort" : ["Standard Fit", "Auto Fit"], + "optimize_for_metastability" : ["On", "Off"], + "optimize_hold_timing" : ["All Paths", "IO Paths and Minimum TPD Paths", "Off"], + "optimize_ioc_register_placement_for_timing" : + ["Normal", "Off", "Pack All IO Registers"], + "optimize_multi_corner_timing" : ['On', 'Off'], + "optimize_power_during_fitting" : ["Extra effort", "Normal compilation", "Off"], + "physical_synthesis" : ['On', 'Off'], + "placement_effort_multiplier" : [0.2, 0.5, 1.0, 2.0, 3.0, 4.0], + "programmable_power_technology_setting" : ["Automatic", "Force All Tiles with Failing Timing Paths to High Speed", "Force All Used Tiles to High Speed", "Minimize Power Only"], + "qii_auto_packed_registers" : ["Auto", "Minimize Area", "Minimize Area with Chains", "Normal", "Off", "Sparse", "Sparse Auto"], + 
"router_clocking_topology_analysis" : ['On', 'Off'], + "router_lcell_insertion_and_logic_duplication" : ["Auto", "On", "Off"], + "router_register_duplication" : ["Auto", "On", "Off"], + "router_timing_optimization_level" : ["MINIMUM", "Normal", "MAXIMUM"], + "seed" : (1, 5), + "tdc_aggressive_hold_closure_effort" : ['On', 'Off'], + "allow_register_retiming" : ['On', 'Off']}), + + "aocl" : ("emu", {"version" : "17.0", + "clokc" : 1.5, + }) +} + diff --git a/python/heterocl/tvm/build_module.py b/python/heterocl/tvm/build_module.py index c8dcc91f2..47b4e31ae 100755 --- a/python/heterocl/tvm/build_module.py +++ b/python/heterocl/tvm/build_module.py @@ -6,8 +6,10 @@ from __future__ import absolute_import as _abs import warnings import types +import os from ._ffi.node import NodeBase, register_node +from ._ffi.function import register_func from ._ffi.base import _RUNTIME_ONLY from . import api from . import tensor @@ -21,6 +23,48 @@ from . import ndarray from . import target as _target from . import make +from ..devices import platform + +# test build sim +@register_func +def tvm_callback_syn_postproc(code): + return "test" + +@register_func +def get_util_path(platform): + if platform == "aws_f1": + return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/sdaccel/" + elif platform == "rocket": + ppac = "/work/zhang-x1/users/sx233/heterocl/hlib/rocc-ppac" + emulator = os.path.join(ppac, "rocket/emulator/emulator-freechips." + \ + "rocketchip.system-RoccExampleConfig-debug") + # build emulator if not exist + if not os.path.isfile(emulator): + cmd = "cd " + ppac + ";" + cmd += "cp src/Ppac.v rocket/src/main/resources/vsrc;" + \ + "cp src/PpacRoCC.scala rocket/src/main/scala/tile;" + \ + "cd rocket && git apply ../src/rocc-ppac.patch;" + \ + "cd emulator && make CONFIG=RoccExampleConfig debug" + # create subprocess to check + subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait() + + # re-build proxy kernel + if not os.path.isfile(ppac + "/rocket/riscv-pk/build/pk"): + cmd = "cd " + ppac + "/rocket/riscv-pk;" + cmd += "git apply ../../tests/patches/riscv-pk.patch;" + cmd += "mkdir build; cd build;" + cmd += " ../configure --prefix=$RISCV/riscv64-unknown-elf --host=riscv64-unknown-elf;" + cmd += "make -j8; make install" + subprocess.Popen(cmd, shell=True, stdout=open("build.log", "w")).wait() + # return util folder needed to compile generated test files + return "/work/zhang-x1/users/sx233/heterocl/rocc-ppac/tests" + + # copy tcl and testbench + elif platform == "vivado_hls": + return "/work/zhang-x1/users/sx233/heterocl/tvm/src/template/vivado" + + else: # unrecognized platform + assert False, "unsupported platform" class DumpIR(object): """ @@ -340,6 +384,7 @@ def lower(sch, stmt = f(stmt) # Phase 1 stmt = ir_pass.StorageFlatten(stmt, binds, 64) + stmt = ir_pass.InferStream(stmt, 32) #stmt = ir_pass.CanonicalSimplify(stmt) #TODO: SOLVE THIS!! stmt = ir_pass.LiftAllocateAttrs(stmt) if cfg.generate_reuse_buffer: @@ -378,7 +423,7 @@ def lower(sch, else: return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func) -def build_fpga_kernel(sch, args, target_name, name="default_function"): +def build_fpga_kernel(sch, args, target, name="default_function"): """Build an FPGA kernel. 
Parameters @@ -407,20 +452,66 @@ def build_fpga_kernel(sch, args, target_name, name="default_function"): if args is None: raise ValueError("args must be given for build from schedule") - if target_name == "merlinc": + # generate host (device) code / function + if target == "merlinc": BuildConfig.current = build_config(generate_reuse_buffer=False) else: BuildConfig.current = build_config() + flist = lower(sch, args, kernel_only=True, name=name) if isinstance(flist, container.LoweredFunc): flist = [flist] - fdevice = [ir_pass.LowerIntrin(x, target_name) for x in flist] + fdevice = [ir_pass.LowerIntrin(x, str(target)) for x in flist] + + if isinstance(target, str): # string type + builder = getattr(codegen, "build_{0}".format(target)) + ret = builder(fdevice) + if isinstance(ret, str): + decl = ret[:ret.find("{device}")] + start = ret.find("{host}") + end = ret.rfind("{host}") + ret = decl + "\n" + ret[start+6:end] + ret = ret.strip("\n").lstrip("\n") + "\n\n" + return ret + + try: # generate and split code + host, xcel = None, None + if target.tool.name == "sdaccel": + host = target.host.lang.replace("opencl", "aocl") + xcel = target.xcel.lang.replace("hlsc", "vhls") + elif target.tool.name == "vivado_hls": + host = target.host.lang.replace("hlsc", "vhls") + xcel = target.xcel.lang.replace("hlsc", "vhls") + elif target.tool.name == "rocket": + host = target.host.lang.replace("c", "rv64_ppac") + + # return simulation built function + mode = str(target.tool.mode) + if "emu" in mode or "sim" in mode: + builder = getattr(codegen, "build_{0}".format("sim")) + keys = [k for k in target.tool.options.keys()] + vals = [v for v in target.tool.options.values()] + keys.insert(0, "name") + vals.insert(0, target.tool.name) + return builder(fdevice, keys, vals) + elif mode != "debug": # impl mode + pass + else: # return source code only + host_code, xcel_code = "", "" + if host: # src mode generate host code + builder = getattr(codegen, "build_{0}".format(host)) + host_code = builder(fdevice) + findex, rindex = host_code.find("{host}"), host_code.rfind("{host}") + host_code = host_code[findex + 6 : rindex] + if xcel: # src mode generate xcel code + builder = getattr(codegen, "build_{0}".format(xcel)) + xcel_code = builder(fdevice) + findex, rindex = xcel_code.find("{device}"), xcel_code.rfind("{device}") + xcel_code = xcel_code[findex + 8 : rindex] + return xcel_code + host_code - try: - builder = getattr(codegen, "build_{0}".format(target_name)) - return builder(fdevice) except AttributeError: - raise AttributeError("Cannot find the target builder %s" % target_name) + raise AttributeError("Cannot find the target builder %s" % target) return None def build(sch, @@ -468,11 +559,13 @@ def build(sch, ---- See the note on :any:`tvm.target` on target string format. 
""" - target = _target.current_target() if target is None else target - target = _target.create(target) if target else _target.create("llvm") - - if "fpga" in target.keys: - return build_fpga_kernel(sch, args, target.target_name, name=name) + if isinstance(target, platform): + return build_fpga_kernel(sch, args, target, name=name) + else: # default string type target + target = _target.current_target() if target is None else target + target = _target.create(target) if target else _target.create("llvm") + if "fpga" in target.keys: + return build_fpga_kernel(sch, args, target.target_name, name=name) BuildConfig.current = build_config() if isinstance(sch, schedule._Schedule): diff --git a/python/heterocl/tvm/expr.py b/python/heterocl/tvm/expr.py index d71307e8f..d1ea4ae75 100644 --- a/python/heterocl/tvm/expr.py +++ b/python/heterocl/tvm/expr.py @@ -382,3 +382,9 @@ class Quantize(Expr): @register_node class KernelExpr(Expr): pass + +@register_node +class StreamExpr(Expr): + Channel = 0 + Pipe = 1 + FIFO = 2 diff --git a/python/heterocl/tvm/schedule.py b/python/heterocl/tvm/schedule.py index 21905b443..36ead39de 100644 --- a/python/heterocl/tvm/schedule.py +++ b/python/heterocl/tvm/schedule.py @@ -3,6 +3,7 @@ from ._ffi.base import string_types from ._ffi.node import NodeBase, register_node from ._ffi.function import _init_api +from ..devices import Device from . import _api_internal from . import tensor as _tensor from . import expr as _expr @@ -332,6 +333,53 @@ def reuse_at(self, target, parent, axis, name): def partition(self, target, partition_type, dim, factor): return _api_internal._SchedulePartition(self, target, dim, factor, partition_type) + def to(self, tensor, dst, src, + types=_expr.StreamExpr.Channel, + depth=1, name=None): + """ Stream data to devices or on-chip module + + Parameters + ---------- + tensor : list of Tensors + Tensor to be streamed. + dst : hcl device or dst stage + The device or module for streaming + type : channel type + The streaming type (e.g. fifo or pipe) + + Returns + ------- + outer : IterVar + The outer variable of iteration. + """ + # create producer and consumer for stream + if isinstance(dst, Device): + dst = 1 if 'fpga' in str(dst) else 0 + return _api_internal._ScheduleMove(self, tensor, dst, + types, depth, name) + else: # connect kernel + assert isinstance(dst, _Stage), "dst not a stage " + if src: # remove buffer between kernels + assert isinstance(src, _Stage), \ + "destination should be a stage but " + str(type(src)) + try: + self.remove_args.append(tensor.op.output(0)) + except: + self.remove_args = [] + self.remove_args.append(tensor.op.output(0)) + _api_internal._ScheduleStream(self, tensor, dst, src, + types, depth, name) + else: # from externop buffer to kernel + shape = [_.value for _ in tensor.shape] + index, match = 0, [] + for s in dst.op.body.api_args: + arg_shape = [_.value for _ in s] + if shape == arg_shape: match.append(index) + index = index + 1 + assert len(match) > 0, "wrong kernel or tensor (shape not matching)" + _api_internal._ScheduleMoveToStage(self, tensor, dst, match[0], + types, depth, name) + @register_node("Stage") class _Stage(NodeBase): """A Stage represents schedule for one operation. @@ -654,7 +702,7 @@ def pragma(self, var, pragma_type): - **parallel_stride_pattern** Hint parallel loop to execute in strided pattern. 
- :code:`for (int i = task_id; i < end; i += num_task)` + :code:`for (int i = task_id; i < end; i += num_task)` """ _api_internal._StagePragma(self, var, pragma_type) diff --git a/python/heterocl/tvm/stmt.py b/python/heterocl/tvm/stmt.py index 4db84970f..d5c2d0a18 100644 --- a/python/heterocl/tvm/stmt.py +++ b/python/heterocl/tvm/stmt.py @@ -112,3 +112,7 @@ class Partition(Stmt): @register_node class Stencil(Stmt): pass + +@register_node +class StreamStmt(Stmt): + pass diff --git a/python/heterocl/tvm/target.py b/python/heterocl/tvm/target.py index 12235d95d..5687953ca 100644 --- a/python/heterocl/tvm/target.py +++ b/python/heterocl/tvm/target.py @@ -1,43 +1,3 @@ -"""Target management API of TVM. - -TVM's target string is in fomat `` [-option=value]...``. - -Note ----- -The list of options include: - -- **-device=** - - The device name. - -- **-mtriple=** or **-target** - - Specify the target triple, which is useful for cross - compilation. - -- **-mcpu=** - - Specify a specific chip in the current architecture to - generate code for. By default this is infered from the - target triple and autodetected to the current architecture. - -- **-mattr=a1,+a2,-a3,...** - - Override or control specific attributes of the target, - such as whether SIMD operations are enabled or not. The - default set of attributes is set by the current CPU. - -- **-system-lib** - - Build TVM system library module. System lib is a global module that contains - self registered functions in program startup. User can get the module using - :any:`tvm.module.system_lib`. - It is useful in environments where dynamic loading api like dlopen is banned. - The system lib will be available as long as the result code is linked by the program. - -We can use :any:`tvm.target.create` to create a tvm.target.Target from the target string. -We can also use other specific function in this module to create specific targets. -""" from __future__ import absolute_import import warnings @@ -50,7 +10,8 @@ if _LIB_NAME != "libhcl_runtime.so": raise err_msg -FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim'] +FPGA_TARGETS = ['merlinc', 'soda', 'soda_xhls', 'vhls', 'ihls', 'vhls_csim', + 'opencl', 'sdaccel', 'sdaccel_csim', 'aocl', 'aocl_csim', 'rv64_ppac'] def _merge_opts(opts, new_opts): """Helper function to merge options""" @@ -68,7 +29,7 @@ class Target(object): Parameters ---------- - target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev"} + target_name : {"llvm", "cuda", "opencl", "metal", "rocm", "stackvm", "opengl", "ext_dev", "rv64_ppac"} The major target name. {"merlinc", "soda", "soda_xhls", "vhls"} diff --git a/python/heterocl/util.py b/python/heterocl/util.py index 996201105..704b774cb 100644 --- a/python/heterocl/util.py +++ b/python/heterocl/util.py @@ -4,6 +4,7 @@ from .tvm.expr import Var, Call from .tvm.api import _IterVar, decl_buffer from . import types +from . import devices from . 
import config from .scheme import Scheme from .debug import DTypeError diff --git a/samples/conv/conv.py b/samples/conv/conv.py new file mode 100644 index 000000000..ca41a50a1 --- /dev/null +++ b/samples/conv/conv.py @@ -0,0 +1,70 @@ +import heterocl as hcl +import hlib +import numpy as np +from PIL import Image +from urllib.request import urlopen + +batch_size = 1 +hcl.init(hcl.UInt(32)) +dtype = hcl.UInt(32) +image_size = () +kernel_size = 3 + +# setup target using vivado +tool = hcl.tool.vivado("csim") +target = hcl.platform.zc706 + +def conv(): + image = hcl.placeholder((batch_size, 1, 256, 256), "input_image") + k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1") + k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2") + + def kernel(input_image, kernel_1, kernel_2): + + # return tensor required (cannot do def_()) + interm_shape = (1,1,254,254) + output_shape = (1,1,252,252) + + # make compute wrapped in hcl def + module1 = hcl.def_([input_image.shape, kernel_1.shape, interm_shape], name="conv1")(hlib.nn.conv2d_nchw_imp) + module2 = hcl.def_([interm_shape, kernel_2.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp) + conv1 = hcl.compute(interm_shape, lambda *args: 0) + conv2 = hcl.compute(output_shape, lambda *args: 0) + module1(input_image, kernel_1, conv1) + module2(conv1, kernel_2, conv2) + + # derivative module for normalization + return hcl.compute(output_shape, lambda *args: conv2[args], name="derv") + + s = hcl.create_schedule([image, k1, k2], kernel) + + # data moved to local + i0, k10, k20 = s.to([image, k1, k2], target.fpga) + # s.to([i0, k10], s[kernel.conv1]) + # s.to([k20], s[kernel.conv2]) + s.to(kernel.derv, target.cpu) + + # create stream channel between modules + print(type(target.fpga), hcl.lower(s)) + return hcl.build(s, target) + +# Load sample data +img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif')) +kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) +kernel_y = np.flip(kernel_x.T.T, axis=0) +img = np.array(img) + +img = img[np.newaxis, ...] +img = img[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] 
+ +hcl_input = hcl.asarray(img, dtype) +kernel_x = hcl.asarray(kernel_x, dtype) +kernel_y = hcl.asarray(kernel_y, dtype) +hcl_output = hcl.asarray(np.zeros((1,1,252,252)), dtype) + +f = conv() +f(hcl_input, kernel_x, kernel_y, hcl_output) diff --git a/samples/digitrec/digitrec_stream.py b/samples/digitrec/digitrec_stream.py new file mode 100644 index 000000000..4c0da096a --- /dev/null +++ b/samples/digitrec/digitrec_stream.py @@ -0,0 +1,150 @@ +import heterocl as hcl +import time +import numpy as np +import math +from digitrec_data import read_digitrec_data + +N = 8 * 8 +max_bit = int(math.ceil(math.log(N, 2))) +test_size = (180, ) +data_size = (10, 1800) + +dtype_image = hcl.UInt(N) +dtype_knnmat = hcl.UInt(max_bit) + +setting = { + "version" : "2019.1", + "clock" : "10" +} +tool = hcl.tool.vivado("csim", setting) +target = hcl.platform.aws_f1 + +def knn(test_images, train_images): + + def popcount(num): + out = hcl.local(0, "out") + with hcl.for_(0, train_images.type.bits) as i: + out.v += num[i] + return out.v + + def update_knn(dist, knn_mat, i, j): + max_id = hcl.local(0, "max_id") + with hcl.for_(0, 3) as k: + with hcl.if_(knn_mat[i][k] > knn_mat[i][max_id.v]): + max_id.v = k + with hcl.if_(dist[i][j] < knn_mat[i][max_id.v]): + knn_mat[i][max_id.v] = dist[i][j] + + def sort_knn(knn_mat, i, j): + val = hcl.local(0, "val") + with hcl.if_( j == 1 ): + with hcl.if_( knn_mat[i][1] > knn_mat[i][2] ): + val.v = knn_mat[i][1] + knn_mat[i][1] = knn_mat[i][2] + knn_mat[i][2] = val.v + with hcl.else_(): + with hcl.if_( knn_mat[i][0] > knn_mat[i][1] ): + val.v = knn_mat[i][0] + knn_mat[i][0] = knn_mat[i][1] + knn_mat[i][1] = val.v + + def knn_vote(knn_mat, j): + id0 = hcl.local(0, "id0") + id1 = hcl.local(0, "id1") + id2 = hcl.local(0, "id2") + count = hcl.local(0, "count") + with hcl.for_(0, 10) as n: + with hcl.if_(knn_mat[n][0] < knn_mat[id0.v][0]): + id0.v = n + with hcl.for_(0, 10) as m: + with hcl.if_(knn_mat[m][0] < knn_mat[id1.v][0]): + id1.v = m + with hcl.for_(0, 10) as k: + with hcl.if_(knn_mat[k][0] < knn_mat[id2.v][0]): + id2.v = k + with hcl.if_(j == id0.v): + count.v += 1 + with hcl.elif_(j == id1.v): + count.v += 1 + with hcl.elif_(j == id2.v): + count.v += 1 + with hcl.else_(): + count.v += 0 + return count.v + + # support hcl.compute in hcl def + @hcl.def_([(), data_size, (10,3)]) + def knn_dist(test_image, train_images, pred_matrix): + pass + + with hcl.for_(0, 180) as index: + test_image = test_images[index] + diff = hcl.compute(train_images.shape, + lambda x, y: train_images[x][y] ^ test_image, + "diff") + dist = hcl.compute(diff.shape, + lambda x, y: popcount(diff[x][y]), + "dist") + knn_mat = hcl.compute((10, 3), lambda x, y: 50, "knn_mat") + hcl.mutate(dist.shape, + lambda x, y: update_knn(dist, knn_mat, x, y), + "knn_update") + hcl.mutate((10, 3), lambda x, y: sort_knn(knn_mat, x, y), "sort") + knn_new = hcl.compute(knn_mat.shape, + lambda x, y: knn_mat[x][y], "copy") + knn_pred = hcl.compute((10,), + lambda x: knn_vote(knn_mat, x), "vote") + return knn_pred + +test_image = hcl.placeholder(test_size, "test_image", dtype_image) +train_images = hcl.placeholder(data_size, "train_images", dtype_image) + +scheme = hcl.create_scheme([test_image, train_images], knn) +scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat], dtype_knnmat) + +s = hcl.create_schedule_from_scheme(scheme) + +diff = knn.diff +dist = knn.dist +vote = knn.copy +knn_update = knn.knn_update + +s.to([test_image, train_images], target.xcel) +s.to(vote, target.host) + +# merge loop nests +s[diff].compute_at(s[dist], dist.axis[1]) +s[dist].compute_at(s[knn_update], knn_update.axis[1]) + +# reorder loop to expose more parallelism +s[knn_update].reorder(knn_update.axis[1], knn_update.axis[0]) + +# parallel outer loop and pipeline inner loop +s[knn_update].parallel(knn_update.axis[1]) +s[knn_update].pipeline(knn_update.axis[0]) + +# at the end, we build the whole offloaded function. +# print(hcl.lower(s)) +f = hcl.build(s, target) + +train_images, _, test_images, test_labels = read_digitrec_data() +total = len(test_images) +total_time = 0 + +# read returned prediction from streaming pipe +hcl_train_images = hcl.asarray(train_images, dtype_image) +hcl_knn_pred = hcl.asarray(np.zeros((total, 10)), dtype_knnmat) + +start = time.time() +f(test_images, hcl_train_images, hcl_knn_pred) +total_time = total_time + (time.time() - start) + +knn_result = hcl_knn_pred.asnumpy() + +correct = 0.0 +for i in range(total): + if np.argmax(knn_result[i]) == test_labels[i]: + correct += 1 + +print("Average kernel time (s): {:.2f}".format(total_time/total)) +print("Accuracy (%): {:.2f}".format(100*correct/total))
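For cross-checking the streamed results, the distance stage of the kernel above (XOR plus popcount, keeping the three nearest neighbors per digit) has a direct numpy analogue; a sketch (the helper name is hypothetical):

```python
import numpy as np

def knn_reference(test_image, train_images):
    # XOR each candidate against the test image, like the `diff` stage.
    diff = train_images ^ test_image
    # Popcount of each N-bit value, like the `popcount` helper.
    dist = np.vectorize(lambda v: bin(int(v)).count("1"))(diff)
    # Keep the three smallest distances per digit, like `knn_mat`.
    return np.sort(dist, axis=1)[:, :3]
```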
diff --git a/samples/digitrec/kernel.cpp b/samples/digitrec/kernel.cpp deleted file mode 100644 index 21b550c8b..000000000 --- a/samples/digitrec/kernel.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include -#pragma ACCEL kernel -void default_function(unsigned long test_image, unsigned long* train_images, unsigned char* knn_mat) { - for (int x = 0; x < 10; ++x) { - for (int y = 0; y < 3; ++y) { - knn_mat[(y + (x * 3))] = (unsigned char)50; - } - } - unsigned long knn_update; -#pragma ACCEL parallel - for (int y1 = 0; y1 < 1800; ++y1) { -#pragma ACCEL pipeline - for (int x1 = 0; x1 < 10; ++x1) { - unsigned char dist; - unsigned long diff; - diff = (train_images[(y1 + (x1 * 1800))] ^ test_image); - unsigned char out; - out = (unsigned char)0; - for (int i = 0; i < 49; ++i) { - out = ((unsigned char)(((unsigned long)out) + ((unsigned long)((diff & (1L << i)) >> i)))); - } - dist = out; - unsigned long max_id; - max_id = (unsigned long)0; - for (int i1 = 0; i1 < 3; ++i1) { - if (knn_mat[(((long)max_id) + ((long)(x1 * 3)))] < knn_mat[(i1 + (x1 * 3))]) { - max_id = ((unsigned long)i1); - } - } - if (dist < knn_mat[(((long)max_id) + ((long)(x1 * 3)))]) { - knn_mat[(((long)max_id) + ((long)(x1 * 3)))] = dist; - } - } - } -} - diff --git a/samples/gemm/common/common.mk b/samples/gemm/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/gemm/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int
-f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . +${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/gemm/gemm_aocl.cl b/samples/gemm/gemm_aocl.cl new file mode 100644 index 000000000..198757823 --- /dev/null +++ b/samples/gemm/gemm_aocl.cl @@ -0,0 +1,14 @@ +#include "ihc_apint.h" +__kernel void default_function(__global int* restrict placeholder0, __global int* restrict placeholder1, __global int* restrict matrix_3) { + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 10; ++y) { + int sum; + sum = 0; + for (int k = 0; k < 10; ++k) { + sum = ((int)(((int64_t)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((int64_t)sum))); + } + matrix_3[(y + (x * 10))] = sum; + } + } +} + diff --git a/samples/gemm/gemm_main.py b/samples/gemm/gemm_main.py index fb05a094d..4796bf2fb 100644 --- a/samples/gemm/gemm_main.py +++ b/samples/gemm/gemm_main.py @@ -52,5 +52,6 @@ def time_gemm(dtype, m=1024, n=1024, k=1024, target=None): ############################################################################### # Test the algorithm with different data types dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] -for dtype in dtypes: - time_gemm(dtype) + +# for dtype in dtypes: +# time_gemm(hcl.Float(), 10, 10, 10, 'sdaccel') diff --git a/samples/gemm/gemm_runtime.py b/samples/gemm/gemm_runtime.py new file mode 100644 index 000000000..49947fa4c --- /dev/null +++ b/samples/gemm/gemm_runtime.py @@ -0,0 +1,86 @@ +# Yang.Bai +# yb269@cornell.edu + +import heterocl as hcl +import numpy as np + +hcl.init() + +# matrix_size = (16, 16) +# def add_compute(A, B): +# C = hcl.compute(A.shape, lambda x, y: A[x, y] + B[x, y], "C") +# return C + +# def add_compute_2(A, B): +# C = hcl.compute(A.shape, lambda x: A[x] + B[x], "C") +# return C + +# A = hcl.placeholder(matrix_size, "A") +# B = hcl.placeholder(matrix_size, "B") + +# s = hcl.create_schedule([A, B], add_compute) +# # f2 = hcl.build(s, target='sdaccel') +# f2 = hcl.build(s, target='aocl') +# print (f2) + +# hcl_A = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float()) +# hcl_B = hcl.asarray(np.random.random_sample(matrix_size), dtype = hcl.Float()) +# hcl_C = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float()) +# hcl_C2 = hcl.asarray(np.zeros(matrix_size), dtype = hcl.Float()) +# f3 = hcl.build(s) + +# A = hcl.placeholder((10, ), "A") +# B = hcl.placeholder((10, ), "B") +# s = hcl.create_schedule([A, B], add_compute_2) +# f4 = hcl.build(s, target='sdaccel') +# print (f4) +# print (hcl_A, hcl_B, hcl_C) + +matrix_1_size = (10, 10) +matrix_2_size = (10, 10) +matrix_3_size = (matrix_1_size[0], matrix_2_size[1]) + +def gemm_compute(matrix_1, matrix_2): + m = matrix_1.shape[0]; + k = matrix_1.shape[1]; + n = matrix_2.shape[1]; + r = hcl.reduce_axis(0, k, 'k') + temp = hcl.compute((m, n), + lambda x, y: hcl.sum(matrix_1[x, r] * matrix_2[r, y], 
+ axis = r), name='matrix_3') + return temp + +matrix_1 = hcl.placeholder(matrix_1_size) +matrix_2 = hcl.placeholder(matrix_2_size) + +s = hcl.create_schedule([matrix_1, matrix_2], gemm_compute) +f = hcl.build(s, target='sdaccel_csim') +code = hcl.build(s, target='aocl') +with open('gemm_aocl.cl', 'w') as fin: + fin.write(code) + +code2 = hcl.build(s, target='sdaccel') +with open('gemm_sdaccel.cl', 'w') as fin2: + fin2.write(code2) + + +matrix_1_np = np.random.randint(10, size=matrix_1_size) +matrix_2_np = np.random.randint(10, size=matrix_2_size) +matrix_3_np = np.random.randint(10, size=matrix_3_size) + +hcl_matrix_1 = hcl.asarray(matrix_1_np) +hcl_matrix_2 = hcl.asarray(matrix_2_np) +hcl_matrix_3 = hcl.asarray(matrix_3_np) + +# f(hcl_matrix_1, hcl_matrix_2, hcl_matrix_3) + + + + + +# with open('sdaccel.cl', 'w') as f: +# f.write(code) + + + + diff --git a/samples/gemm/gemm_sdaccel.cl b/samples/gemm/gemm_sdaccel.cl new file mode 100644 index 000000000..f46a88426 --- /dev/null +++ b/samples/gemm/gemm_sdaccel.cl @@ -0,0 +1,13 @@ +__kernel void default_function(__global int* placeholder0, __global int* placeholder1, __global int* matrix_3) { + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 10; ++y) { + __local int sum; + sum = 0; + for (int k = 0; k < 10; ++k) { + sum = ((int)(((long)(((long)placeholder0[(k + (x * 10))]) * ((long)placeholder1[(y + (k * 10))]))) + ((long)sum))); + } + matrix_3[(y + (x * 10))] = sum; + } + } +} + diff --git a/samples/gemm/gemm_sdaccel.py b/samples/gemm/gemm_sdaccel.py new file mode 100644 index 000000000..85c318120 --- /dev/null +++ b/samples/gemm/gemm_sdaccel.py @@ -0,0 +1,8 @@ +import heterocl as hcl +import numpy as np +from gemm_main import * + +#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] +#for dtype in dtypes: +time_gemm(hcl.Int(32), 15, 15, 15, 'sdaccel_sw_emu') +# time_gemm(hcl.Float(), 100, 100, 100, 'sdaccel_sw_emu') diff --git a/samples/gemm/gemm_vhls.py b/samples/gemm/gemm_vhls.py index e27fa155e..8edd84bdd 100644 --- a/samples/gemm/gemm_vhls.py +++ b/samples/gemm/gemm_vhls.py @@ -2,6 +2,6 @@ import numpy as np from gemm_main import * -dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] -for dtype in dtypes: - time_gemm(dtype, 10, 10, 10, 'vhls_csim') +#dtypes = [hcl.Int(32), hcl.Float(), hcl.Fixed(32, 16)] +#for dtype in dtypes: +time_gemm(hcl.Int(32), 10, 10, 10, 'vhls_csim') diff --git a/samples/gemm/host.cpp b/samples/gemm/host.cpp new file mode 100644 index 000000000..914b2aa26 --- /dev/null +++ b/samples/gemm/host.cpp @@ -0,0 +1,118 @@ +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#pragma once + + + + +int main(int argc, char* argv[]) { +#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE) + #define STR_VALUE(arg) #arg + #define GET_STRING(name) STR_VALUE(name) + #define TARGET_DEVICE GET_STRING(SDX_PLATFORM) +#endif + char* xclbinFilename = argv[1]; + + std::vector<int> source_0(6 * 2); + std::vector<int> source_1(2 * 7); + std::vector<int> source_2(6 * 7); + + size_t vector_size_bytes_0 = sizeof(int) * 6 * 2; + size_t vector_size_bytes_1 = sizeof(int) * 2 * 7; + size_t vector_size_bytes_2 = sizeof(int) * 6 * 7; + + int* arg_0 = (int*)shmat(4849666, nullptr, 0); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 2; i1++) { + source_0[i1 + i0*2] = arg_0[i1 +
i0*2]; + } + } + int* arg_1 = (int*)shmat(7667712, nullptr, 0); + for (size_t i0 = 0; i0 < 2; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + source_1[i1 + i0*7] = arg_1[i1 + i0*7]; + } + } + int* arg_2 = (int*)shmat(7667713, nullptr, 0); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + source_2[i1 + i0*7] = arg_2[i1 + i0*7]; + } + } + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + cl::Platform platform = platforms[0]; + + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices); + cl::Device device = devices[0]; + + cl::Context context(device); + cl::CommandQueue q(context, device); + + std::ifstream bin_file(xclbinFilename, std::ifstream::binary); + bin_file.seekg (0, bin_file.end); + unsigned nb = bin_file.tellg(); + bin_file.seekg (0, bin_file.beg); + char *buf = new char [nb]; + bin_file.read(buf, nb); + + cl::Program::Binaries bins; + bins.push_back({buf,nb}); + devices.resize(1); + cl::Program program(context, devices, bins); + + int err1; + cl::Kernel kernel(program, "default_function", &err1); + auto default_function = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&>(kernel); + + cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0); + cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1); + cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2); + + q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + + default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2); + q.finish(); + + q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 2; i1++) { + arg_0[i1 + i0*2] = source_0[i1 + i0*2]; + } + } + shmdt(arg_0); + for (size_t i0 = 0; i0 < 2; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + arg_1[i1 + i0*7] = source_1[i1 + i0*7]; + } + } + shmdt(arg_1); + for (size_t i0 = 0; i0 < 6; i0++) { + for (size_t i1 = 0; i1 < 7; i1++) { + arg_2[i1 + i0*7] = source_2[i1 + i0*7]; + } + } + shmdt(arg_2); +} diff --git a/samples/gemm/sdaccel.mk b/samples/gemm/sdaccel.mk new file mode 100644 index 000000000..9cf0dafd7 --- /dev/null +++ b/samples/gemm/sdaccel.mk @@ -0,0 +1,33 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=.
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/kmeans/kmeans_aocl.cl b/samples/kmeans/kmeans_aocl.cl new file mode 100644 index 000000000..e64b116f4 --- /dev/null +++ b/samples/kmeans/kmeans_aocl.cl @@ -0,0 +1,49 @@ +#include "ihc_apint.h" +__kernel void default_function(__global int* restrict placeholder2, __global int* restrict placeholder3, __global int* restrict compute3) { + for (int x = 0; x < 32; ++x) { + compute3[x] = 0; + } + int main_loop; + for (int _1 = 0; _1 < 10; ++_1) { + #pragma ii 1 + for (int N = 0; N < 32; ++N) { + int local2; + local2 = 100000; + for (int i = 0; i < 6; ++i) { + int local3; + local3 = 0; + for (int i1 = 0; i1 < 3; ++i1) { + local3 = ((int)(((int64_t)local3) + ((int64_t)(((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))]))) * ((int64_t)((int33_t)(placeholder2[(i1 + (N * 3))] - placeholder3[(i1 + (i * 3))]))))))); + } + if (local3 < local2) { + local2 = local3; + compute3[N] = i; + } + } + } + int compute4[6]; + for (int x1 = 0; x1 < 6; ++x1) { + compute4[x1] = 0; + } + int compute5[18]; + for (int x2 = 0; x2 < 6; ++x2) { + for (int y = 0; y < 3; ++y) { + compute5[(y + (x2 * 3))] = 0; + } + } + int calc_sum; + #pragma unroll + for (int n = 0; n < 32; ++n) { + compute4[compute3[n]] = (compute4[compute3[n]] + 1); + for (int i2 = 0; i2 < 3; ++i2) { + compute5[(i2 + (compute3[n] * 3))] = ((int)(((int33_t)compute5[(i2 + (compute3[n] * 3))]) + ((int33_t)placeholder2[(i2 + (n * 3))]))); + } + } + int update_mean; + #pragma unroll + for (int k_d_fused = 0; k_d_fused < 18; ++k_d_fused) { + placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 3)]); + } + } +} + diff --git a/samples/kmeans/kmeans_sdaccel.py b/samples/kmeans/kmeans_sdaccel.py new file mode 100644 index 000000000..c204c592e --- /dev/null +++ b/samples/kmeans/kmeans_sdaccel.py @@ -0,0 +1,27 @@ +import numpy as np +import random +import heterocl as hcl +from kmeans_main import top + +K = 16 +N = 320 +dim = 32 + +f1 = top('sdaccel_sw_emu') +#f2 = top() +points_np = np.random.randint(100, size=(N, dim)) +labels_np = np.zeros(N) +means_np = points_np[random.sample(range(N), K),:] + +hcl_points1 = hcl.asarray(points_np) +hcl_means1 = hcl.asarray(means_np) +hcl_labels1 = hcl.asarray(labels_np) + +hcl_points2 = hcl.asarray(points_np) +hcl_means2 = hcl.asarray(means_np) +hcl_labels2 = hcl.asarray(labels_np) + +f1(hcl_points1, hcl_means1, hcl_labels1) +#f2(hcl_points2, hcl_means2, hcl_labels2) + +#assert np.array_equal(hcl_labels1.asnumpy(), hcl_labels2.asnumpy()) diff --git a/samples/kmeans/merlinc_code.cl b/samples/kmeans/merlinc_code.cl new file mode 100644 index 000000000..ea672313d --- /dev/null +++ b/samples/kmeans/merlinc_code.cl @@ -0,0 +1,52 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(int* placeholder2, int* placeholder3, int* compute3) { + for 
(int x = 0; x < 320; ++x) { + compute3[x] = 0; + } + int main_loop; + for (int _1 = 0; _1 < 200; ++_1) { +#pragma ACCEL pipeline + for (int N = 0; N < 320; ++N) { + int local2; + local2 = 100000; + for (int i = 0; i < 16; ++i) { + int local3; + local3 = 0; + for (int i1 = 0; i1 < 32; ++i1) { + local3 = ((int)(((long)local3) + ((long)(((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))]))) * ((long)((long)(placeholder2[(i1 + (N * 32))] - placeholder3[(i1 + (i * 32))]))))))); + } + if (local3 < local2) { + local2 = local3; + compute3[N] = i; + } + } + } + int compute4[16]; + for (int x1 = 0; x1 < 16; ++x1) { + compute4[x1] = 0; + } + int compute5[512]; + for (int x2 = 0; x2 < 16; ++x2) { + for (int y = 0; y < 32; ++y) { + compute5[(y + (x2 * 32))] = 0; + } + } + int calc_sum; +#pragma ACCEL parallel flatten + for (int n = 0; n < 320; ++n) { + compute4[compute3[n]] = (compute4[compute3[n]] + 1); + for (int i2 = 0; i2 < 32; ++i2) { + compute5[(i2 + (compute3[n] * 32))] = ((int)(((long)compute5[(i2 + (compute3[n] * 32))]) + ((long)placeholder2[(i2 + (n * 32))]))); + } + } + int update_mean; +#pragma ACCEL parallel flatten + for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + placeholder3[k_d_fused] = (compute5[k_d_fused] / compute4[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/kmeans/sdaccel_code.cl b/samples/kmeans/sdaccel_code.cl new file mode 100644 index 000000000..196f96257 --- /dev/null +++ b/samples/kmeans/sdaccel_code.cl @@ -0,0 +1,48 @@ +__kernel void default_function(__global int* placeholder4, __global int* placeholder5, __global int* compute6) { + for (int x = 0; x < 320; ++x) { + compute6[x] = 0; + } + __local int main_loop; + for (int _1 = 0; _1 < 200; ++_1) { + __attribute__((xcl_pipeline_loop(1))) + for (int N = 0; N < 320; ++N) { + __local int local4; + local4 = 100000; + for (int i = 0; i < 16; ++i) { + __local int local5; + local5 = 0; + for (int i1 = 0; i1 < 32; ++i1) { + local5 = ((int)(((long)local5) + ((long)(((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))]))) * ((long)((long)(placeholder4[(i1 + (N * 32))] - placeholder5[(i1 + (i * 32))]))))))); + } + if (local5 < local4) { + local4 = local5; + compute6[N] = i; + } + } + } + __local int compute7[16]; + for (int x1 = 0; x1 < 16; ++x1) { + compute7[x1] = 0; + } + __local int compute8[512]; + for (int x2 = 0; x2 < 16; ++x2) { + for (int y = 0; y < 32; ++y) { + compute8[(y + (x2 * 32))] = 0; + } + } + __local int calc_sum; + + for (int n = 0; n < 320; ++n) { + compute7[compute6[n]] = (compute7[compute6[n]] + 1); + for (int i2 = 0; i2 < 32; ++i2) { + compute8[(i2 + (compute6[n] * 32))] = ((int)(((long)compute8[(i2 + (compute6[n] * 32))]) + ((long)placeholder4[(i2 + (n * 32))]))); + } + } + __local int update_mean; + + for (int k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + placeholder5[k_d_fused] = (compute8[k_d_fused] / compute7[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/kmeans/submit.sh b/samples/kmeans/submit.sh new file mode 100644 index 000000000..a4345a542 --- /dev/null +++ b/samples/kmeans/submit.sh @@ -0,0 +1,3 @@ +unset DISPLAY +aoc -board=a10gx -time time.out -time-passes -regtest_mode -v -fpc -fp-relaxed --opt-arg -nocaching -report -I $INTELFPGAOCLSDKROOT/include/kernel_headers kmeans_aocl.cl + diff --git a/samples/kmeans/vhls_code.cl b/samples/kmeans/vhls_code.cl new file mode 100644 index 000000000..b651dd8bf --- /dev/null +++ b/samples/kmeans/vhls_code.cl @@ -0,0 +1,52 @@ +#include <ap_int.h> +#include <ap_fixed.h> +#include <math.h> + +void 
default_function(ap_int<32> placeholder6[320][32], ap_int<32> placeholder7[16][32], ap_int<32> compute9[320]) { + for (ap_int<32> x = 0; x < 320; ++x) { + compute9[x] = 0; + } + ap_int<32> main_loop; + for (ap_int<32> _ = 0; _ < 200; ++_) { + for (ap_int<32> N = 0; N < 320; ++N) { + #pragma HLS pipeline + ap_int<32> local6; + local6 = 100000; + for (ap_int<32> i = 0; i < 16; ++i) { + ap_int<32> local7; + local7 = 0; + for (ap_int<32> i1 = 0; i1 < 32; ++i1) { + local7 = ((ap_int<32>)(((ap_int<67>)local7) + ((ap_int<67>)(((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1]))) * ((ap_int<66>)((ap_int<33>)(placeholder6[N][i1] - placeholder7[i][i1]))))))); + } + if (local7 < local6) { + local6 = local7; + compute9[N] = i; + } + } + } + ap_int<32> compute10[16]; + for (ap_int<32> x1 = 0; x1 < 16; ++x1) { + compute10[x1] = 0; + } + ap_int<32> compute11[16][32]; + for (ap_int<32> x2 = 0; x2 < 16; ++x2) { + for (ap_int<32> y = 0; y < 32; ++y) { + compute11[x2][y] = 0; + } + } + ap_int<32> calc_sum; + for (ap_int<32> n = 0; n < 320; ++n) { + #pragma HLS unroll + compute10[compute9[n]] = (compute10[compute9[n]] + 1); + for (ap_int<32> i2 = 0; i2 < 32; ++i2) { + compute11[compute9[n]][i2] = ((ap_int<32>)(((ap_int<33>)compute11[compute9[n]][i2]) + ((ap_int<33>)placeholder6[n][i2]))); + } + } + ap_int<32> update_mean; + for (ap_int<32> k_d_fused = 0; k_d_fused < 512; ++k_d_fused) { + #pragma HLS unroll + placeholder7[(k_d_fused / 32)][(k_d_fused % 32)] = (compute11[(k_d_fused / 32)][(k_d_fused % 32)] / compute10[(k_d_fused / 32)]); + } + } +} + diff --git a/samples/lenet/common/common.mk b/samples/lenet/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/lenet/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int -f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . 
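+# Example invocations, assuming XILINX_SDX is set and this file is included
+# from the sample's sdaccel.mk (a usage sketch, not generated output):
+#   make -f sdaccel.mk run_cpu_em   # software emulation
+#   make -f sdaccel.mk run_hw_em    # hardware emulation
+#   make -f sdaccel.mk run_hw       # build the bitstream and run on board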
+${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/lenet/lenet_aocl.cl b/samples/lenet/lenet_aocl.cl new file mode 100644 index 000000000..9b2a200f8 --- /dev/null +++ b/samples/lenet/lenet_aocl.cl @@ -0,0 +1,138 @@ +#include "ihc_apint.h" +__kernel void default_function(__global float* restrict input_image, __global float* restrict weight_conv1, __global float* restrict weight_conv2, __global float* restrict weight_fc1, __global float* restrict weight_fc2, __global float* restrict lenet) { + float conv2d; + for (int nn = 0; nn < 1; ++nn) { + for (int yy = 0; yy < -1; ++yy) { + for (int xx = 0; xx < -1; ++xx) { + float reducer0; + reducer0 = 0.000000e+00f; + for (int ra1 = 0; ra1 < 5; ++ra1) { + for (int ra2 = 0; ra2 < 5; ++ra2) { + reducer0 = ((input_image[(((xx + ra2) + ((yy + ra1) * 3)) + (nn * 9))] * weight_conv1[(ra2 + (ra1 * 5))]) + reducer0); + } + } + conv2d = reducer0; + } + } + } + float tanh1; + for (int args = 0; args < 1; ++args) { + for (int args1 = 0; args1 < -1; ++args1) { + for (int args2 = 0; args2 < -1; ++args2) { + tanh1 = ((float)tanh(((float)conv2d))); + } + } + } + float max_pool; + for (int i = 0; i < 1; ++i) { + for (int h = 0; h < -1; ++h) { + for (int w = 0; w < -1; ++w) { + float reducer1; + reducer1 = -1.000000e+00f; + for (int ra3 = 0; ra3 < 2; ++ra3) { + for (int ra4 = 0; ra4 < 2; ++ra4) { + reducer1 = max(tanh1, reducer1); + } + } + max_pool = reducer1; + } + } + } + float conv2d1[250]; + for (int nn1 = 0; nn1 < 1; ++nn1) { + for (int ff = 0; ff < 10; ++ff) { + for (int yy1 = 0; yy1 < -5; ++yy1) { + for (int xx1 = 0; xx1 < -5; ++xx1) { + float reducer2; + reducer2 = 0.000000e+00f; + for (int ra6 = 0; ra6 < 5; ++ra6) { + for (int ra7 = 0; ra7 < 5; ++ra7) { + reducer2 = ((max_pool * weight_conv2[((ra7 + (ra6 * 5)) + (ff * 25))]) + reducer2); + } + } + conv2d1[(((xx1 - (yy1 * 5)) + (ff * 25)) + (nn1 * 250))] = reducer2; + } + } + } + } + float tanh2[250]; + for (int args3 = 0; args3 < 1; ++args3) { + for (int args0 = 0; args0 < 10; ++args0) { + for (int args11 = 0; args11 < -5; ++args11) { + for (int args21 = 0; args21 < -5; ++args21) { + tanh2[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))] = ((float)tanh(((float)conv2d1[(((args21 - (args11 * 5)) + (args0 * 25)) + (args3 * 250))]))); + } + } + } + } + float max_pool1[90]; + for (int i1 = 0; i1 < 1; ++i1) { + for (int c = 0; c < 10; ++c) { + for (int h1 = 0; h1 < -3; ++h1) { + for (int w1 = 0; w1 < -3; ++w1) { + float reducer3; + reducer3 = -1.000000e+00f; + for (int ra8 = 0; ra8 < 2; ++ra8) { + for (int ra9 = 0; ra9 < 2; ++ra9) { + reducer3 = max(tanh2[(((((w1 * 2) - (((h1 * 2) + ra8) * 5)) + ra9) + (c * 25)) + (i1 * 250))], reducer3); + } + } + max_pool1[(((w1 - (h1 * 3)) + (c * 9)) + (i1 * 90))] = reducer3; + } + } + } + } + float compute0[90]; + for (int i2 = 0; i2 < 1; ++i2) { + for (int j = 0; j < 90; ++j) { + compute0[(j + (i2 * 90))] = max_pool1[((((j % -3) - (((j / -3) % -3) * 3)) + ((((j / -3) / -3) % 10) * 9)) + (i2 * 90))]; + } + } + float dense[25]; + for (int i3 = 0; i3 < 1; ++i3) { + for (int j1 = 0; j1 < 25; ++j1) { + float reducer4; + reducer4 = 0.000000e+00f; + for (int ra10 = 0; ra10 < 90; ++ra10) { + reducer4 = ((compute0[(ra10 + (i3 * 90))] * 
weight_fc1[(ra10 + (j1 * 40))]) + reducer4); + } + dense[(j1 + (i3 * 25))] = reducer4; + } + } + float tanh3[25]; + for (int args4 = 0; args4 < 1; ++args4) { + for (int args01 = 0; args01 < 25; ++args01) { + tanh3[(args01 + (args4 * 25))] = ((float)tanh(((float)dense[(args01 + (args4 * 25))]))); + } + } + float dense1[10]; + for (int i4 = 0; i4 < 1; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + float reducer5; + reducer5 = 0.000000e+00f; + for (int ra11 = 0; ra11 < 25; ++ra11) { + reducer5 = ((tanh3[(ra11 + (i4 * 25))] * weight_fc2[(ra11 + (j2 * 25))]) + reducer5); + } + dense1[(j2 + (i4 * 10))] = reducer5; + } + } + float compute1; + int max1; + max1 = 0; + for (int ra12 = 0; ra12 < 10; ++ra12) { + max1 = ((int)max(dense1[ra12], ((float)max1))); + } + compute1 = ((float)max1); + float compute2; + int sum; + sum = 0; + for (int ra13 = 0; ra13 < 10; ++ra13) { + sum = ((int)(exp(((float)(dense1[ra13] - compute1))) + ((float)sum))); + } + compute2 = ((float)sum); + float update0; + for (int j3 = 0; j3 < 10; ++j3) { + lenet[j3] = ((float)(exp(((float)(dense1[j3] - compute1))) / ((float)compute2))); + } +} + diff --git a/samples/lenet/lenet_main_withoutq.py b/samples/lenet/lenet_main_withoutq.py new file mode 100644 index 000000000..b16bdd6c3 --- /dev/null +++ b/samples/lenet/lenet_main_withoutq.py @@ -0,0 +1,125 @@ +import heterocl as hcl +import hlib +import numpy as np + +hcl.init() + +def softmax(out, x): + assert len(x.shape) == 2, "only support 2-dim softmax" + m, n = x.shape + k = hcl.reduce_axis(0, n) + max_elem = hcl.compute((m,), lambda i: hcl.max(x[i, k], axis=k)) + k = hcl.reduce_axis(0, n) + expsum = hcl.compute((m,), + lambda i: hcl.sum(hcl.exp(x[i, k] - max_elem[i]), axis=k)) + return hcl.update(out, + lambda i, j: hcl.exp(x[i, j] - max_elem[i]) / expsum[i]) + +def build_lenet(input_image, weight_conv1, weight_conv2, + weight_fc1, weight_fc2, lenet): + # first conv + conv1 = hlib.nn.conv2d_nchw(input_image, weight_conv1) + tanh1 = hlib.nn.tanh(conv1, "tanh1") + pool1 = hlib.nn.max_pool(tanh1, kernel=(2,2), stride=(2,2)) + # second conv + conv2 = hlib.nn.conv2d_nchw(pool1, weight_conv2) + tanh2 = hlib.nn.tanh(conv2, "tanh2") + pool2 = hlib.nn.max_pool(tanh2, kernel=(2,2), stride=(2,2)) + # first fc + flat = hlib.nn.flatten(pool2) + fc1 = hlib.nn.dense(flat, weight_fc1) + tanh3 = hlib.nn.tanh(fc1, "tanh3") + # second fc + fc2 = hlib.nn.dense(tanh3, weight_fc2) + # loss + return softmax(lenet, fc2) + + +import mxnet as mx +# download pretrained lenet model +mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-0010.params') +mx.gluon.utils.download('https://gist.githubusercontent.com/Huyuwei/dc00ce83f537914c64a204133d23b019/raw/79af41e7c8ba9120ea7f35fb1d0484b65bccd54f/lenet-symbol.json') +sym, arg_params, aux_params = mx.model.load_checkpoint('lenet', 10) +# get weights +weight_conv1_np = arg_params['convolution0_weight'].asnumpy() +weight_conv2_np = arg_params['convolution1_weight'].asnumpy() +weight_fc1_np = arg_params['fullyconnected0_weight'].asnumpy() +weight_fc2_np = arg_params['fullyconnected1_weight'].asnumpy() + + +# qtype1 = hcl.Fixed(16, 14) +# qtype2 = hcl.Fixed(16, 14) + +# qtype1 = hcl.Fixed(16, 12) +# qtype2 = hcl.Fixed(16, 12) + + + +correct_sum = 0 +batch_size = 1000 +mnist = mx.test_utils.get_mnist() + + +def build_lenet_inf(batch_size=batch_size, target=None): + # set up input/output placeholders + input_image = hcl.placeholder((batch_size, 1, 28, 28), "input_image") 
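+    # The commented-out placeholders below are the fixed-point variants that
+    # use qtype1 (see the commented hcl.Fixed definitions above); this
+    # "withoutq" script keeps all tensors in the default data type.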
+ # weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1", qtype1) + # weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2", qtype1) + # weight_fc1 = hcl.placeholder((500, 800), "weight_fc1", qtype1) + # weight_fc2 = hcl.placeholder((10, 500), "weight_fc2", qtype1) + weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1") + weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2") + weight_fc1 = hcl.placeholder((500, 800), "weight_fc1") + weight_fc2 = hcl.placeholder((10, 500), "weight_fc2") + lenet = hcl.placeholder((batch_size, 10), "lenet") + # create a quantization scheme + # scheme = hcl.create_scheme( + # [input_image, weight_conv1, weight_conv2, + # weight_fc1, weight_fc2, lenet], build_lenet) + # # quantize the three activation layers + # scheme.quantize( + # [build_lenet.tanh1, build_lenet.tanh2, build_lenet.tanh3], qtype2) + # s = hcl.create_schedule_from_scheme(scheme) + s = hcl.create_schedule([input_image, weight_conv1, weight_conv2, weight_fc1, weight_fc2, lenet], build_lenet) + return hcl.build(s, target=target) + +code1 = build_lenet_inf(batch_size, 'merlinc') +# print (code1) +with open('merlinc_code.cl', 'w') as f: + f.write(code1) + +code2 = build_lenet_inf(batch_size, 'sdaccel') + +with open('sdaccel_code.cl', 'w') as f: + f.write(code2) + +code3 = build_lenet_inf(batch_size, 'vhls') +with open('vhls_code.cl', 'w') as f: + f.write(code3) + +f = build_lenet_inf(batch_size, 'sdaccel_sw_emu') + +# weight_conv1_hcl = hcl.asarray(weight_conv1_np, dtype=qtype1) +# weight_conv2_hcl = hcl.asarray(weight_conv2_np, dtype=qtype1) +# weight_fc1_hcl = hcl.asarray(weight_fc1_np, dtype=qtype1) +# weight_fc2_hcl = hcl.asarray(weight_fc2_np, dtype=qtype1) + +weight_conv1_hcl = hcl.asarray(weight_conv1_np) +weight_conv2_hcl = hcl.asarray(weight_conv2_np) +weight_fc1_hcl = hcl.asarray(weight_fc1_np) +weight_fc2_hcl = hcl.asarray(weight_fc2_np) + + +for i in range(10000 // batch_size): + label = mnist['test_label'][i*batch_size:(i+1)*batch_size] + input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size] + input_image_hcl = hcl.asarray(input_image_np) + output_hcl = hcl.asarray(np.zeros((batch_size,10))) + f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl, + weight_fc1_hcl, weight_fc2_hcl, output_hcl) + print (output_hcl.asnumpy()) + prediction = np.argmax(output_hcl.asnumpy(), axis=1) + correct_sum += np.sum(np.equal(prediction, label)) + +print("Testing accuracy: {}".format(correct_sum / 10000.)) + diff --git a/samples/lenet/lenet_sdaccel.py b/samples/lenet/lenet_sdaccel.py new file mode 100644 index 000000000..917b2b625 --- /dev/null +++ b/samples/lenet/lenet_sdaccel.py @@ -0,0 +1,23 @@ +import heterocl as hcl +import numpy as np +from lenet_main import * + +batch_size = 50 + +# f = build_lenet_inf(batch_size, 'vhls_csim') +f = build_lenet_inf(batch_size, 'sdaccel_sw_emu') + +mnist = mx.test_utils.get_mnist() +correct_sum = 0 + +for i in range(50 // batch_size): + label = mnist['test_label'][i*batch_size:(i+1)*batch_size] + input_image_np = mnist['test_data'][i*batch_size:(i+1)*batch_size] + input_image_hcl = hcl.asarray(input_image_np) + output_hcl = hcl.asarray(np.zeros((batch_size,10))) + f(input_image_hcl, weight_conv1_hcl, weight_conv2_hcl, weight_fc1_hcl, weight_fc2_hcl, output_hcl) + prediction = np.argmax(output_hcl.asnumpy(), axis=1) + correct_sum += np.sum(np.equal(prediction, label)) + +print(str(qtype1) + ", " + str(qtype2) + ": Accuracy over 10000 test images is: {}".format(correct_sum / 10000.)) +assert correct_sum == 9882 diff --git 
a/samples/lenet/merlinc_code.cl b/samples/lenet/merlinc_code.cl new file mode 100644 index 000000000..1c5118707 --- /dev/null +++ b/samples/lenet/merlinc_code.cl @@ -0,0 +1,155 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(int* input_image, int* weight_conv1, int* weight_conv2, int* weight_fc1, int* weight_fc2, int* lenet) { + int conv2d[11520000]; + for (int nn = 0; nn < 1000; ++nn) { + for (int ff = 0; ff < 20; ++ff) { + for (int yy = 0; yy < 24; ++yy) { + for (int xx = 0; xx < 24; ++xx) { + float reducer0; + reducer0 = 0.000000e+00f; + for (int ra1 = 0; ra1 < 5; ++ra1) { + for (int ra2 = 0; ra2 < 5; ++ra2) { + reducer0 = (((float)(((long)input_image[(((xx + ra2) + ((yy + ra1) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra2 + (ra1 * 5)) + (ff * 25))]))) + reducer0); + } + } + conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer0); + } + } + } + } + int tanh1[11520000]; + for (int args = 0; args < 1000; ++args) { + for (int args0 = 0; args0 < 20; ++args0) { + for (int args1 = 0; args1 < 24; ++args1) { + for (int args2 = 0; args2 < 24; ++args2) { + tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))]))); + } + } + } + } + int max_pool[2880000]; + for (int i = 0; i < 1000; ++i) { + for (int c = 0; c < 20; ++c) { + for (int h = 0; h < 12; ++h) { + for (int w = 0; w < 12; ++w) { + float reducer1; + reducer1 = -1.000000e+00f; + for (int ra3 = 0; ra3 < 2; ++ra3) { + for (int ra4 = 0; ra4 < 2; ++ra4) { + reducer1 = max(((float)tanh1[(((((w * 2) + ra4) + (((h * 2) + ra3) * 24)) + (c * 576)) + (i * 11520))]), reducer1); + } + } + max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer1); + } + } + } + } + int conv2d1[3200000]; + for (int nn1 = 0; nn1 < 1000; ++nn1) { + for (int ff1 = 0; ff1 < 50; ++ff1) { + for (int yy1 = 0; yy1 < 8; ++yy1) { + for (int xx1 = 0; xx1 < 8; ++xx1) { + float reducer2; + reducer2 = 0.000000e+00f; + for (int ra5 = 0; ra5 < 20; ++ra5) { + for (int ra6 = 0; ra6 < 5; ++ra6) { + for (int ra7 = 0; ra7 < 5; ++ra7) { + reducer2 = (((float)(((long)max_pool[((((xx1 + ra7) + ((yy1 + ra6) * 12)) + (ra5 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra7 + (ra6 * 5)) + (ra5 * 25)) + (ff1 * 500))]))) + reducer2); + } + } + } + conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer2); + } + } + } + } + int tanh2[3200000]; + for (int args3 = 0; args3 < 1000; ++args3) { + for (int args01 = 0; args01 < 50; ++args01) { + for (int args11 = 0; args11 < 8; ++args11) { + for (int args21 = 0; args21 < 8; ++args21) { + tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))]))); + } + } + } + } + int max_pool1[800000]; + for (int i1 = 0; i1 < 1000; ++i1) { + for (int c1 = 0; c1 < 50; ++c1) { + for (int h1 = 0; h1 < 4; ++h1) { + for (int w1 = 0; w1 < 4; ++w1) { + float reducer3; + reducer3 = -1.000000e+00f; + for (int ra8 = 0; ra8 < 2; ++ra8) { + for (int ra9 = 0; ra9 < 2; ++ra9) { + reducer3 = max(((float)tanh2[(((((w1 * 2) + ra9) + (((h1 * 2) + ra8) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer3); + } + } + max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer3); + } + } + } + } + int compute0[800000]; + for (int i2 = 0; i2 < 1000; ++i2) { + for (int j = 0; j < 800; ++j) { + compute0[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))]; + 
} + } + int dense[500000]; + for (int i3 = 0; i3 < 1000; ++i3) { + for (int j1 = 0; j1 < 500; ++j1) { + float reducer4; + reducer4 = 0.000000e+00f; + for (int ra10 = 0; ra10 < 800; ++ra10) { + reducer4 = (((float)(((long)compute0[(ra10 + (i3 * 800))]) * ((long)weight_fc1[(ra10 + (j1 * 800))]))) + reducer4); + } + dense[(j1 + (i3 * 500))] = ((int)reducer4); + } + } + int tanh3[500000]; + for (int args4 = 0; args4 < 1000; ++args4) { + for (int args02 = 0; args02 < 500; ++args02) { + tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))]))); + } + } + int dense1[10000]; + for (int i4 = 0; i4 < 1000; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + float reducer5; + reducer5 = 0.000000e+00f; + for (int ra11 = 0; ra11 < 500; ++ra11) { + reducer5 = (((float)(((long)tanh3[(ra11 + (i4 * 500))]) * ((long)weight_fc2[(ra11 + (j2 * 500))]))) + reducer5); + } + dense1[(j2 + (i4 * 10))] = ((int)reducer5); + } + } + int compute1[1000]; + for (int i5 = 0; i5 < 1000; ++i5) { + int max; + max = 0; + for (int ra12 = 0; ra12 < 10; ++ra12) { + max = max(dense1[(ra12 + (i5 * 10))], max); + } + compute1[i5] = max; + } + int compute2[1000]; + for (int i6 = 0; i6 < 1000; ++i6) { + int sum; + sum = 0; + for (int ra13 = 0; ra13 < 10; ++ra13) { + sum = ((int)(exp(((double)((long)(dense1[(ra13 + (i6 * 10))] - compute1[i6])))) + ((double)sum))); + } + compute2[i6] = sum; + } + int update0; + for (int i7 = 0; i7 < 1000; ++i7) { + for (int j3 = 0; j3 < 10; ++j3) { + lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute1[i7])))) / ((double)compute2[i7]))); + } + } +} + diff --git a/samples/lenet/sdaccel.mk b/samples/lenet/sdaccel.mk new file mode 100644 index 000000000..ce266d89e --- /dev/null +++ b/samples/lenet/sdaccel.mk @@ -0,0 +1,32 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=. 
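+# SDA_FLOW and XDEVICE can be overridden on the make command line, e.g.
+# "make -f sdaccel.mk SDA_FLOW=hw_emu xbin" builds the hw_emu xclbin.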
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/lenet/sdaccel_code.cl b/samples/lenet/sdaccel_code.cl new file mode 100644 index 000000000..114880df0 --- /dev/null +++ b/samples/lenet/sdaccel_code.cl @@ -0,0 +1,151 @@ +__kernel void default_function(__global int* input_image, __global int* weight_conv1, __global int* weight_conv2, __global int* weight_fc1, __global int* weight_fc2, __global int* lenet) { + __local int conv2d[11520000]; + for (int nn = 0; nn < 1000; ++nn) { + for (int ff = 0; ff < 20; ++ff) { + for (int yy = 0; yy < 24; ++yy) { + for (int xx = 0; xx < 24; ++xx) { + __local float reducer6; + reducer6 = 0.000000e+00f; + for (int ra15 = 0; ra15 < 5; ++ra15) { + for (int ra16 = 0; ra16 < 5; ++ra16) { + reducer6 = (((float)(((long)input_image[(((xx + ra16) + ((yy + ra15) * 28)) + (nn * 784))]) * ((long)weight_conv1[((ra16 + (ra15 * 5)) + (ff * 25))]))) + reducer6); + } + } + conv2d[(((xx + (yy * 24)) + (ff * 576)) + (nn * 11520))] = ((int)reducer6); + } + } + } + } + __local int tanh1[11520000]; + for (int args = 0; args < 1000; ++args) { + for (int args0 = 0; args0 < 20; ++args0) { + for (int args1 = 0; args1 < 24; ++args1) { + for (int args2 = 0; args2 < 24; ++args2) { + tanh1[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))] = ((int)tanh(((double)conv2d[(((args2 + (args1 * 24)) + (args0 * 576)) + (args * 11520))]))); + } + } + } + } + __local int max_pool[2880000]; + for (int i = 0; i < 1000; ++i) { + for (int c = 0; c < 20; ++c) { + for (int h = 0; h < 12; ++h) { + for (int w = 0; w < 12; ++w) { + __local float reducer7; + reducer7 = -1.000000e+00f; + for (int ra17 = 0; ra17 < 2; ++ra17) { + for (int ra18 = 0; ra18 < 2; ++ra18) { + reducer7 = max(((float)tanh1[(((((w * 2) + ra18) + (((h * 2) + ra17) * 24)) + (c * 576)) + (i * 11520))]), reducer7); + } + } + max_pool[(((w + (h * 12)) + (c * 144)) + (i * 2880))] = ((int)reducer7); + } + } + } + } + __local int conv2d1[3200000]; + for (int nn1 = 0; nn1 < 1000; ++nn1) { + for (int ff1 = 0; ff1 < 50; ++ff1) { + for (int yy1 = 0; yy1 < 8; ++yy1) { + for (int xx1 = 0; xx1 < 8; ++xx1) { + __local float reducer8; + reducer8 = 0.000000e+00f; + for (int ra19 = 0; ra19 < 20; ++ra19) { + for (int ra20 = 0; ra20 < 5; ++ra20) { + for (int ra21 = 0; ra21 < 5; ++ra21) { + reducer8 = (((float)(((long)max_pool[((((xx1 + ra21) + ((yy1 + ra20) * 12)) + (ra19 * 144)) + (nn1 * 2880))]) * ((long)weight_conv2[(((ra21 + (ra20 * 5)) + (ra19 * 25)) + (ff1 * 500))]))) + reducer8); + } + } + } + conv2d1[(((xx1 + (yy1 * 8)) + (ff1 * 64)) + (nn1 * 3200))] = ((int)reducer8); + } + } + } + } + __local int tanh2[3200000]; + for (int args3 = 0; args3 < 1000; ++args3) { + for (int args01 = 0; args01 < 50; ++args01) { + for (int args11 = 0; args11 < 8; ++args11) { + for (int args21 = 0; args21 < 8; ++args21) { + tanh2[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 
3200))] = ((int)tanh(((double)conv2d1[(((args21 + (args11 * 8)) + (args01 * 64)) + (args3 * 3200))]))); + } + } + } + } + __local int max_pool1[800000]; + for (int i1 = 0; i1 < 1000; ++i1) { + for (int c1 = 0; c1 < 50; ++c1) { + for (int h1 = 0; h1 < 4; ++h1) { + for (int w1 = 0; w1 < 4; ++w1) { + __local float reducer9; + reducer9 = -1.000000e+00f; + for (int ra22 = 0; ra22 < 2; ++ra22) { + for (int ra23 = 0; ra23 < 2; ++ra23) { + reducer9 = max(((float)tanh2[(((((w1 * 2) + ra23) + (((h1 * 2) + ra22) * 8)) + (c1 * 64)) + (i1 * 3200))]), reducer9); + } + } + max_pool1[(((w1 + (h1 * 4)) + (c1 * 16)) + (i1 * 800))] = ((int)reducer9); + } + } + } + } + __local int compute3[800000]; + for (int i2 = 0; i2 < 1000; ++i2) { + for (int j = 0; j < 800; ++j) { + compute3[(j + (i2 * 800))] = max_pool1[((((((j / 4) % 4) * 4) + (j % 4)) + ((j / 16) * 16)) + (i2 * 800))]; + } + } + __local int dense[500000]; + for (int i3 = 0; i3 < 1000; ++i3) { + for (int j1 = 0; j1 < 500; ++j1) { + __local float reducer10; + reducer10 = 0.000000e+00f; + for (int ra24 = 0; ra24 < 800; ++ra24) { + reducer10 = (((float)(((long)compute3[(ra24 + (i3 * 800))]) * ((long)weight_fc1[(ra24 + (j1 * 800))]))) + reducer10); + } + dense[(j1 + (i3 * 500))] = ((int)reducer10); + } + } + __local int tanh3[500000]; + for (int args4 = 0; args4 < 1000; ++args4) { + for (int args02 = 0; args02 < 500; ++args02) { + tanh3[(args02 + (args4 * 500))] = ((int)tanh(((double)dense[(args02 + (args4 * 500))]))); + } + } + __local int dense1[10000]; + for (int i4 = 0; i4 < 1000; ++i4) { + for (int j2 = 0; j2 < 10; ++j2) { + __local float reducer11; + reducer11 = 0.000000e+00f; + for (int ra25 = 0; ra25 < 500; ++ra25) { + reducer11 = (((float)(((long)tanh3[(ra25 + (i4 * 500))]) * ((long)weight_fc2[(ra25 + (j2 * 500))]))) + reducer11); + } + dense1[(j2 + (i4 * 10))] = ((int)reducer11); + } + } + __local int compute4[1000]; + for (int i5 = 0; i5 < 1000; ++i5) { + __local int max; + max = 0; + for (int ra26 = 0; ra26 < 10; ++ra26) { + max = max(dense1[(ra26 + (i5 * 10))], max); + } + compute4[i5] = max; + } + __local int compute5[1000]; + for (int i6 = 0; i6 < 1000; ++i6) { + __local int sum; + sum = 0; + for (int ra27 = 0; ra27 < 10; ++ra27) { + sum = ((int)(exp(((double)((long)(dense1[(ra27 + (i6 * 10))] - compute4[i6])))) + ((double)sum))); + } + compute5[i6] = sum; + } + __local int update1; + for (int i7 = 0; i7 < 1000; ++i7) { + for (int j3 = 0; j3 < 10; ++j3) { + lenet[(j3 + (i7 * 10))] = ((int)(exp(((double)((long)(dense1[(j3 + (i7 * 10))] - compute4[i7])))) / ((double)compute5[i7]))); + } + } +} + diff --git a/samples/lenet/vhls_code.cl b/samples/lenet/vhls_code.cl new file mode 100644 index 000000000..3d85466b4 --- /dev/null +++ b/samples/lenet/vhls_code.cl @@ -0,0 +1,155 @@ +#include <ap_int.h> +#include <ap_fixed.h> +#include <math.h> + +void default_function(ap_int<32> input_image[1000][1][28][28], ap_int<32> weight_conv1[20][1][5][5], ap_int<32> weight_conv2[50][20][5][5], ap_int<32> weight_fc1[500][800], ap_int<32> weight_fc2[10][500], ap_int<32> lenet[1000][10]) { + ap_int<32> conv2d[1000][20][24][24]; + for (ap_int<32> nn = 0; nn < 1000; ++nn) { + for (ap_int<32> ff = 0; ff < 20; ++ff) { + for (ap_int<32> yy = 0; yy < 24; ++yy) { + for (ap_int<32> xx = 0; xx < 24; ++xx) { + float reducer12; + reducer12 = 0.000000e+00f; + for (ap_int<32> ra29 = 0; ra29 < 5; ++ra29) { + for (ap_int<32> ra30 = 0; ra30 < 5; ++ra30) { + reducer12 = (((float)(((ap_int<64>)input_image[nn][0][(yy + ra29)][(xx + ra30)]) * ((ap_int<64>)weight_conv1[ff][0][ra29][ra30]))) + reducer12); + } + } + 
conv2d[nn][ff][yy][xx] = ((ap_int<32>)reducer12); + } + } + } + } + ap_int<32> tanh1[1000][20][24][24]; + for (ap_int<32> args = 0; args < 1000; ++args) { + for (ap_int<32> args0 = 0; args0 < 20; ++args0) { + for (ap_int<32> args1 = 0; args1 < 24; ++args1) { + for (ap_int<32> args2 = 0; args2 < 24; ++args2) { + tanh1[args][args0][args1][args2] = ((ap_int<32>)tanh(((double)conv2d[args][args0][args1][args2]))); + } + } + } + } + ap_int<32> max_pool[1000][20][12][12]; + for (ap_int<32> i = 0; i < 1000; ++i) { + for (ap_int<32> c = 0; c < 20; ++c) { + for (ap_int<32> h = 0; h < 12; ++h) { + for (ap_int<32> w = 0; w < 12; ++w) { + float reducer13; + reducer13 = -1.000000e+00f; + for (ap_int<32> ra31 = 0; ra31 < 2; ++ra31) { + for (ap_int<32> ra32 = 0; ra32 < 2; ++ra32) { + reducer13 = std::max(((float)tanh1[i][c][((h * 2) + ra31)][((w * 2) + ra32)]), reducer13); + } + } + max_pool[i][c][h][w] = ((ap_int<32>)reducer13); + } + } + } + } + ap_int<32> conv2d1[1000][50][8][8]; + for (ap_int<32> nn1 = 0; nn1 < 1000; ++nn1) { + for (ap_int<32> ff1 = 0; ff1 < 50; ++ff1) { + for (ap_int<32> yy1 = 0; yy1 < 8; ++yy1) { + for (ap_int<32> xx1 = 0; xx1 < 8; ++xx1) { + float reducer14; + reducer14 = 0.000000e+00f; + for (ap_int<32> ra33 = 0; ra33 < 20; ++ra33) { + for (ap_int<32> ra34 = 0; ra34 < 5; ++ra34) { + for (ap_int<32> ra35 = 0; ra35 < 5; ++ra35) { + reducer14 = (((float)(((ap_int<64>)max_pool[nn1][ra33][(yy1 + ra34)][(xx1 + ra35)]) * ((ap_int<64>)weight_conv2[ff1][ra33][ra34][ra35]))) + reducer14); + } + } + } + conv2d1[nn1][ff1][yy1][xx1] = ((ap_int<32>)reducer14); + } + } + } + } + ap_int<32> tanh2[1000][50][8][8]; + for (ap_int<32> args3 = 0; args3 < 1000; ++args3) { + for (ap_int<32> args01 = 0; args01 < 50; ++args01) { + for (ap_int<32> args11 = 0; args11 < 8; ++args11) { + for (ap_int<32> args21 = 0; args21 < 8; ++args21) { + tanh2[args3][args01][args11][args21] = ((ap_int<32>)tanh(((double)conv2d1[args3][args01][args11][args21]))); + } + } + } + } + ap_int<32> max_pool1[1000][50][4][4]; + for (ap_int<32> i1 = 0; i1 < 1000; ++i1) { + for (ap_int<32> c1 = 0; c1 < 50; ++c1) { + for (ap_int<32> h1 = 0; h1 < 4; ++h1) { + for (ap_int<32> w1 = 0; w1 < 4; ++w1) { + float reducer15; + reducer15 = -1.000000e+00f; + for (ap_int<32> ra36 = 0; ra36 < 2; ++ra36) { + for (ap_int<32> ra37 = 0; ra37 < 2; ++ra37) { + reducer15 = std::max(((float)tanh2[i1][c1][((h1 * 2) + ra36)][((w1 * 2) + ra37)]), reducer15); + } + } + max_pool1[i1][c1][h1][w1] = ((ap_int<32>)reducer15); + } + } + } + } + ap_int<32> compute6[1000][800]; + for (ap_int<32> i2 = 0; i2 < 1000; ++i2) { + for (ap_int<32> j = 0; j < 800; ++j) { + compute6[i2][j] = max_pool1[i2][(j / 16)][((j / 4) % 4)][(j % 4)]; + } + } + ap_int<32> dense[1000][500]; + for (ap_int<32> i3 = 0; i3 < 1000; ++i3) { + for (ap_int<32> j1 = 0; j1 < 500; ++j1) { + float reducer16; + reducer16 = 0.000000e+00f; + for (ap_int<32> ra38 = 0; ra38 < 800; ++ra38) { + reducer16 = (((float)(((ap_int<64>)compute6[i3][ra38]) * ((ap_int<64>)weight_fc1[j1][ra38]))) + reducer16); + } + dense[i3][j1] = ((ap_int<32>)reducer16); + } + } + ap_int<32> tanh3[1000][500]; + for (ap_int<32> args4 = 0; args4 < 1000; ++args4) { + for (ap_int<32> args02 = 0; args02 < 500; ++args02) { + tanh3[args4][args02] = ((ap_int<32>)tanh(((double)dense[args4][args02]))); + } + } + ap_int<32> dense1[1000][10]; + for (ap_int<32> i4 = 0; i4 < 1000; ++i4) { + for (ap_int<32> j2 = 0; j2 < 10; ++j2) { + float reducer17; + reducer17 = 0.000000e+00f; + for (ap_int<32> ra39 = 0; ra39 < 500; ++ra39) { + reducer17 = 
(((float)(((ap_int<64>)tanh3[i4][ra39]) * ((ap_int<64>)weight_fc2[j2][ra39]))) + reducer17); + } + dense1[i4][j2] = ((ap_int<32>)reducer17); + } + } + ap_int<32> compute7[1000]; + for (ap_int<32> i5 = 0; i5 < 1000; ++i5) { + ap_int<32> max; + max = 0; + for (ap_int<32> ra40 = 0; ra40 < 10; ++ra40) { + max = std::max(dense1[i5][ra40], max); + } + compute7[i5] = max; + } + ap_int<32> compute8[1000]; + for (ap_int<32> i6 = 0; i6 < 1000; ++i6) { + ap_int<32> sum; + sum = 0; + for (ap_int<32> ra41 = 0; ra41 < 10; ++ra41) { + sum = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i6][ra41] - compute7[i6])))) + ((double)sum))); + } + compute8[i6] = sum; + } + ap_int<32> update2; + for (ap_int<32> i7 = 0; i7 < 1000; ++i7) { + for (ap_int<32> j3 = 0; j3 < 10; ++j3) { + lenet[i7][j3] = ((ap_int<32>)(exp(((double)((ap_int<33>)(dense1[i7][j3] - compute7[i7])))) / ((double)compute8[i7]))); + } + } +} + diff --git a/samples/smith_waterman/common/common.mk b/samples/smith_waterman/common/common.mk new file mode 100644 index 000000000..3409e4aa5 --- /dev/null +++ b/samples/smith_waterman/common/common.mk @@ -0,0 +1,55 @@ +SHELL = /bin/bash +VPATH = ./ +CC = xcpp +CLCC = xocc +ifeq ($(XDEVICE_REPO_PATH),) + DEVICE_REPO_OPT = +else +DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH} +endif +HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2 +HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread +CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS} +ifeq (${KEEP_TEMP},1) + CLCC_OPT += -s +endif +ifeq (${KERNEL_DEBUG},1) + CLCC_OPT += -g +endif +CLCC_OPT += --kernel ${KERNEL_NAME} +OBJECTS := $(HOST_SRCS:.cpp=.o) +.PHONY: all +all: run +host: ${HOST_EXE_DIR}/${HOST_EXE} +xbin_cpu_em: + make SDA_FLOW=cpu_emu xbin -f sdaccel.mk +xbin_hw_em: + make SDA_FLOW=hw_emu xbin -f sdaccel.mk +xbin_hw : + make SDA_FLOW=hw xbin -f sdaccel.mk +xbin: ${XCLBIN} +run_cpu_em: + make SDA_FLOW=cpu_emu run_em -f sdaccel.mk +run_hw_em: + make SDA_FLOW=hw_emu run_em -f sdaccel.mk +run_hw : + make SDA_FLOW=hw run_hw_int -f sdaccel.mk +run_em: xconfig host xbin + XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +run_hw_int : host xbin_hw + source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS} +estimate : + ${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS} +xconfig : emconfig.json +emconfig.json : + emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od . 
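+# "make -f sdaccel.mk estimate" (rule above) asks xocc for a resource and
+# timing estimate of the kernel without running a full hardware build.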
+${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS} + ${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@ +${XCLBIN}: + ${CLCC} ${CLCC_OPT} ${KERNEL_SRCS} +%.o: %.cpp + ${CC} ${HOST_CFLAGS} -c $< -o $@ +clean: + ${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil +cleanall: clean + ${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou diff --git a/samples/smith_waterman/lenet_aocl.cl b/samples/smith_waterman/lenet_aocl.cl new file mode 100644 index 000000000..bf8608082 --- /dev/null +++ b/samples/smith_waterman/lenet_aocl.cl @@ -0,0 +1,143 @@ +#include "ihc_apint.h" +__kernel void default_function(__global uint3_t* restrict seqAs, __global uint3_t* restrict seqBs, __global uint3_t* restrict outAs, __global uint3_t* restrict outBs) { + int B; + #pragma ii 1 + for (int t_outer = 0; t_outer < 32; ++t_outer) { + #pragma unroll + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 129)) + -130)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((uint3_t)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = 
(curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/main.cpp b/samples/smith_waterman/main.cpp new file mode 100644 index 000000000..851a98bf7 --- /dev/null +++ b/samples/smith_waterman/main.cpp @@ -0,0 +1,135 @@ +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#include <CL/cl2.hpp> +#include <fstream> +#include <iostream> +#include <vector> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <sys/ipc.h> +#include <sys/shm.h> + + + +int main(int argc, char* argv[]) { +#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE) + #define STR_VALUE(arg) #arg + #define GET_STRING(name) STR_VALUE(name) + #define TARGET_DEVICE GET_STRING(SDX_PLATFORM) +#endif + char* xclbinFilename = argv[1]; + + std::vector<unsigned int> source_0(1024 * 128); + std::vector<unsigned int> source_1(1024 * 128); + std::vector<unsigned int> source_2(1024 * 256); + std::vector<unsigned int> source_3(1024 * 256); + + size_t vector_size_bytes_0 = sizeof(unsigned int) * 1024 * 128; + size_t vector_size_bytes_1 = sizeof(unsigned int) * 1024 * 128; + size_t vector_size_bytes_2 = sizeof(unsigned int) * 1024 * 256; + size_t vector_size_bytes_3 = sizeof(unsigned int) * 1024 * 256; + + unsigned int* arg_0 = (unsigned int*)shmat(1769476, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + source_0[i1 + i0*128] = arg_0[i1 + i0*128]; + } + } + unsigned int* arg_1 = (unsigned int*)shmat(3538944, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + source_1[i1 + i0*128] = arg_1[i1 + i0*128]; + } + } + unsigned int* arg_2 = (unsigned int*)shmat(3538945, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + source_2[i1 + i0*256] = arg_2[i1 + i0*256]; + } + } + unsigned int* arg_3 = (unsigned int*)shmat(2162690, nullptr, 0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + source_3[i1 + i0*256] = arg_3[i1 + i0*256]; + } + } + std::vector<cl::Platform> platforms; + cl::Platform::get(&platforms); + cl::Platform platform = platforms[0]; + + std::vector<cl::Device> devices; + platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices); + cl::Device device = devices[0]; + + cl::Context context(device); + cl::CommandQueue q(context, device); + + std::ifstream bin_file(xclbinFilename, std::ifstream::binary); + bin_file.seekg (0, bin_file.end); + unsigned nb = bin_file.tellg(); + bin_file.seekg (0, bin_file.beg); + char *buf = new char [nb]; + bin_file.read(buf, nb); + + cl::Program::Binaries bins; + bins.push_back({buf,nb}); + devices.resize(1); + cl::Program program(context, devices, bins); + + int err1; + cl::Kernel kernel(program, "default_function", &err1); + auto default_function = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(kernel); + + cl::Buffer buffer_0(context, CL_MEM_READ_WRITE, vector_size_bytes_0); + cl::Buffer buffer_1(context, CL_MEM_READ_WRITE, vector_size_bytes_1); + cl::Buffer buffer_2(context, CL_MEM_READ_WRITE, vector_size_bytes_2); + cl::Buffer buffer_3(context, CL_MEM_READ_WRITE, vector_size_bytes_3); + + q.enqueueWriteBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueWriteBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + 
q.enqueueWriteBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + q.enqueueWriteBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data()); + + default_function(cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),buffer_0, buffer_1, buffer_2, buffer_3); + q.finish(); + + q.enqueueReadBuffer(buffer_0, CL_TRUE, 0, vector_size_bytes_0, source_0.data()); + q.enqueueReadBuffer(buffer_1, CL_TRUE, 0, vector_size_bytes_1, source_1.data()); + q.enqueueReadBuffer(buffer_2, CL_TRUE, 0, vector_size_bytes_2, source_2.data()); + q.enqueueReadBuffer(buffer_3, CL_TRUE, 0, vector_size_bytes_3, source_3.data()); + + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + arg_0[i1 + i0*128] = source_0[i1 + i0*128]; + } + } + shmdt(arg_0); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 128; i1++) { + arg_1[i1 + i0*128] = source_1[i1 + i0*128]; + } + } + shmdt(arg_1); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + arg_2[i1 + i0*256] = source_2[i1 + i0*256]; + } + } + shmdt(arg_2); + for (size_t i0 = 0; i0 < 1024; i0++) { + for (size_t i1 = 0; i1 < 256; i1++) { + arg_3[i1 + i0*256] = source_3[i1 + i0*256]; + } + } + shmdt(arg_3); +} diff --git a/samples/smith_waterman/merlinc_code.cl b/samples/smith_waterman/merlinc_code.cl new file mode 100644 index 000000000..c3a347f35 --- /dev/null +++ b/samples/smith_waterman/merlinc_code.cl @@ -0,0 +1,146 @@ +#include +#include +#include +#pragma ACCEL kernel +void default_function(unsigned char* seqAs, unsigned char* seqBs, unsigned char* outAs, unsigned char* outBs) { + int B; +#pragma ACCEL pipeline + for (int t_outer = 0; t_outer < 32; ++t_outer) { +#pragma ACCEL parallel + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 
1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/sdaccel.mk b/samples/smith_waterman/sdaccel.mk new file mode 100644 index 000000000..ce266d89e --- /dev/null +++ b/samples/smith_waterman/sdaccel.mk @@ -0,0 +1,32 @@ +ifndef XILINX_SDX +$(error Environment variable XILINX_SDX is required and should point to SDAccel install area) +endif +SDA_FLOW = cpu_emu +HOST_SRCS = host.cpp +HOST_EXE_DIR=. 
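+# Note: KERNEL_SRCS below assumes the generated kernel is saved as
+# default_function.cl; the checked-in sample kernels here are named
+# sdaccel_code.cl, so copy or rename the file before building.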
+HOST_EXE = host +HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL +HOST_LFLAGS = +KERNEL_SRCS = default_function.cl +KERNEL_NAME = default_function +KERNEL_DEFS = +KERNEL_INCS = +XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0 +XDEVICE_REPO_PATH= +KEEP_TEMP=1 +KERNEL_DEBUG= +XCLBIN_NAME=bin_krnl +HOST_CFLAGS+=-DTARGET_DEVICE=\"${XDEVICE}\" +BOARD_SETUP_FILE=setup.sh +ifeq (${SDA_FLOW},cpu_emu) + CLCC_OPT += -t sw_emu + XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin +else ifeq (${SDA_FLOW},hw_emu) + CLCC_OPT += -t hw_emu + XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin +else ifeq (${SDA_FLOW},hw) + XCLBIN = ${XCLBIN_NAME}_hw.xclbin +CLCC_OPT += -t hw +endif +HOST_ARGS = ${XCLBIN} +COMMON_DIR = ./common +include ${COMMON_DIR}/common.mk diff --git a/samples/smith_waterman/sdaccel_code.cl b/samples/smith_waterman/sdaccel_code.cl new file mode 100644 index 000000000..a0f5fdb01 --- /dev/null +++ b/samples/smith_waterman/sdaccel_code.cl @@ -0,0 +1,142 @@ +__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) { + __local int B; + __attribute__((xcl_pipeline_loop(1))) + for (int t_outer = 0; t_outer < 2; ++t_outer) { + + for (int t_inner = 0; t_inner < 32; ++t_inner) { + __local int maxtrix_max; + maxtrix_max = 0; + __local int i_max; + i_max = 0; + __local int j_max; + j_max = 0; + __local short matrix[841]; + for (int x = 0; x < 29; ++x) { + for (int y = 0; y < 29; ++y) { + matrix[(y + (x * 29))] = (short)0; + } + } + __local short action[841]; + for (int x1 = 0; x1 < 29; ++x1) { + for (int y1 = 0; y1 < 29; ++y1) { + action[(y1 + (x1 * 29))] = (short)3; + } + } + __local int mutate1; + for (int i = 0; i < 29; ++i) { + for (int j = 0; j < 29; ++j) { + __local int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((long)matrix[((j + (i * 29)) + -30)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 
1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4); + trace_back[3] = 0; + __local int max; + max = trace_back[0]; + __local int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 29))] = ((short)max); + action[(j + (i * 29))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 29))])) { + maxtrix_max = ((int)matrix[(j + (i * 29))]); + i_max = i; + j_max = j; + } + } + } + } + __local int T; + __local int curr_i; + curr_i = i_max; + __local int curr_j; + curr_j = j_max; + __local int next_i; + next_i = 0; + __local int next_j; + next_j = 0; + __local int act1; + act1 = ((int)action[(curr_j + (curr_i * 29))]); + __local int next_i1; + next_i1 = 0; + __local int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + __local int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + __local int a; + a = 0; + __local int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + __local int act2; + act2 = ((int)action[(curr_j + (curr_i * 29))]); + __local int next_i2; + next_i2 = 0; + __local int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/sdaccel_code_nounroll.cl b/samples/smith_waterman/sdaccel_code_nounroll.cl new file mode 100644 index 000000000..d5e145c05 --- /dev/null +++ b/samples/smith_waterman/sdaccel_code_nounroll.cl @@ -0,0 +1,142 @@ +__kernel void default_function(__global unsigned char* seqAs, __global unsigned char* seqBs, __global unsigned char* outAs, __global unsigned char* outBs) { + __local int B; + __attribute__((xcl_pipeline_loop(1))) + for (int t_outer = 0; t_outer < 32; ++t_outer) { + __attribute__((opencl_unroll_hint(2))) + for (int t_inner = 0; t_inner < 32; ++t_inner) { + __local int maxtrix_max; + maxtrix_max = 0; + __local int i_max; + i_max = 0; + __local int j_max; + j_max = 0; + __local short matrix[16641]; + for (int x = 0; x < 129; ++x) { + for (int y = 0; y < 129; ++y) { + matrix[(y + (x * 129))] = (short)0; + } + } + __local short action[16641]; + for (int x1 = 0; x1 < 129; ++x1) { + for (int y1 = 0; y1 < 129; ++y1) { + action[(y1 + (x1 * 129))] = (short)3; + } + } + __local int mutate1; + for (int i = 0; i < 129; ++i) { + for (int j = 0; j < 129; ++j) { + __local int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = 
((int)(((long)matrix[((j + (i * 129)) + -130)]) + ((long)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 128)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 128)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 129)) + -129)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 129)) + -1)]) + -4); + trace_back[3] = 0; + __local int max; + max = trace_back[0]; + __local int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 129))] = ((short)max); + action[(j + (i * 129))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 129))])) { + maxtrix_max = ((int)matrix[(j + (i * 129))]); + i_max = i; + j_max = j; + } + } + } + } + __local int T; + __local int curr_i; + curr_i = i_max; + __local int curr_j; + curr_j = j_max; + __local int next_i; + next_i = 0; + __local int next_j; + next_j = 0; + __local int act1; + act1 = ((int)action[(curr_j + (curr_i * 129))]); + __local int next_i1; + next_i1 = 0; + __local int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + __local int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + __local int a; + a = 0; + __local int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 128)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 256))] = ((unsigned char)b); + curr_i = next_i; + curr_j = next_j; + __local int act2; + act2 = ((int)action[(curr_j + (curr_i * 129))]); + __local int next_i2; + next_i2 = 0; + __local int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_aocl.cl b/samples/smith_waterman/smith_aocl.cl new file mode 100644 index 000000000..80a4ba601 --- /dev/null +++ b/samples/smith_waterman/smith_aocl.cl @@ -0,0 +1,143 @@ +#include "ihc_apint.h" +__kernel void default_function(__global uint* restrict seqAs, __global uint* restrict seqBs, __global uint* restrict outAs, __global uint* restrict outBs) { + int B; + #pragma ii 1 + for (int t_outer = 0; t_outer < 2; ++t_outer) { + #pragma unroll + for (int t_inner = 0; t_inner < 32; ++t_inner) { + int maxtrix_max; + maxtrix_max = 0; + int i_max; + i_max = 0; + int j_max; + j_max = 0; + short matrix[841]; + for (int x = 0; x < 29; ++x) { + for (int y = 0; y < 29; ++y) { + matrix[(y + (x * 29))] = (short)0; + } + } + short action[841]; + for (int x1 = 0; x1 < 29; ++x1) { + for (int y1 = 0; y1 < 29; ++y1) { + action[(y1 + (x1 * 29))] = (short)3; + } + } + int mutate3; + for (int i = 0; i < 29; ++i) { + for (int j = 0; j < 29; ++j) { + int trace_back[4]; + for (int x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if 
((i != 0) && (j != 0)) { + trace_back[0] = ((int)(((int33_t)matrix[((j + (i * 29)) + -30)]) + ((int33_t)(int)((seqAs[((i + ((t_inner + (t_outer * 32)) * 28)) + -1)] == seqBs[((j + ((t_inner + (t_outer * 32)) * 28)) + -1)]) ? 1 : -4)))); + trace_back[1] = (((int)matrix[((j + (i * 29)) + -29)]) + -4); + trace_back[2] = (((int)matrix[((j + (i * 29)) + -1)]) + -4); + trace_back[3] = 0; + int max; + max = trace_back[0]; + int act; + act = 0; + for (int i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[(j + (i * 29))] = ((short)max); + action[(j + (i * 29))] = ((short)act); + if (maxtrix_max < ((int)matrix[(j + (i * 29))])) { + maxtrix_max = ((int)matrix[(j + (i * 29))]); + i_max = i; + j_max = j; + } + } + } + } + int T; + int curr_i; + curr_i = i_max; + int curr_j; + curr_j = j_max; + int next_i; + next_i = 0; + int next_j; + next_j = 0; + int act1; + act1 = ((int)action[(curr_j + (curr_i * 29))]); + int next_i1; + next_i1 = 0; + int next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + int tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + int a; + a = 0; + int b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((int)seqAs[((curr_i + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((int)seqBs[((curr_j + ((t_inner + (t_outer * 32)) * 28)) + -1)]); + } + outAs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)a); + outBs[(tick + ((t_inner + (t_outer * 32)) * 56))] = ((uint3_t)b); + curr_i = next_i; + curr_j = next_j; + int act2; + act2 = ((int)action[(curr_j + (curr_i * 29))]); + int next_i2; + next_i2 = 0; + int next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_vhls.cl b/samples/smith_waterman/smith_vhls.cl new file mode 100644 index 000000000..4fd36c8aa --- /dev/null +++ b/samples/smith_waterman/smith_vhls.cl @@ -0,0 +1,146 @@ +#include +#include +#include + +void default_function(ap_uint<3> seqAs[64][28], ap_uint<3> seqBs[64][28], ap_uint<3> outAs[64][56], ap_uint<3> outBs[64][56]) { + ap_int<32> B; + for (ap_int<32> t_outer = 0; t_outer < 2; ++t_outer) { + #pragma HLS pipeline + for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) { + #pragma HLS unroll + ap_int<32> maxtrix_max; + maxtrix_max = 0; + ap_int<32> i_max; + i_max = 0; + ap_int<32> j_max; + j_max = 0; + ap_int<16> matrix[29][29]; + for (ap_int<32> x = 0; x < 29; ++x) { + for (ap_int<32> y = 0; y < 29; ++y) { + matrix[x][y] = (ap_int<16>)0; + } + } + ap_int<16> action[29][29]; + for (ap_int<32> x1 = 0; x1 < 29; ++x1) { + for (ap_int<32> y1 = 0; y1 < 29; ++y1) { + action[x1][y1] = (ap_int<16>)3; + } + } + ap_int<32> mutate5; + for (ap_int<32> i = 0; i < 29; ++i) { + for (ap_int<32> j = 0; j < 29; ++j) { + ap_int<32> trace_back[4]; + for (ap_int<32> x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 
0) && (j != 0)) { + trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4)))); + trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4); + trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4); + trace_back[3] = 0; + ap_int<32> max; + max = trace_back[0]; + ap_int<32> act; + act = 0; + for (ap_int<32> i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[i][j] = ((ap_int<16>)max); + action[i][j] = ((ap_int<16>)act); + if (maxtrix_max < ((ap_int<32>)matrix[i][j])) { + maxtrix_max = ((ap_int<32>)matrix[i][j]); + i_max = i; + j_max = j; + } + } + } + } + ap_int<32> T; + ap_int<32> curr_i; + curr_i = i_max; + ap_int<32> curr_j; + curr_j = j_max; + ap_int<32> next_i; + next_i = 0; + ap_int<32> next_j; + next_j = 0; + ap_int<32> act1; + act1 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]); + ap_int<32> next_i1; + next_i1 = 0; + ap_int<32> next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + ap_int<32> tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + ap_int<32> a; + a = 0; + ap_int<32> b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_i + -1) % 28)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 28)) + ((t_inner + (t_outer * 32)) * 28)) + -1) / 28)][((curr_j + -1) % 28)]); + } + outAs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)a); + outBs[((tick / 56) + (t_inner + (t_outer * 32)))][(tick % 56)] = ((ap_uint<3>)b); + curr_i = next_i; + curr_j = next_j; + ap_int<32> act2; + act2 = ((ap_int<32>)action[((curr_j / 29) + curr_i)][(curr_j % 29)]); + ap_int<32> next_i2; + next_i2 = 0; + ap_int<32> next_j2; + next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/smith_waterman/smith_waterman_sdaccel.py b/samples/smith_waterman/smith_waterman_sdaccel.py new file mode 100644 index 000000000..354cac757 --- /dev/null +++ b/samples/smith_waterman/smith_waterman_sdaccel.py @@ -0,0 +1,24 @@ +import heterocl as hcl +import numpy as np +from smith_waterman_main import * + +# f = top("vhls_csim") +f = top("sdaccel_sw_emu") + +# add a very simple test +_seqA_np = np.ones((num, lenA)) +for i in range(0, 4): + _seqA_np[0][i] = 2 +_seqB_np = np.ones((num, lenB)) +_seqA = hcl.asarray(_seqA_np, dtype) +_seqB = hcl.asarray(_seqB_np, dtype) +_consensusA = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype) +_consensusB = hcl.asarray(np.zeros((num, (lenA + lenB))), dtype) +f(_seqA, _seqB, _consensusA, _consensusB) +_consensusA_np = _consensusA.asnumpy() +_consensusB_np = _consensusB.asnumpy() +for i in range(0, 256): + if i < 124: + assert 
_consensusA_np[0][i] == 1 + else: + assert _consensusA_np[0][i] == 0 diff --git a/samples/smith_waterman/vhls_code.cl b/samples/smith_waterman/vhls_code.cl new file mode 100644 index 000000000..8066bc2c2 --- /dev/null +++ b/samples/smith_waterman/vhls_code.cl @@ -0,0 +1,146 @@ +#include +#include +#include + +void default_function(ap_uint<3> seqAs[1024][128], ap_uint<3> seqBs[1024][128], ap_uint<3> outAs[1024][256], ap_uint<3> outBs[1024][256]) { + ap_int<32> B; + for (ap_int<32> t_outer = 0; t_outer < 32; ++t_outer) { + #pragma HLS pipeline + for (ap_int<32> t_inner = 0; t_inner < 32; ++t_inner) { + #pragma HLS unroll + ap_int<32> maxtrix_max; + maxtrix_max = 0; + ap_int<32> i_max; + i_max = 0; + ap_int<32> j_max; + j_max = 0; + ap_int<16> matrix[129][129]; + for (ap_int<32> x = 0; x < 129; ++x) { + for (ap_int<32> y = 0; y < 129; ++y) { + matrix[x][y] = (ap_int<16>)0; + } + } + ap_int<16> action[129][129]; + for (ap_int<32> x1 = 0; x1 < 129; ++x1) { + for (ap_int<32> y1 = 0; y1 < 129; ++y1) { + action[x1][y1] = (ap_int<16>)3; + } + } + ap_int<32> mutate3; + for (ap_int<32> i = 0; i < 129; ++i) { + for (ap_int<32> j = 0; j < 129; ++j) { + ap_int<32> trace_back[4]; + for (ap_int<32> x2 = 0; x2 < 4; ++x2) { + trace_back[x2] = 0; + } + if ((i != 0) && (j != 0)) { + trace_back[0] = ((ap_int<32>)(((ap_int<33>)matrix[(i + -1)][(j + -1)]) + ((ap_int<33>)((seqAs[(t_inner + (t_outer * 32))][(i + -1)] == seqBs[(t_inner + (t_outer * 32))][(j + -1)]) ? 1 : -4)))); + trace_back[1] = (((ap_int<32>)matrix[(i + -1)][j]) + -4); + trace_back[2] = (((ap_int<32>)matrix[i][(j + -1)]) + -4); + trace_back[3] = 0; + ap_int<32> max; + max = trace_back[0]; + ap_int<32> act; + act = 0; + for (ap_int<32> i1 = 0; i1 < 4; ++i1) { + if (max < trace_back[i1]) { + max = trace_back[i1]; + act = i1; + } + } + matrix[i][j] = ((ap_int<16>)max); + action[i][j] = ((ap_int<16>)act); + if (maxtrix_max < ((ap_int<32>)matrix[i][j])) { + maxtrix_max = ((ap_int<32>)matrix[i][j]); + i_max = i; + j_max = j; + } + } + } + } + ap_int<32> T; + ap_int<32> curr_i; + curr_i = i_max; + ap_int<32> curr_j; + curr_j = j_max; + ap_int<32> next_i; + next_i = 0; + ap_int<32> next_j; + next_j = 0; + ap_int<32> act1; + act1 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]); + ap_int<32> next_i1; + next_i1 = 0; + ap_int<32> next_j1; + next_j1 = 0; + if (act1 == 0) { + next_i1 = (curr_i + -1); + next_j1 = (curr_j + -1); + } else { + if (act1 == 1) { + next_i1 = (curr_i + -1); + next_j1 = curr_j; + } else { + if (act1 == 2) { + next_i1 = curr_i; + next_j1 = (curr_j + -1); + } else { + next_i1 = curr_i; + next_j1 = curr_j; + } + } + } + next_i = next_i1; + next_j = next_j1; + ap_int<32> tick; + tick = 0; + while (((curr_i != next_i) || (curr_j != next_j))) { + ap_int<32> a; + a = 0; + ap_int<32> b; + b = 0; + if (next_i == curr_i) { + a = 0; + } else { + a = ((ap_int<32>)seqAs[((((curr_i - ((curr_i + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_i + -1) % 128)]); + } + if (next_j == curr_j) { + b = 0; + } else { + b = ((ap_int<32>)seqBs[((((curr_j - ((curr_j + -1) % 128)) + ((t_inner + (t_outer * 32)) * 128)) + -1) / 128)][((curr_j + -1) % 128)]); + } + outAs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)a); + outBs[((tick / 256) + (t_inner + (t_outer * 32)))][(tick % 256)] = ((ap_uint<3>)b); + curr_i = next_i; + curr_j = next_j; + ap_int<32> act2; + act2 = ((ap_int<32>)action[((curr_j / 129) + curr_i)][(curr_j % 129)]); + ap_int<32> next_i2; + next_i2 = 0; + ap_int<32> next_j2; + 
next_j2 = 0; + if (act2 == 0) { + next_i2 = (curr_i + -1); + next_j2 = (curr_j + -1); + } else { + if (act2 == 1) { + next_i2 = (curr_i + -1); + next_j2 = curr_j; + } else { + if (act2 == 2) { + next_i2 = curr_i; + next_j2 = (curr_j + -1); + } else { + next_i2 = curr_i; + next_j2 = curr_j; + } + } + } + next_i = next_i2; + next_j = next_j2; + tick = (tick + 1); + } + } + } +} + diff --git a/samples/sobel/sobel.py b/samples/sobel/sobel.py new file mode 100644 index 000000000..a4299d8ae --- /dev/null +++ b/samples/sobel/sobel.py @@ -0,0 +1,91 @@ +import heterocl as hcl +import hlib +import numpy as np +from PIL import Image +from urllib.request import urlopen + +batch_size = 1 +hcl.init(hcl.UInt(32)) +dtype = hcl.UInt(32) +image_size = () +kernel_size = 3 + +# setup target using vivado +tool = hcl.tool.vivado("csim") +target = hcl.platform.zc706 + +def sobel(): + image = hcl.placeholder((batch_size, 1, 256, 256), "input_image") + k1 = hcl.placeholder((1, 1, 3, 3), "kernel_1") + k2 = hcl.placeholder((1, 1, 3, 3), "kernel_2") + + def kernel(input_image, kernel_1, kernel_2): + + def absolute(image, *args): + with hcl.if_(image[args] > 0): + hcl.return_(image[args]) + with hcl.else_(): + hcl.return_(-1 * image[args]) + + def dev(gx, gy, org): + assert gx.shape == gy.shape, "mismatch" + rx = hcl.reduce_axis(0, 255, "rx") + ry = hcl.reduce_axis(0, 255, "ry") + mat_sum = hcl.compute(gx.shape, lambda nn, ff, xx, yy: + gx[nn, ff, xx, yy] + gy[nn, ff, xx, yy], name="add") + return hcl.compute(mat_sum.shape, lambda nn, ff, xx, yy: + mat_sum[nn, ff, xx, yy] * 255.0 / hcl.max(mat_sum[nn, ff, rx, ry], axis=[rx, ry]), + name = "derv") + + # make the conv op a kernel on fpga. + # return tensor required (cannot do def_()) + output_shape = (1,1,254,254) + + # make compute wrapped in hcl def + module1 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv1")(hlib.nn.conv2d_nchw_imp) + module2 = hcl.def_([input_image.shape, kernel_1.shape, output_shape], name="conv2")(hlib.nn.conv2d_nchw_imp) + conv1 = hcl.compute(output_shape, lambda *args: 0) + conv2 = hcl.compute(output_shape, lambda *args: 0) + module1(input_image, kernel_1, conv1) + module2(input_image, kernel_2, conv2) + + abs1 = hcl.compute(conv1.shape, + lambda *args: absolute(conv1, *args)) + abs2 = hcl.compute(conv2.shape, + lambda *args: absolute(conv2, *args)) + + # derivative module for normalization + return dev(abs1, abs2, input_image) + + s = hcl.create_schedule([image, k1, k2], kernel) + + # data moved to local + i0, k10 = s.to([image, k1], target.fpga) + s.to([i0, k10], s[kernel.conv1]) + s.to(kernel.derv, target.cpu) + + # create stream channel between modules + print(type(target.fpga), hcl.lower(s)) + return hcl.build(s, target) + +# Load sample data +img = Image.open(urlopen('http://i.stack.imgur.com/8zINU.gif')) +kernel_x = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) +kernel_y = np.flip(kernel_x.T.T, axis=0) +img = np.array(img) + +img = img[np.newaxis, ...] +img = img[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_x = kernel_x[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] +kernel_y = kernel_y[np.newaxis, ...] 
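+# equivalently (assuming NumPy >= 1.18, which accepts a tuple axis):
+#   img = np.expand_dims(img, axis=(0, 1))
+#   kernel_x = np.expand_dims(kernel_x, axis=(0, 1))
+# the chained np.newaxis indexing above lifts the 2-D image to the
+# (1, 1, 256, 256) NCHW layout that the input_image placeholder expects,
+# and the kernels to (1, 1, 3, 3)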
+ +hcl_input = hcl.asarray(img, dtype) +kernel_x = hcl.asarray(kernel_x, dtype) +kernel_y = hcl.asarray(kernel_y, dtype) +hcl_output = hcl.asarray(np.zeros((1,1,254,254)), dtype) + +f = sobel() +f(hcl_input, kernel_x, kernel_y, hcl_output) + diff --git a/samples/stream/example.cl b/samples/stream/example.cl new file mode 100644 index 000000000..fa3cfbd81 --- /dev/null +++ b/samples/stream/example.cl @@ -0,0 +1,34 @@ +#include "ihc_apint.h" +#pragma OPENCL EXTENSION cl_intel_channels : enable +channel int ret_add_c; +channel int ret_mul_c; +__kernel void ret_add(__global int* restrict ret_add_a, __global int* restrict ret_add_b) { + for (int i = 0; i < 10; ++i) { + for (int i1 = 0; i1 < 20; ++i1) { + write_channel_intel(ret_add_c, ((int)(((int33_t)ret_add_a[(i1 + (i * 20))]) + ((int33_t)ret_add_b[(i1 + (i * 20))])))); + } + } +} + +__kernel void ret_mul(__global int* restrict ret_mul_d, __global int* restrict ret_mul_e) { + for (int i = 0; i < 10; ++i) { + for (int i1 = 0; i1 < 20; ++i1) { + ret_mul_e[(i1 + (i * 20))] = ((int)(((long)read_channel_intel(ret_mul_c)) * ((long)ret_mul_d[(i1 + (i * 20))]))); + } + } +} + +__kernel void default_function(__global int* restrict a, __global int* restrict b, __global int* restrict c, __global int* restrict d, __global int* restrict e) { + int ret_add; + int ret_mul; + for (int x = 0; x < 10; ++x) { + for (int y = 0; y < 20; ++y) { + c[(y + (x * 20))] = 0; + } + } + int ret_add0; + ret_add(a, b); + int ret_mul0; + ret_mul(d, e); +} + diff --git a/samples/stream/mod.py b/samples/stream/mod.py new file mode 100644 index 000000000..8c12ad722 --- /dev/null +++ b/samples/stream/mod.py @@ -0,0 +1,32 @@ +import heterocl as hcl + +hcl.init() +initiation_interval = 4 +a = hcl.placeholder((10, 20)) +b = hcl.placeholder((10, 20)) + +@hcl.def_([a.shape, b.shape, (), ()]) +def ret_add(a, b, x, y): + hcl.return_(a[x, y] + b[x, y]) + +@hcl.def_([a.shape, b.shape, (), ()]) +def ret_mul(a, b, x, y): + hcl.return_(a[x, y] * b[x, y]) + +c = hcl.compute(a.shape, lambda i, j: ret_add(a, b, i, j)) +d = hcl.compute(b.shape, lambda i, j: ret_mul(a, b, i, j)) +s = hcl.create_schedule([a, b, c, d]) + +# compute customization +s[c].pipeline(c.axis[0], initiation_interval) +s.partition(b, dim=2, factor=2) + +# stream into modules / device +# s[c].stream_to(ret_mul) +# s[d].stream_to(hcl.FPGA) + +print(hcl.lower(s)) +code = hcl.build(s, target="vhls") +print(code) + + diff --git a/samples/stream/stream.py b/samples/stream/stream.py new file mode 100644 index 000000000..5c2396a57 --- /dev/null +++ b/samples/stream/stream.py @@ -0,0 +1,58 @@ +import heterocl as hcl + +hcl.init() +target = hcl.platform.zc706 +initiation_interval = 4 + +a = hcl.placeholder((10, 20), name="a") +b = hcl.placeholder((10, 20), name="b") +c = hcl.placeholder((10, 20), name="c") +d = hcl.placeholder((10, 20), name="d") +e = hcl.placeholder((10, 20), name="e") + +def add_mul(a, b, c, d, e): + @hcl.def_([a.shape, b.shape, c.shape]) + def ret_add(a, b, c): + with hcl.for_(0, a.shape[0]) as i: + with hcl.for_(0, a.shape[1]) as j: + c[i, j] = a[i, j] + b[i, j] + + @hcl.def_([c.shape, d.shape, e.shape]) + def ret_mul(c, d, e): + # hcl.update(c, lambda x, y: a[x, y] * b[x, y], 'c_mul') + with hcl.for_(0, c.shape[0]) as i: + with hcl.for_(0, c.shape[1]) as j: + e[i, j] = c[i, j] * d[i, j] + + ret_add(a, b, c) + ret_mul(c, d, e) + +# compute customization +s = hcl.create_schedule([a, b, c, d, e], add_mul) +# op1 = add_mul.ret_add.c +# op2 = add_mul.ret_mul.c +# s[op1].pipeline(op1.axis[0], initiation_interval) + +# 
stream into modules / device +a0, b0 = s.to([a, b], target.xcel) +d0 = s.to(d, target.xcel) +#s.partition(b0, dim=2, factor=2) +s.to([a0, b0], s[add_mul.ret_add]) +s.to(d0, s[add_mul.ret_mul]) + +# within device move producer to consumer +s.to(c, s[add_mul.ret_mul], + s[add_mul.ret_add], depth=10) + +# return tensor for inter-device move +# e0 = s.stream_to(e, hcl.CPU('riscv')) + +# print(add_mul.ret_mul._buf, c._buf) +print(hcl.lower(s)) +code = hcl.build(s, target) +print(code) +# +# with open("example.cl", "w") as f: +# f.write(code) +# f.close() + diff --git a/tests/test_codegen_aocl.py b/tests/test_codegen_aocl.py new file mode 100644 index 000000000..a72d364f2 --- /dev/null +++ b/tests/test_codegen_aocl.py @@ -0,0 +1,99 @@ +import heterocl as hcl + +def test_ap_int(): + hcl.init() + A = hcl.placeholder((1, 32), dtype=hcl.Int(3)) + B = hcl.placeholder((1, 32), dtype=hcl.UInt(3)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j], dtype=hcl.Int(8)) + s = hcl.create_schedule([A, B, C]) + code = hcl.build(s, target='aocl') + print (code) + assert "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable" in code + assert "ap_int<3> intd_t" in code + assert "ap_uint<3> uintd_t" in code + assert "ap_int<8> intd_t" in code + +def test_pragma(): + hcl.init() + A = hcl.placeholder((10, 32), "A") + B = hcl.placeholder((10, 32)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j]) + + # unroll + s1 = hcl.create_schedule([A, B, C]) + s1[C].unroll(C.axis[1], factor=4) + code1 = hcl.build(s1, target='aocl') + print (code1) + assert "#pragma unroll 4" in code1 + + # pipeline + s2 = hcl.create_schedule([A, B, C]) + s2[C].pipeline(C.axis[0], initiation_interval=2) + code2 = hcl.build(s2, target='aocl') + print (code2) + assert "#pragma ii 2" in code2 + +def test_reorder(): + hcl.init() + A = hcl.placeholder((10, 100), "A") + + def two_stage(A): + B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B") + C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, "C") + return C + + s = hcl.create_schedule([A], two_stage) + s_B = two_stage.B + code = hcl.build(s, target='aocl') + print (code) + s[s_B].reorder(s_B.axis[1], s_B.axis[0]) + code2 = hcl.build(s, target='aocl') + print (code2) + +def test_split_fuse(): + hcl.init() + A = hcl.placeholder((10, 100), "A") + + def two_stage(A): + B = hcl.compute(A.shape, lambda x, y : A[x, y] + 1, "B") + C = hcl.compute(A.shape, lambda x, y : B[x, y] + 1, 'C') + return C + + s = hcl.create_schedule([A], two_stage) + s_B = two_stage.B + x_out, x_in = s[s_B].split(s_B.axis[0], 5) + code = hcl.build(s, target='aocl') + print (code) + s2 = hcl.create_schedule([A], two_stage) + s2_B = two_stage.B + x_y = s2[s2_B].fuse(s2_B.axis[0], s2_B.axis[1]) + code2 = hcl.build(s2, target='aocl') + print (code2) + +def test_binary_conv(): + hcl.init() + A = hcl.placeholder((1, 32, 14, 14), dtype=hcl.UInt(1), name="A") + B = hcl.placeholder((64, 32, 3, 3), dtype=hcl.UInt(1), name="B") + rc = hcl.reduce_axis(0, 32) + ry = hcl.reduce_axis(0, 3) + rx = hcl.reduce_axis(0, 3) + C = hcl.compute((1, 64, 12, 12), + lambda nn, ff, yy, xx: hcl.sum( + A[nn, rc, yy + ry, xx + rx] * B[ff, rc, ry, rx], axis=[rc, ry, rx]), + dtype=hcl.UInt(8), name="C") + s = hcl.create_schedule([A, B, C]) + s[C].split(C.axis[1], factor=5) + code = hcl.build(s, target='aocl') + print (code) + assert "for (ap_int<32> intd_t ff_outer = 0; ff_outer < 13; ++ff_outer)" in code + assert "for (ap_int<32> intd_t ff_inner = 0; ff_inner < 5; ++ff_inner)" in code + assert "if (ff_inner < (64 - 
(ff_outer * 5)))" in code + +if __name__ == '__main__': + test_ap_int() + test_pragma() + test_reorder() + test_split_fuse() + test_binary_conv() + + diff --git a/tests/test_codegen_ihls.py b/tests/test_codegen_ihls.py index fc5a7e53b..1b53f18ca 100644 --- a/tests/test_codegen_ihls.py +++ b/tests/test_codegen_ihls.py @@ -65,3 +65,4 @@ def kernel(A): s = hcl.create_schedule([A], kernel) code = hcl.build(s, target="ihls") assert "A[0].slc<4>(1)" in code + diff --git a/tests/test_codegen_sdaccel.py b/tests/test_codegen_sdaccel.py new file mode 100644 index 000000000..43d94f238 --- /dev/null +++ b/tests/test_codegen_sdaccel.py @@ -0,0 +1,36 @@ +import heterocl as hcl + + + + + +def test_pragma(): + hcl.init(hcl.Float()) + A = hcl.placeholder((10, 32), "A") + B = hcl.placeholder((10, 32)) + C = hcl.compute(A.shape, lambda i, j: A[i][j] + B[i][j]) + + # unroll + s1 = hcl.create_schedule([A, B, C]) + s1[C].unroll(C.axis[1], factor=6) + code1 = hcl.build(s1, target='sdaccel') + print (code1) + assert "__attribute__((opencl_unroll_hint(6)))" in code1 + + # pipeline + s2 = hcl.create_schedule([A, B, C]) + s2[C].pipeline(C.axis[0], initiation_interval=2) + code2 = hcl.build(s2, target='sdaccel') + print (code2) + assert "__attribute__((xcl_pipeline_loop(2)))" in code2 + + # partition + s3 = hcl.create_schedule([A, B, C]) + s3.partition(A, hcl.Partition.Block, dim=2, factor=2) + code3 = hcl.build(s3, target='sdaccel') + print (code3) + assert "__attribute__((xcl_array_partition(block,2,2)))" in code3 + + +if __name__ == "__main__": + test_pragma() \ No newline at end of file diff --git a/tests/test_codegen_soda.py b/tests/test_codegen_soda.py index 56fb8df77..492ee6146 100644 --- a/tests/test_codegen_soda.py +++ b/tests/test_codegen_soda.py @@ -52,6 +52,7 @@ def test_blur(self): img_t(0, 0) = uint16((int32((uint18((uint17(img_i(-1, 0)) + uint17(img_i(0, 0)))) + uint18(img_i(1, 0)))) / 3)) output uint16: img_o(0, 0) = uint16((int32((uint18((uint17(img_t(0, -1)) + uint17(img_t(0, 0)))) + uint18(img_t(0, 1)))) / 3)) + ''') def test_gaussian(self): @@ -76,6 +77,7 @@ def test_gaussian(self): reduce_ssa3 = float32(((float64(img_i(-1, 0)) * 3699.65) + float64(reduce_ssa2))) reduce_ssa4 = float32(((float64(img_i(0, 0)) * 4620.30) + float64(reduce_ssa3))) img_o(0, 0) = reduce_ssa4 + ''' ) diff --git a/tests/test_codegen_vhls.py b/tests/test_codegen_vhls.py index dadae5068..a6385975b 100644 --- a/tests/test_codegen_vhls.py +++ b/tests/test_codegen_vhls.py @@ -85,7 +85,7 @@ def test_index_split(): s = hcl.create_schedule([A, B]) s[B].split(B.axis[0], 5) code = hcl.build(s, target="vhls") - assert "B[(y_inner + (y_outer * 5))][x]" in code + assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code def test_index_split_reshape(): hcl.init() @@ -95,7 +95,7 @@ def test_index_split_reshape(): s[B].split(B.axis[0], 5) s.reshape(B, (2, 5, 10)) code = hcl.build(s, target="vhls") - assert "B[y_outer][y_inner][x]" in code + assert "B[(x + ((y_inner + (y_outer * 5)) * 10))]" in code def test_index_fuse(): hcl.init() @@ -104,7 +104,7 @@ def test_index_fuse(): s = hcl.create_schedule([A, B]) s[B].fuse(B.axis[0], B.axis[1]) code = hcl.build(s, target="vhls") - assert "B[(y_x_fused / 10)][(y_x_fused % 10)]" in code + assert "B[y_x_fused]" in code def test_binary_conv(): hcl.init() diff --git a/tvm/HalideIR/src/ir/Expr.h b/tvm/HalideIR/src/ir/Expr.h index b78a466ed..4b70d51fc 100644 --- a/tvm/HalideIR/src/ir/Expr.h +++ b/tvm/HalideIR/src/ir/Expr.h @@ -91,6 +91,9 @@ enum class IRNodeType : int { /** for memory customization **/ 
Reuse, Partition, + /** for data stream **/ + StreamExpr, + StreamStmt, /** for stencil analysis **/ Stencil }; @@ -302,6 +305,20 @@ enum class PartitionType : int { Cyclic = 2 }; +/** An enum describing the stream type */ +enum class StreamType : int { + Channel = 0, + Pipe = 1, + FIFO = 2 +}; + +/** An enum class for device type */ +enum class DeviceType : int { + CPU = 0, + FPGA = 1, + GPU = 2 +}; + /** A reference-counted handle to a statement node. */ struct Stmt : public IRHandle { Stmt() : IRHandle() {} diff --git a/tvm/HalideIR/src/ir/IR.cpp b/tvm/HalideIR/src/ir/IR.cpp index a9718b40e..a604b6fd2 100644 --- a/tvm/HalideIR/src/ir/IR.cpp +++ b/tvm/HalideIR/src/ir/IR.cpp @@ -692,17 +692,27 @@ Expr Quantize::make(Expr body, Expr bitwidth) { return Expr(node); } -Stmt KernelDef::make(Array args, Stmt body, Expr ret_void, Type ret_type, std::string name) { +Stmt KernelDef::make(Array args, Array> api_args, + Array api_types, Stmt body, Expr ret_void, + Type ret_type, std::string name, Array channels) { + internal_assert(api_args.size() == api_types.size()) << "KernelDef of unmatched args\n"; for (size_t i = 0; i < args.size(); i++) { internal_assert(args[i].defined()) << "KernelDef of undefined arg\n"; + internal_assert(api_types[i].defined()) << "KernelDef of undefined type\n"; + for (size_t j = 0; j < api_args[i].size(); j++) { + internal_assert(api_args[i][j].defined()) << "KernelDef of undefined shape\n"; + } } internal_assert(body.defined()) << "KernelDef of undefined body\n"; internal_assert(ret_void.defined()) << "KernelDef of undefined return type\n"; std::shared_ptr node = std::make_shared(); node->args = std::move(args); + node->api_args = std::move(api_args); + node->api_types = std::move(api_types); node->body = std::move(body); node->ret_void = std::move(ret_void); node->ret_type = ret_type; + node->channels = std::move(channels); node->name = name; return Stmt(node); } @@ -772,6 +782,62 @@ Stmt Partition::make(VarExpr buffer_var, int dim, int factor, PartitionType part return Stmt(node); } +Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth) { + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + + std::shared_ptr node = std::make_shared(); + node->type = type; + node->buffer_var = std::move(buffer_var); + node->depth = depth; + node->stream_type = stream_type; + return Expr(node); +} + +Expr StreamExpr::make(Type type, VarExpr buffer_var, StreamType stream_type, int depth, + Array annotate_keys, Array annotate_values) { + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + internal_assert(annotate_keys.size() == annotate_values.size()) << + "Length of annotate keys and annotate values not equal"; + + std::shared_ptr node = std::make_shared(); + node->type = type; + node->buffer_var = std::move(buffer_var); + node->depth = depth; + node->stream_type = stream_type; + node->annotate_keys = std::move(annotate_keys); + node->annotate_values = std::move(annotate_values); + return Expr(node); +} + +Stmt StreamStmt::make(VarExpr buffer_var, Expr value, StreamType stream_type, int depth) { + internal_assert(value.defined()) << "The stream-in value not defined\n"; + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + + std::shared_ptr node = std::make_shared(); + node->buffer_var = std::move(buffer_var); + node->value = std::move(value); + node->depth = depth; + node->stream_type = stream_type; + return Stmt(node); +} + +Stmt StreamStmt::make(VarExpr 
buffer_var, Expr value, StreamType stream_type, int depth, + Array annotate_keys, Array annotate_values) { + internal_assert(value.defined()) << "The stream-in value not defined\n"; + internal_assert(depth>= 1) << "The stream channel depth must be larger than 1\n"; + internal_assert(annotate_keys.size() == annotate_values.size()) << + "Length of annotate keys and annotate values not equal"; + + std::shared_ptr node = std::make_shared(); + node->buffer_var = std::move(buffer_var); + node->value = std::move(value); + node->depth = depth; + node->stream_type = stream_type; + node->annotate_keys = std::move(annotate_keys); + node->annotate_values = std::move(annotate_values); + return Stmt(node); +} + Stmt Stencil::make(Array inputs, Array outputs, Stmt body, int burst_width, int unroll_factor, int num_iteration) { internal_assert(body.defined()) << "Stencil of undefined body\n"; @@ -884,6 +950,8 @@ template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v-> template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Reuse *)this, s); } template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Partition *)this, s); } template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const Stencil *)this, s); } +template<> void StmtNode::accept(IRVisitor *v, const Stmt &s) const { v->visit((const StreamStmt *)this, s); } +template<> void ExprNode::accept(IRVisitor *v, const Expr &e) const { v->visit((const StreamExpr *)this, e); } Call::ConstString Call::debug_to_file = "debug_to_file"; Call::ConstString Call::reinterpret = "reinterpret"; diff --git a/tvm/HalideIR/src/ir/IR.h b/tvm/HalideIR/src/ir/IR.h index fae48da29..e8a8835bf 100644 --- a/tvm/HalideIR/src/ir/IR.h +++ b/tvm/HalideIR/src/ir/IR.h @@ -1049,19 +1049,29 @@ struct Quantize : public ExprNode { /** The imperative function definition */ struct KernelDef : public StmtNode { Array args; + Array> api_args; + Array api_types; Stmt body; Expr ret_void; Type ret_type; std::string name; + // args to stream data + Array channels; - EXPORT static Stmt make(Array args, Stmt body, Expr ret_void, Type ret_type, std::string name); + EXPORT static Stmt make(Array args, Array> api_args, + Array api_types, Stmt body, Expr ret_void, + Type ret_type, std::string name, + Array channels); void VisitAttrs(IR::AttrVisitor* v) final { v -> Visit("args", &args); + v -> Visit("api_args", &api_args); + v -> Visit("api_types", &api_types); v -> Visit("body", &body); v -> Visit("ret_void", &ret_void); v -> Visit("ret_type", &ret_type); v -> Visit("name", &name); + v -> Visit("channels", &channels); } static const IRNodeType _type_info = IRNodeType::KernelDef; static constexpr const char* _type_key = "KernelDef"; @@ -1170,6 +1180,70 @@ struct Partition : public StmtNode { static constexpr const char* _type_key = "Partition"; }; +struct StreamStmt : public StmtNode { + VarExpr buffer_var; + Expr value; + int depth; + StreamType stream_type; + Array annotate_keys; + Array annotate_values; + + EXPORT static Stmt make(VarExpr buffer_var, + Expr value, + StreamType stream_type, + int depth); + + EXPORT static Stmt make(VarExpr buffer_var, + Expr value, + StreamType stream_type, + int depth, + Array annotate_keys, + Array annotate_values); + + void VisitAttrs(IR::AttrVisitor* v) final { + v -> Visit("buffer_var", &buffer_var); + v -> Visit("value", &value); + v -> Visit("depth", &depth); + v -> Visit("stream_type", &stream_type); + v -> Visit("annotate_keys", &annotate_keys); + v -> 
Visit("annotate_values", &annotate_values); + } + + static const IRNodeType _type_info = IRNodeType::StreamStmt; + static constexpr const char* _type_key = "StreamStmt"; +}; + +struct StreamExpr : public ExprNode { + VarExpr buffer_var; // var loaded + int depth; + StreamType stream_type; + Array annotate_keys; + Array annotate_values; + + EXPORT static Expr make(Type type, + VarExpr buffer_var, + StreamType stream_type, + int depth); + + EXPORT static Expr make(Type type, + VarExpr buffer_var, + StreamType stream_type, + int depth, + Array annotate_keys, + Array annotate_values); + + void VisitAttrs(IR::AttrVisitor* v) final { + v -> Visit("dtype", &type); + v -> Visit("buffer_var", &buffer_var); + v -> Visit("depth", &depth); + v -> Visit("stream_type", &stream_type); + v -> Visit("annotate_keys", &annotate_keys); + v -> Visit("annotate_values", &annotate_values); + } + static const IRNodeType _type_info = IRNodeType::StreamExpr; + static constexpr const char* _type_key = "StreamExpr"; +}; + struct Stencil : public StmtNode { Array inputs; Array outputs; diff --git a/tvm/HalideIR/src/ir/IREquality.cpp b/tvm/HalideIR/src/ir/IREquality.cpp index 9e5798fbb..46590056e 100644 --- a/tvm/HalideIR/src/ir/IREquality.cpp +++ b/tvm/HalideIR/src/ir/IREquality.cpp @@ -80,6 +80,7 @@ class IRComparer : public IRVisitor { void visit(const Call *, const Expr &); void visit(const Let *, const Expr &); void visit(const Shuffle *, const Expr &); + void visit(const StreamExpr *, const Expr &); void visit(const LetStmt *, const Stmt &); void visit(const AttrStmt *, const Stmt &); void visit(const AssertStmt *, const Stmt &); @@ -488,6 +489,11 @@ void IRComparer::visit(const Shuffle *op, const Expr &expr) { compare_expr_vector(e->indices, op->indices); } +void IRComparer::visit(const StreamExpr *op, const Expr &expr) { + const StreamExpr *node = expr_.as(); + compare_node_refs(op->buffer_var, node->buffer_var); +} + } // namespace diff --git a/tvm/HalideIR/src/ir/IRMutator.cpp b/tvm/HalideIR/src/ir/IRMutator.cpp index 13b346e93..fbd3e82b5 100644 --- a/tvm/HalideIR/src/ir/IRMutator.cpp +++ b/tvm/HalideIR/src/ir/IRMutator.cpp @@ -480,7 +480,8 @@ void IRMutator::visit(const KernelDef *op, const Stmt &s) { stmt = s; } else { - stmt = KernelDef::make(op->args, body, ret_void, op->ret_type, op->name); + stmt = KernelDef::make(op->args, op->api_args, op->api_types, + body, ret_void, op->ret_type, op->name, op->channels); } } @@ -524,6 +525,20 @@ void IRMutator::visit(const KernelStmt *op, const Stmt &s) { } } +void IRMutator::visit(const StreamStmt *op, const Stmt &s) { + Expr value = mutate(op->value); + if (value.same_as(op->value)) { + stmt = s; + } else { + stmt = StreamStmt::make(op->buffer_var, value, + op->stream_type, op->depth); + } +} + +void IRMutator::visit(const StreamExpr *op, const Expr &e) { + expr = e; +} + void IRMutator::visit(const Return *op, const Stmt &s) { Expr value = mutate(op->value); if (value.same_as(op->value)) { diff --git a/tvm/HalideIR/src/ir/IRMutator.h b/tvm/HalideIR/src/ir/IRMutator.h index 1fea5fec6..4088ae5ea 100644 --- a/tvm/HalideIR/src/ir/IRMutator.h +++ b/tvm/HalideIR/src/ir/IRMutator.h @@ -99,6 +99,8 @@ class IRMutator : public IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); }; diff --git 
a/tvm/HalideIR/src/ir/IRPrinter.cpp b/tvm/HalideIR/src/ir/IRPrinter.cpp index 6a3a5d651..b6f3e6082 100644 --- a/tvm/HalideIR/src/ir/IRPrinter.cpp +++ b/tvm/HalideIR/src/ir/IRPrinter.cpp @@ -336,6 +336,19 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) } }); +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const StreamStmt *op, IRPrinter* p) { + p->do_indent(); + p->stream << op->buffer_var << ".write("; + p->print(op->value); + p->stream << ")\n"; +}); + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const StreamExpr *op, IRPrinter* p) { + p->stream << op->buffer_var << ".read()"; +}); + TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch([](const Ramp *op, IRPrinter* p) { p->stream << "ramp("; @@ -723,7 +736,16 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->do_indent(); p->stream << "def " << op->name << "("; for (size_t i = 0; i < op->args.size(); i++) { + p->stream << op->args[i].type() << "("; // handle type p->print(op->args[i]); + if (op->api_args[i].size() > 1) { + p->stream << "["; + for (size_t j = 0; j < op->api_args[i].size(); j++) { + p->print(op->api_args[i][j]); + if (j < op->api_args[i].size() - 1) p->stream << "*"; + } + p->stream << "])"; + } if (i < op->args.size() - 1) { p->stream << ", "; } diff --git a/tvm/HalideIR/src/ir/IRVisitor.cpp b/tvm/HalideIR/src/ir/IRVisitor.cpp index 02880fdb4..30e1fe86b 100644 --- a/tvm/HalideIR/src/ir/IRVisitor.cpp +++ b/tvm/HalideIR/src/ir/IRVisitor.cpp @@ -137,6 +137,9 @@ void IRVisitor::visit(const Let *op, const Expr &) { op->body.accept(this); } +void IRVisitor::visit(const StreamExpr *op, const Expr &) { +} + void IRVisitor::visit(const LetStmt *op, const Stmt &) { op->value.accept(this); op->body.accept(this); @@ -169,6 +172,10 @@ void IRVisitor::visit(const Store *op, const Stmt &) { op->predicate.accept(this); } +void IRVisitor::visit(const StreamStmt *op, const Stmt &) { + op->value.accept(this); +} + void IRVisitor::visit(const Provide *op, const Stmt &) { op->value.accept(this); for (size_t i = 0; i < op->args.size(); i++) { @@ -266,6 +273,10 @@ void IRVisitor::visit(const Quantize *op, const Expr &) { void IRVisitor::visit(const KernelDef *op, const Stmt &) { for (size_t i = 0; i < op->args.size(); i++) { op->args[i].accept(this); + op->api_types[i].accept(this); + for (size_t j = 0; j < op->api_args[i].size(); j++) { + op->api_args[i][j].accept(this); + } } op->ret_void.accept(this); } @@ -574,6 +585,10 @@ void IRGraphVisitor::visit(const Quantize *op, const Expr &) { void IRGraphVisitor::visit(const KernelDef *op, const Stmt &) { for (size_t i = 0; i < op->args.size(); i++) { include(op->args[i]); + include(op->api_types[i]); + for (size_t j = 0; j < op->api_args[i].size(); j++) { + include(op->api_args[i][j]); + } } include(op->ret_void); } @@ -607,6 +622,12 @@ void IRGraphVisitor::visit(const Reuse *op, const Stmt &) { void IRGraphVisitor::visit(const Partition *op, const Stmt &) {} +void IRGraphVisitor::visit(const StreamExpr *op, const Expr &) {} + +void IRGraphVisitor::visit(const StreamStmt *op, const Stmt &) { + include(op->value); +} + void IRGraphVisitor::visit(const Stencil *op, const Stmt &) { include(op->body); } diff --git a/tvm/HalideIR/src/ir/IRVisitor.h b/tvm/HalideIR/src/ir/IRVisitor.h index 931f1c5c9..a4faa4aba 100644 --- a/tvm/HalideIR/src/ir/IRVisitor.h +++ b/tvm/HalideIR/src/ir/IRVisitor.h @@ -79,6 +79,8 @@ class IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, 
const Stmt &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); }; /** A base class for algorithms that walk recursively over the IR @@ -159,6 +161,8 @@ class IRGraphVisitor : public IRVisitor { EXPORT virtual void visit(const Reuse *, const Stmt &); EXPORT virtual void visit(const Partition *, const Stmt &); EXPORT virtual void visit(const Stencil *, const Stmt &); + EXPORT virtual void visit(const StreamExpr *, const Expr &); + EXPORT virtual void visit(const StreamStmt *, const Stmt &); // @} }; diff --git a/tvm/Makefile b/tvm/Makefile index 1a78cbe7c..1b2030645 100644 --- a/tvm/Makefile +++ b/tvm/Makefile @@ -126,6 +126,13 @@ else CFLAGS += -DTVM_OPENCL_RUNTIME=0 endif +ifeq ($(USE_SDACCEL_HLS), 1) + CFLAGS += -DHCL_SDACCEL_RUNTIME=1 +else + CFLAGS += -DHCL_SDACCEL_RUNTIME=0 +endif + + ifeq ($(USE_VIVADO_HLS), 1) CFLAGS += -DHCL_VHLS_RUNTIME=1 else diff --git a/tvm/include/tvm/codegen.h b/tvm/include/tvm/codegen.h index 3877db941..4d6be0230 100644 --- a/tvm/include/tvm/codegen.h +++ b/tvm/include/tvm/codegen.h @@ -42,6 +42,7 @@ runtime::Module Build(const Array& funcs, * \return cstr The C string representation of the file. */ std::string PackImportsToC(const runtime::Module& m, bool system_lib); + } // namespace codegen } // namespace TVM diff --git a/tvm/include/tvm/ir.h b/tvm/include/tvm/ir.h index e66db3fb4..8a26e551c 100644 --- a/tvm/include/tvm/ir.h +++ b/tvm/include/tvm/ir.h @@ -21,6 +21,8 @@ using Halide::Internal::StmtNode; using Halide::Internal::IRNodeType; using Halide::Internal::ForType; using Halide::Internal::PartitionType; +using Halide::Internal::StreamType; +using Halide::Internal::DeviceType; using Halide::DeviceAPI; // Node container for CommReducer @@ -232,6 +234,8 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope"; constexpr const char* opengl_stage_scope = "opengl_stage_scope"; constexpr const char* attach_scope = "attach_scope"; + +constexpr const char* device_scope = "device_scope"; } // namespace attr /*! \brief namespace of TVM Intrinsic functions */ @@ -501,6 +505,8 @@ using Halide::Internal::Quantize; using Halide::Internal::KernelDef; using Halide::Internal::KernelExpr; using Halide::Internal::KernelStmt; +using Halide::Internal::StreamExpr; +using Halide::Internal::StreamStmt; using Halide::Internal::Return; using Halide::Internal::Break; using Halide::Internal::While; diff --git a/tvm/include/tvm/ir_functor_ext.h b/tvm/include/tvm/ir_functor_ext.h index c4f18ba7e..39ce6d2b8 100644 --- a/tvm/include/tvm/ir_functor_ext.h +++ b/tvm/include/tvm/ir_functor_ext.h @@ -148,6 +148,7 @@ class ExprFunctor { virtual R VisitExpr_(const SetSlice* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const Quantize* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const KernelExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT; + virtual R VisitExpr_(const StreamExpr* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExprDefault_(const Node* op, Args ...) { LOG(FATAL) << "Do not have a default for " << op->type_key(); return R(); @@ -193,6 +194,7 @@ class ExprFunctor { IR_EXPR_FUNCTOR_DISPATCH(SetSlice); IR_EXPR_FUNCTOR_DISPATCH(Quantize); IR_EXPR_FUNCTOR_DISPATCH(KernelExpr); + IR_EXPR_FUNCTOR_DISPATCH(StreamExpr); return vtable; } }; @@ -244,6 +246,7 @@ class StmtFunctor { virtual R VisitStmt_(const Evaluate* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const KernelDef* op, Args... 
args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const KernelStmt* op, Args... args) STMT_FUNCTOR_DEFAULT; + virtual R VisitStmt_(const StreamStmt* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const Return* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const Break* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const While* op, Args... args) STMT_FUNCTOR_DEFAULT; @@ -275,6 +278,7 @@ IR_STMT_FUNCTOR_DISPATCH(Evaluate); IR_STMT_FUNCTOR_DISPATCH(KernelDef); IR_STMT_FUNCTOR_DISPATCH(KernelStmt); + IR_STMT_FUNCTOR_DISPATCH(StreamStmt); IR_STMT_FUNCTOR_DISPATCH(Return); IR_STMT_FUNCTOR_DISPATCH(Break); IR_STMT_FUNCTOR_DISPATCH(While); diff --git a/tvm/include/tvm/ir_mutator.h b/tvm/include/tvm/ir_mutator.h index 964684ec1..200534644 100644 --- a/tvm/include/tvm/ir_mutator.h +++ b/tvm/include/tvm/ir_mutator.h @@ -77,6 +77,7 @@ class TVM_DLL IRMutator { virtual Stmt Mutate_(const Reuse* op, const Stmt& s); virtual Stmt Mutate_(const Partition* op, const Stmt& s); virtual Stmt Mutate_(const Stencil* op, const Stmt& s); + virtual Stmt Mutate_(const StreamStmt* op, const Stmt& s); virtual Expr Mutate_(const Variable* op, const Expr& e); virtual Expr Mutate_(const Load* op, const Expr& e); @@ -114,6 +115,7 @@ class TVM_DLL IRMutator { virtual Expr Mutate_(const SetSlice* op, const Expr& e); virtual Expr Mutate_(const Quantize* op, const Expr& e); virtual Expr Mutate_(const KernelExpr* op, const Expr& e); + virtual Expr Mutate_(const StreamExpr* op, const Expr& e); }; /*! diff --git a/tvm/include/tvm/ir_pass.h b/tvm/include/tvm/ir_pass.h index 88c29f32c..dfba91d32 100644 --- a/tvm/include/tvm/ir_pass.h +++ b/tvm/include/tvm/ir_pass.h @@ -214,6 +214,14 @@ Stmt StorageFlatten(Stmt stmt, */ Stmt RemoveNoOp(Stmt stmt); +/*! + * \brief Infer device scope. + * \param stmt The stmt to be transformed + * \param bus_bandwidth The bandwidth of the stream bus + * \return Transformed stmt. + */ +Stmt InferStream(Stmt stmt, int bus_bandwidth); +
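+// A sketch of the assumed call site (illustrative; the exact pipeline
+// position is not fixed by this header): InferStream would run as one more
+// Stmt-to-Stmt lowering pass,
+//   stmt = ir::InferStream(stmt, /*bus_bandwidth=*/32);
+// so that device_scope attributes and StreamExpr/StreamStmt nodes are
+// settled before host/xcel code generation; it is exposed to Python via
+// REGISTER_PASS2(InferStream) in api_pass.cc.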
/*! * \brief Split statement into pipeline stages. * \param stmt The stmt to be split diff --git a/tvm/include/tvm/ir_visitor.h b/tvm/include/tvm/ir_visitor.h index 6fe616aab..21ef77c32 100644 --- a/tvm/include/tvm/ir_visitor.h +++ b/tvm/include/tvm/ir_visitor.h @@ -131,6 +131,8 @@ class TVM_DLL IRVisitor { virtual void Visit_(const KernelDef* op); virtual void Visit_(const KernelExpr* op); virtual void Visit_(const KernelStmt* op); + virtual void Visit_(const StreamExpr* op); + virtual void Visit_(const StreamStmt* op); virtual void Visit_(const Return* op); virtual void Visit_(const Break* op); virtual void Visit_(const While* op); diff --git a/tvm/include/tvm/schedule.h b/tvm/include/tvm/schedule.h index 9dc1956c8..faacc7d96 100644 --- a/tvm/include/tvm/schedule.h +++ b/tvm/include/tvm/schedule.h @@ -351,11 +351,31 @@ class Schedule : public NodeRef { const IterVar& axis, int factor_axis = 0); - EXPORT Tensor reuse_at(const Tensor& target, - Stage parent, + EXPORT Tensor reuse_at(const Tensor& target, + Stage parent, IterVar axis, std::string name); + EXPORT void to_stage(const Tensor& target, + Stage dest, + int arg_pos, + ir::StreamType stream_type, + int channel_depth, + std::string name); + + EXPORT Tensor move_to(const Tensor& target, + ir::DeviceType device_type, + ir::StreamType stream_type, + int channel_depth, + std::string new_name); + + EXPORT void stream_to(const Tensor& target, + Stage dest, + Stage source, + ir::StreamType stream_type, + int channel_depth, + std::string new_name); + EXPORT Tensor partition(const Tensor& target, int dim, int factor, ir::PartitionType partition_type); @@ -381,6 +401,8 @@ class Schedule : public NodeRef { inline ScheduleNode* operator->(); // declare container type using ContainerType = ScheduleNode; + // insertion point for host & xcel separation + static int split_bound; }; /*! 
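+// The three new schedule primitives above back the Python-side `.to()` API
+// exercised in samples/stream/stream.py (assumed mapping):
+//   s.to(tensor, target.xcel)        -> Schedule::move_to
+//   s.to(tensor, s[kernel])          -> Schedule::to_stage
+//   s.to(c, dest, src, depth=10)     -> Schedule::stream_to
+// matching the _ScheduleMove / _ScheduleMoveToStage / _ScheduleStream
+// bindings registered in api_lang.cc.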
diff --git a/tvm/src/api/api_ir.cc b/tvm/src/api/api_ir.cc index 825f7580d..8edb1a0e8 100644 --- a/tvm/src/api/api_ir.cc +++ b/tvm/src/api/api_ir.cc @@ -176,6 +176,20 @@ TVM_REGISTER_API("make.Select") *ret = Node::make(args[0], args[1], args[2], args[3], args[4], args[5]); \ }) \ +#define REGISTER_MAKE7(Node) \ + TVM_REGISTER_API("make."#Node) \ + .set_body([](TVMArgs args, TVMRetValue *ret) { \ + *ret = Node::make(args[0], args[1], args[2], args[3], \ + args[4], args[5], args[6]); \ + }) \ + +#define REGISTER_MAKE8(Node) \ + TVM_REGISTER_API("make."#Node) \ + .set_body([](TVMArgs args, TVMRetValue *ret) { \ + *ret = Node::make(args[0], args[1], args[2], args[3], \ + args[4], args[5], args[6], args[7]); \ + }) \ + #define REGISTER_MAKE_BINARY_OP(Node) \ TVM_REGISTER_API("make."#Node) \ .set_body([](TVMArgs args, TVMRetValue *ret) { \ @@ -222,7 +236,7 @@ REGISTER_MAKE3(GetSlice); REGISTER_MAKE3(SetBit); REGISTER_MAKE4(SetSlice); REGISTER_MAKE2(Quantize); -REGISTER_MAKE5(KernelDef); +REGISTER_MAKE8(KernelDef); REGISTER_MAKE3(KernelExpr); REGISTER_MAKE2(KernelStmt); REGISTER_MAKE1(Return); diff --git a/tvm/src/api/api_lang.cc b/tvm/src/api/api_lang.cc index f07d590a5..543e816aa 100644 --- a/tvm/src/api/api_lang.cc +++ b/tvm/src/api/api_lang.cc @@ -461,6 +461,31 @@ TVM_REGISTER_API("_SchedulePartition") static_cast(args[4].operator int())); }); +TVM_REGISTER_API("_ScheduleMoveToStage") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Schedule() + .to_stage(args[1], args[2], args[3], + static_cast(args[4].operator int()), + args[5], args[6]); + }); + +TVM_REGISTER_API("_ScheduleMove") + .set_body([](TVMArgs args, TVMRetValue *ret) { + *ret = args[0].operator Schedule() + .move_to(args[1], + static_cast(args[2].operator int()), + static_cast(args[3].operator int()), + args[4], args[5]); + }); + +TVM_REGISTER_API("_ScheduleStream") + .set_body([](TVMArgs args, TVMRetValue *ret) { + args[0].operator Schedule() + .stream_to(args[1], args[2], args[3], + static_cast(args[4].operator int()), + args[5], args[6]); + }); + TVM_REGISTER_API("_ScheduleReshape") .set_body([](TVMArgs args, TVMRetValue *ret) { args[0].operator Schedule().reshape(args[1], args[2]); diff --git a/tvm/src/api/api_pass.cc b/tvm/src/api/api_pass.cc index 348b8816e..1728b0c23 100644 --- a/tvm/src/api/api_pass.cc +++ b/tvm/src/api/api_pass.cc @@ -122,6 +122,7 @@ REGISTER_PASS1(InjectPrefetch); REGISTER_PASS2(InjectDoubleBuffer); REGISTER_PASS2(LoopPartition); REGISTER_PASS1(RemoveNoOp); +REGISTER_PASS2(InferStream); REGISTER_PASS2(SplitPipeline); REGISTER_PASS2(LiftAttrScope); REGISTER_PASS1(NarrowChannelAccess); diff --git a/tvm/src/codegen/build_common.cc b/tvm/src/codegen/build_common.cc new file mode 100644 index 000000000..8bdbf7e98 --- /dev/null +++ b/tvm/src/codegen/build_common.cc @@ -0,0 +1,220 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file build_common.cc + * \brief Build unified simulation module + */ +#include +#include +#include +#include +#include +#include +#include "./build_common.h" +#include "./build_util.h" + +#include +#include +#include +#include +#include + +#include "merlinc/codeanalys_merlinc.h" +#include "hlsc/codegen_vhls.h" +#include "opencl/codegen_aocl.h" +#include "ppac/codegen_rv64_ppac.h" + +namespace TVM { +namespace runtime { + +class SimModuleNode final : public ModuleNode { + public: + SimModuleNode(LoweredFunc func, + std::string host_code, + argInfo arg_info, + std::string dev_code, std::string platform, + std::unordered_map options) + : func_(func), + host_(host_code), + arg_info_(arg_info), + dev_(dev_code), + platform_(platform), + options_(options) { + } + + const char* type_key() const { + return "unified_sim"; + } + + // unified simulation function + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv){ + + if (args.size() != (int)func_->args.size()) + LOG(FATAL) << "The function should take in " << func_->args.size() + << " inputs but get " << args.size(); + std::vector shmids; + std::vector arg_sizes; + std::vector arg_types; + + CollectArgInfo(args, func_, arg_sizes, arg_types); + GenSharedMem(args, shmids, arg_sizes); + + LOG(CLEAN) << "Generating harness files ..."; + system("rm -rf __tmp__; mkdir __tmp__"); + std::string path; + if (const auto* f = Registry::Get("get_util_path")) + path = (*f)(platform_).operator std::string(); + system(("cp -r " + path + "/* __tmp__/").c_str()); + LOG(CLEAN) << "Running SW simulation on " + platform_; + + if (platform_ == "sdaccel") { + GenWrapperCode(args, shmids, arg_types, arg_info_, func_); + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + GenKernelCode(dev_); + + LOG(CLEAN) << "Running SW simulation ..."; + system("cd __tmp__; source ./run_sw.sh"); + + } else if (platform_ == "rocket") { + // generate host and run proxy kernel test + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + std::string compile = "cd __tmp__;"; + compile += std::string("autoconf; mkdir build; cd build;") + + std::string("../configure --with-riscvtools=") + + options_["RISCV"] + std::string(";make -j8"); + system(compile.c_str()); + + } else if (platform_ == "vivado_hls") { + GenHostCode(args, shmids, arg_types, func_, + platform_, host_, arg_info_); + GenKernelCode(dev_); + system("cd __tmp__; make csim"); + } else { + LOG(FATAL) << "unrecognized platform " << platform_; + } + + // clean & extract resource information + FreeSharedMem(args, shmids, arg_sizes); + if (const auto* f = Registry::Get("tvm_callback_syn_postproc")) { + std::string code; + code = (*f)("test").operator std::string(); + LOG(CLEAN) << "extract res info"; + } + }); + } + + private: + LoweredFunc func_; + std::string host_; + argInfo arg_info_; + std::string dev_; + std::string platform_; + std::unordered_map options_; +}; + +using var2nameType = std::unordered_map>>; + +Module CreateSimModule( + LoweredFunc func, + std::string host_code, + std::string dev_code, + argInfo arg_types, + std::string platform, + std::unordered_map options) { + std::shared_ptr n = + std::make_shared(func, host_code, + arg_types, dev_code, + platform, options); + return Module(n); +} +} // namespace runtime + +namespace codegen { +using var2nameType = std::unordered_map>>; + +using argInfo = + std::vector>>; + +// 
unified simulation function for diff platforms +template +runtime::Module BuildSimModule(Array funcs, + Array attrs, + Array values) { + CodeAnalysMerlinC ca; + CGHost cg_host; + CGXcel cg_dev; + + for (LoweredFunc f : funcs) { + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + cg_host.AddFunction(f, map_arg_type); + cg_dev.AddFunction(f, map_arg_type); + } + // vector {vars} + auto& arg_vars = cg_dev.arg_vars; + // map {var : is_streamed(bool) } + auto& stream_table = cg_dev.stream_table; + // map {var : (vid, Type, shape)} + auto& arg_top_vars = cg_dev.arg_top_vars; + + argInfo arg_info; + for (size_t i = 0 ; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + auto nameType = arg_top_vars[v]; + bool is_stream; + if (stream_table[v]) + is_stream = true; + else is_stream = false; + auto item = std::make_tuple( + /*var name*/std::get<0>(nameType), + /*whether is streamed*/is_stream, + /*data type*/std::get<1>(nameType), + /*shape*/std::get<2>(nameType)); + arg_info.push_back(item); + } + // tool option mapping and platform + std::string platform = values[0].as()->value; + std::unordered_map options; + for (size_t k = 1; k < attrs.size(); k++) { + auto key = attrs[k].as()->value; + auto val = values[k].as()->value; + options[key] = val; + } + return runtime::CreateSimModule(funcs[0], + cg_host.GetHost(), + cg_dev.GetDevice(), + arg_info, platform, options); +} + +TVM_REGISTER_API("codegen.build_sim") +.set_body([](TVMArgs args, TVMRetValue* rv) { + // dispatch to corr codegen + auto& sptr = args[2].node_sptr(); + CHECK(sptr->is_type()); + auto* n = static_cast(sptr.get()); + auto data = n->data[static_cast(0)]; + + // create module node for simulation + std::string type = Expr(data).as()->value; + if (type == "rocket") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else if (type == "sdaccel") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else if (type == "vivado_hls") { + *rv = BuildSimModule + (args[0], args[1], args[2]); + } else { + } + }); + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/build_common.h b/tvm/src/codegen/build_common.h index ee8cbc509..f9f42d219 100644 --- a/tvm/src/codegen/build_common.h +++ b/tvm/src/codegen/build_common.h @@ -29,6 +29,7 @@ ExtractFuncInfo(const Array& funcs) { } return fmap; } + } // namespace codegen } // namespace TVM #endif // TVM_CODEGEN_BUILD_COMMON_H_ diff --git a/tvm/src/codegen/build_opencl.cc b/tvm/src/codegen/build_opencl.cc deleted file mode 100644 index 5054085cd..000000000 --- a/tvm/src/codegen/build_opencl.cc +++ /dev/null @@ -1,44 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * Build opencl modules from source. 
- * \file build_opencl.cc - */ -#include -#include -#include "./codegen_opencl.h" -#include "./build_common.h" - -#if TVM_OPENCL_RUNTIME -#include "../runtime/opencl/opencl_module.h" -#endif // TVM_OPENCL_RUNTIME - -namespace TVM { -namespace codegen { - -runtime::Module BuildOpenCL(Array funcs) { - using TVM::runtime::Registry; - bool output_ssa = false; - CodeGenOpenCL cg; - cg.Init(output_ssa); - for (LoweredFunc f : funcs) { - cg.AddFunction(f); - } - std::string code = cg.Finish(); - - if (const auto* f = Registry::Get("tvm_callback_opencl_postproc")) { - code = (*f)(code).operator std::string(); - } -#if TVM_OPENCL_RUNTIME - return OpenCLModuleCreate(code, "cl", ExtractFuncInfo(funcs)); -#else - LOG(WARNING) << "OpenCL runtime not enabled, return a source module..."; - return DeviceSourceModuleCreate(code, "cl", ExtractFuncInfo(funcs), "opencl"); -#endif // TVM_OPENCL_RUNTIME -} - -TVM_REGISTER_API("codegen.build_opencl") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildOpenCL(args[0]); - }); -} // namespace codegen -} // namespace TVM diff --git a/tvm/src/codegen/build_util.cc b/tvm/src/codegen/build_util.cc new file mode 100644 index 000000000..e0a5f8b2d --- /dev/null +++ b/tvm/src/codegen/build_util.cc @@ -0,0 +1,812 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file build_common.cc + * \brief Build unified simulation module + */ +#include +#include +#include +#include +#include +#include +#include "./build_common.h" +#include "./build_util.h" + +#include +#include +#include +#include +#include + +#include "merlinc/codeanalys_merlinc.h" +#include "hlsc/codegen_vhls.h" +#include "opencl/codegen_aocl.h" +#include "ppac/codegen_rv64_ppac.h" + +namespace TVM { +namespace runtime { + +std::string getpath(void) { + char buff[256]; + getcwd(buff, 256); + std::string cwd(buff); + return cwd; +} + +void PrintIndent(std::ofstream& stream, int indent) { + for (int i = 0; i < indent; i++) + stream << ' '; +} + +inline size_t GetTypeSize(TVMType t) { + size_t byte = (t.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + return byte; +} + +inline size_t GetDataSize(TVMArray* arr) { + size_t size = 1; + for (tvm_index_t i = 0; i < arr->ndim; ++i) { + size *= arr->shape[i]; + } + size_t byte = (arr->dtype.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + size *= (byte * 8 * arr->dtype.lanes + 7) / 8; + return size; +} + +inline TVMType Type2TVMType(Type t) { + TVMType tt; + if (t.is_int()) tt.code = kDLInt; + else if (t.is_uint()) tt.code = kDLUInt; + else if (t.is_float()) tt.code = kDLFloat; + else LOG(FATAL) << "Unacceptable type: " << t; + tt.bits = static_cast(t.bits()); + tt.fracs = static_cast(t.fracs()); + return tt; +} + +inline std::string PrintHalideType(Type t) { + std::string str = ""; + if (t.is_uint() || t.is_int() || t.is_fixed() || t.is_ufixed()) { + if (t.is_uint()) str += "ap_uint<" + std::to_string(t.bits()) + ">"; + else if (t.is_int()) str += "ap_int<" + std::to_string(t.bits()) + ">"; + else if (t.is_ufixed()) str += "ap_ufixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">"; + else str += "ap_fixed<" + std::to_string(t.bits()) + ", " + std::to_string(t.bits() - t.fracs()) + ">"; + } else { + LOG(FATAL) << "Cannot convert type " << t << " to C type"; + } + return str; +} + +inline std::string Type2Str(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += 
"ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits - t.fracs)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits - t.fracs)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2ExtStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += "ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2WrapStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) { + str += "ap_fixed<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + } else { + str += "ap_int<"; + if (t.bits <= 8) str += std::to_string(static_cast(t.bits)); + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + } + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) { + str += "ap_ufixed<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + } else { + str += "ap_uint<"; + if (t.bits <= 8) str += std::to_string(static_cast(t.bits)); + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + } + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2Byte(TVMType t) { + std::string str = ""; + if (t.code == kDLFloat) { + str += "float"; + } else if (t.code == kDLInt || t.code == kDLUInt) { + if (t.code == kDLUInt) str += "u"; + str += "int"; + if (t.bits <= 8) str += "8"; + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + str += "_t"; + } + return str; +} + +void CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + arg_sizes.push_back(GetDataSize(arr)); + arg_types.push_back(arr->dtype); + } else { + const Variable* var = func->api_args[i].as(); + TVMType t = Type2TVMType(var->type); + arg_sizes.push_back(GetTypeSize(t)); + arg_types.push_back(t); + } + } +} + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + // generate shared memory key and id + // TODO: maybe get the current path?? 
+ key_t key = ftok("/", i+1); + int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT); + shmids.push_back(shmid); + // copy mem from TVM args to the shared memory + void* mem = shmat(shmid, nullptr, 0); + memcpy(mem, arr->data, arg_sizes[i]); + } else { + shmids.push_back(0); + } + } +} + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes) { + for (size_t i = 0; i < shmids.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + int shmid = shmids[i]; + void* mem = shmat(shmid, nullptr, 0); + memcpy(arr->data, mem, arg_sizes[i]); + shmdt(mem); + shmctl(shmid, IPC_RMID, nullptr); + } + } +} + +// copy values from the shared mem to local mem +void PrintCopy(TVMArray* arr, + argInfo& arg_info, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim - 1) { + PrintIndent(stream, indent); + stream << std::get<0>(arg_info[nth_arr]); + stream << "[i" << arr->ndim-1; + int mul2 = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + + stream << " = ("; + // stream << Type2ExtStr(arr->dtype); + stream << Type2Byte(arr->dtype); + + stream << ")(arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "])"; + if (arr->dtype.fracs > 0) + stream << " >> " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +// copy values from local mem back to shared mem +void PrintCopyBack(TVMArray* arr, + argInfo& arg_info, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = ("; + stream << Type2Byte(arr->dtype); + stream << ")(" << std::get<0>(arg_info[nth_arr]); + stream << "[i" << arr->ndim - 1; + int mul2 = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + + stream << "])"; + if (arr->dtype.fracs > 0) + stream << " << " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +void GenKernelCode(std::string test_file) { + std::ofstream stream; + stream.open("__tmp__/kernel.cpp"); + stream << test_file; + stream.close(); +} + +// interface pragma to specify mem and ctrl interface in sdx +void GenWrapperCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + argInfo& arg_stream_types, + LoweredFunc func) { + std::ofstream stream; + int indent = 0; + std::string path(getenv("PWD")); + stream.open("__tmp__/interface.cpp"); + stream << "#include \n"; + stream << "#include \"" + path + 
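`GenSharedMem` and `FreeSharedMem` move tensor data across the process boundary with System V shared memory: `ftok` derives a key from a path and the argument index, `shmget` allocates the segment, `shmat` maps it, and `shmctl(IPC_RMID)` releases it after the copy-back. A minimal self-contained round trip using the same calls (POSIX only):

```cpp
#include <sys/ipc.h>
#include <sys/shm.h>
#include <cstring>
#include <cstdio>

int main() {
  int data[4] = {1, 2, 3, 4};

  // Same pattern as GenSharedMem: key from a path + index, then allocate.
  key_t key = ftok("/", 1);
  int shmid = shmget(key, sizeof(data), 0666 | IPC_CREAT);

  void* mem = shmat(shmid, nullptr, 0);  // map the segment
  memcpy(mem, data, sizeof(data));       // host -> shared (GenSharedMem)

  int back[4];
  memcpy(back, mem, sizeof(data));       // shared -> host (FreeSharedMem)
  shmdt(mem);                            // unmap
  shmctl(shmid, IPC_RMID, nullptr);      // mark the segment for removal

  printf("%d %d %d %d\n", back[0], back[1], back[2], back[3]);
}
```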
"/__tmp__/kernel.cpp\"\n"; + stream << "\n\n"; + stream << "extern \"C\" \n"; + stream << "{\n"; + indent += 2; + PrintIndent(stream, indent); + + // wrapper func interface + stream << "void App( "; + size_t ex_arg_count = 0; + ex_arg_count = arg_stream_types.size() - arg_types.size(); + for (size_t i = 0; i < arg_types.size(); i++) { + if (i != 0) stream << ", "; + stream << Type2WrapStr(arg_types[i]); + stream << "*"; + stream << " source_wrapper_" << i; + } + for (size_t k = 0; k < ex_arg_count; k++) { + if (k != ex_arg_count) stream << ", "; + stream << PrintHalideType(std::get<2>(arg_stream_types[k + arg_types.size()])); + stream << "*"; + stream << " source_wrapper_" << k + arg_types.size(); + } + stream << " ) {\n"; + + // memeory and control pragma + for (size_t i = 0; i < arg_stream_types.size(); i++) { + std::string interface; + if (std::get<1>(arg_stream_types[i])) interface = " m_axi "; + else interface = " m_axi "; + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE" + interface + "port="; + stream << "source_wrapper_" << i; + stream << " offset=slave bundle=gmem\n"; + } + for (size_t i = 0; i < arg_stream_types.size(); i++) { + std::string interface; + if (std::get<1>(arg_stream_types[i])) interface = " s_axilite "; + else interface = " s_axilite "; + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE" + interface + "port="; + stream << "source_wrapper_" << i; + stream << " bundle=control\n"; + } + PrintIndent(stream, indent); + stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n"; + stream << "\n"; + + // intermediate vars init alloc + for (size_t i = 0; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << PrintHalideType(std::get<2>(arg_stream_types[i])); + stream << " source_wrapper_temp_" << i; + auto shape = std::get<3>(arg_stream_types[i]); + for (size_t j = 0; j < shape.size(); j++) + stream << "[" << shape[j] << "]"; + if (shape.size() == 0) stream << "[1]"; + stream << ";\n"; + } + + // vars init for values + for (size_t i = 0; i < arg_stream_types.size(); i++) { + auto shape = std::get<3>(arg_stream_types[i]); + for (size_t j = 0; j < shape.size(); j++) { + PrintIndent(stream, indent); + stream << "for (int i" << j << " = 0; "; + stream << "i" << j << " < " << shape[j] << "; "; + stream << "i" << j << "++) {\n"; + indent += 2; + if (j == shape.size() - 1) { + PrintIndent(stream, indent); + stream << "source_wrapper_temp_" << i; + for (size_t k = 0; k < shape.size(); k++) { + stream << "[i" << k << "]"; + } + stream << " = "; + stream << "source_wrapper_" << i; + stream << "[i" << shape.size() - 1; + int mul = 1; + for (size_t k = shape.size() - 1; k > 0; k--) { + mul *= shape[k]; + stream << "+ i" << k - 1 << "*" << mul; + } + stream << "];\n"; + } + } + for (size_t j = 0; j < shape.size(); j++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } + if (shape.size() == 0) { + PrintIndent(stream, indent); + stream << "source_wrapper_temp_" << i; + stream << "[0] = source_wrapper_" << i << "[0];\n"; + } + } + + // print top func + stream << "\n"; + PrintIndent(stream, indent); + stream << "top( "; + for (size_t i = 0;i < arg_stream_types.size(); i++) { + if (i != arg_stream_types.size() - 1){ + stream << "source_wrapper_temp_" << i; + stream << ", "; + } else { + stream << "source_wrapper_temp_" << i; + stream << ");\n"; + } + + } + stream << "\n"; + + // read back return val + for (int k = arg_stream_types.size() - 1; + k > args.size() - 2; k--) { + auto shape = 
std::get<3>(arg_stream_types[k]); + for (size_t i = 0; i < shape.size(); i++) { + PrintIndent(stream, indent); + stream << "for (int i" << i << " = 0; "; + stream << "i" << i << " < " << shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + + if (i == shape.size() - 1) { + PrintIndent(stream, indent); + stream << "source_wrapper_" << k; + stream << "[i" << shape.size() - 1; + int mul = 1; + for (size_t j = shape.size() - 1; j > 0; j--) { + mul *= shape[j]; + stream << " + i" << j - 1 << "*" << mul; + } + stream << " ] = "; + + stream << "source_wrapper_temp_" << k; + for (size_t j = 0; j < shape.size(); j++) { + stream << "[i" << j << "]"; + } + stream <<";\n"; + } + } + for (size_t i = 0;i < shape.size(); i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } + } + stream << "}\n"; + indent -= 2; + stream << "}\n"; + stream.close(); +} + +// generate opencl wrapper for sdaccel sim +void GenHostHeaders(std::ofstream& stream, + std::string platform) { + stream << "#include \n"; + stream << "#include \n\n"; + stream << "// standard C/C++ headers\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n\n"; + + if (platform == "sdaccel") { + stream << "// opencl harness headers\n"; + stream << "#include \"CLWorld.h\"\n"; + stream << "#include \"CLKernel.h\"\n"; + stream << "#include \"CLMemObj.h\"\n"; + stream << "#include \"utils.h\"\n"; + stream << "// harness namespace\n"; + stream << "using namespace rosetta;\n"; + } else if (platform == "vivado_hls") { + stream << "// vivado hls headers\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \"kernel.cpp\"\n\n"; + } +} + +// initialization before executing kernel +void KernelInit(std::ofstream& stream, + std::string platform, + TVMArgs& args, + const std::vector& arg_types, + argInfo& arg_stream_types) { + int indent = 2; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// parse command line arguments for opencl version\n"; + PrintIndent(stream, indent); + stream << "std::string kernelFile(\"\");\n"; + PrintIndent(stream, indent); + stream << "parse_sdaccel_command_line_args(argc, argv, kernelFile);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// create OpenCL world\n"; + PrintIndent(stream, indent); + stream << "CLWorld world = CLWorld(TARGET_DEVICE, CL_DEVICE_TYPE_ACCELERATOR);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add the bitstream file\n"; + PrintIndent(stream, indent); + stream << "dworld.addProgram(kernelFile);\n"; + stream << "\n\n"; + PrintIndent(stream, indent); + stream << "// create kernels\n"; + PrintIndent(stream, indent); + stream << "CLKernel App(world.getContext(), world.getProgram(), \"App\", world.getDevice());\n"; + stream << "\n\n"; + + PrintIndent(stream, indent); + stream << "// create mem objects\n"; + for (int i = 0;i < args.size(); i++) { + PrintIndent(stream, indent); + stream << "CLMemObj source_" << i; + stream << "((void*)arg_top_" << i; + stream << ", sizeof(" << Type2Byte(arg_types[i]) << "), "; + + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + if (j==0) { + stream << arr->shape[j] << " "; + } else { + stream << "* " << arr->shape[j]; + } + } + } else { + stream << "1"; + } + stream << ", "; + stream << "CL_MEM_READ_WRITE);\n"; + } + // additional streamed data + for (size_t k = 
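`KernelInit` writes the Rosetta OpenCL harness boilerplate: a `CLWorld` is created, the bitstream is attached, an `App` kernel is registered, and each top-level argument becomes a `CLMemObj`. A sketch of the emitted host fragment (harness classes come from the Rosetta headers listed in `GenHostHeaders`; the argument name and size are illustrative, and the program-add call is shown on the `world` instance created above it):

```cpp
// parse command line arguments for opencl version
std::string kernelFile("");
parse_sdaccel_command_line_args(argc, argv, kernelFile);

// create OpenCL world and add the bitstream file
CLWorld world = CLWorld(TARGET_DEVICE, CL_DEVICE_TYPE_ACCELERATOR);
world.addProgram(kernelFile);

// create kernels
CLKernel App(world.getContext(), world.getProgram(), "App", world.getDevice());

// create mem objects: one per top-level argument
CLMemObj source_0((void*)arg_top_0, sizeof(int32_t), 10, CL_MEM_READ_WRITE);
world.addMemObj(source_0);
```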
args.size(); k < arg_stream_types.size(); k++) { + auto type = std::get<2>(arg_stream_types[k]); + auto shape = std::get<3>(arg_stream_types[k]); + PrintIndent(stream, indent); + stream << "CLMemObj source_" << k; + stream << "((void*)knn_mat"; + stream << ", sizeof(" << Type2Byte(Type2TVMType(type)) << "), "; + if (shape.size() > 0) { + for (size_t j = 0; j < shape.size(); j++) { + if (j == 0) { + stream << shape[j] << " "; + } else { + stream << "* " << shape[j]; + } + } + } else { + stream << "1"; + } + stream << ", "; + stream << "CL_MEM_READ_WRITE);\n"; + } + + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add them to the world\n"; + for (size_t i = 0;i < arg_stream_types.size();i++) { + PrintIndent(stream, indent); + stream << "world.addMemObj(source_" << i; + stream << ");\n"; + } + + stream << "\n\n"; + PrintIndent(stream, indent); + stream << " // set work size\n"; + PrintIndent(stream, indent); + int size = arg_stream_types.size(); + std::string arr = "[" + std::to_string(size) + "] = {"; + for (int i = 0; i < size; i++) { + if (i != size -1) arr += "1, "; + else arr += "1};\n"; + } + stream << "int global_size" + arr; + PrintIndent(stream, indent); + stream << "int local_size" + arr; + PrintIndent(stream, indent); + stream << "App.set_global(global_size);\n"; + PrintIndent(stream, indent); + stream << "App.set_local(local_size);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// add them to the world\n"; + PrintIndent(stream, indent); + stream << "world.addKernel(App);\n"; + stream << "\n"; + PrintIndent(stream, indent); + stream << "// set kernel arguments\n"; + for (size_t i = 0; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << "world.setMemKernelArg(0, "<< i << ", " << i; + stream << ");\n"; + } + + stream << "\n"; + PrintIndent(stream, indent); + stream << "// run\n"; + PrintIndent(stream, indent); + stream << "world.runKernels();\n\n"; + PrintIndent(stream, indent); + stream << "// read the data back\n"; + for (size_t i = args.size() - 1; i < arg_stream_types.size(); i++) { + PrintIndent(stream, indent); + stream << "world.readMemObj(" << i << ");\n"; + } +} + +// generate host code according to platform type +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc lowered_func, + std::string platform, + std::string host_code, + argInfo& arg_info) { + int indent = 0; + std::ofstream stream; + stream.open("__tmp__/host.cpp"); + GenHostHeaders(stream, platform); + + stream << "int main(int argc, char ** argv) {\n"; + indent += 2; + + int cnt = 0; // label the constant value + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + // read from the shared memory + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << "* "; + stream << "arg_" << i << " = "; + stream << "(" << Type2Byte(arg_types[i]) << "*)"; + stream << "shmat(" << shmids[i] << ", nullptr, 0);\n"; + PrintIndent(stream, indent); + + stream << Type2Byte(arg_types[i]) << " "; + stream << std::get<0>(arg_info[i]); + TVMArray* arr = args[i]; + + stream << "["; + for (int j = 0; j < arr->ndim; j++) { + if (j == arr->ndim - 1) { + stream << arr->shape[j]; + } else { + stream << arr->shape[j]; + stream << " * "; + } + } + stream << "];\n"; + PrintCopy(arr, arg_info, stream, indent, i); + + } else { + // directly assign the value to the variable + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << " "; + stream << "arg_" << i << " = "; + stream 
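For an array argument, `GenHostCode` first attaches the shared segment created by `GenSharedMem`, then declares a flat local buffer and fills it with `PrintCopy` (applying the fixed-point shift when `fracs > 0`). The emitted `host.cpp` fragment for a 1-D int32 tensor of length 10 would look roughly like this (the shmid literal is illustrative):

```cpp
int32_t* arg_0 = (int32_t*)shmat(1234, nullptr, 0);  // attach shared segment
int32_t A[10];                                       // flat local buffer
for (size_t i0 = 0; i0 < 10; i0++) {
  A[i0] = (int32_t)(arg_0[i0]);                      // PrintCopy; no fracs shift here
}
```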
<< "(" << Type2Byte(arg_types[i]) << ")"; + if (args[i].type_code() == kDLInt || + args[i].type_code() == kDLUInt) { + stream << int64_t(args[i]); + } + stream << ";\n"; + PrintIndent(stream, indent); + stream << Type2Byte(arg_types[i]) << " "; + stream << "arg_top_" << i; + stream << "[1] = { "; + + stream << "arg_" << i << " }"; + if (arg_types[i].fracs > 0) + stream << " >> " << static_cast(arg_types[i].fracs); + stream << ";\n"; + cnt += 1; + } + stream << "\n"; + } + + // allocate mem for stream vars + for (size_t k = args.size(); k < arg_info.size(); k++) { + auto type = std::get<2>(arg_info[k]); + auto shape = std::get<3>(arg_info[k]); + PrintIndent(stream, indent); + stream << Type2Byte(Type2TVMType(type)) << " " << "name["; + if (shape.size() > 0) { + for (size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) + stream << shape[i] << " * "; + else stream << shape[i]; + } + } else { + stream << "1"; + } + stream << "];\n"; + } + + // generate host side (before kernel) + PrintIndent(stream, indent); + stream << "printf(\"Finished setting up shared memory\\n\");\n"; + PrintIndent(stream, indent); + stream << "// compute bofore kernel function\n"; + size_t pos = host_code.find("top("); + std::string pre_kernel = host_code.substr(0, pos -1); + std::string post_kernel = host_code.substr(host_code.find('\n', pos) + 1); + pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of("\n")); + pre_kernel = pre_kernel.substr(pre_kernel.find_first_not_of(" ")); + PrintIndent(stream, indent); + + if (platform == "sdaccel") { + // create variable wrapper + stream << pre_kernel << "\n"; + KernelInit(stream, platform, args, + arg_types, arg_info); + } else if (platform == "vivado_hls") { + // init hls stream channels + for (size_t k = 0; k < arg_info.size(); k++) { + auto info = arg_info[k]; + if (std::get<1>(info)) { + PrintIndent(stream, indent); + stream << "hls::stream<" + << PrintHalideType(std::get<2>(info)) + << "> " << "fd_" << std::get<0>(info) << ";\n"; + } + } + PrintIndent(stream, indent); + stream << pre_kernel << "\n"; + PrintIndent(stream, indent); + // create kernel call from host + stream << "top("; + for (size_t i = 0; i < arg_info.size(); i++) { + auto info = arg_info[i]; + auto name = std::get<0>(info); + if (i != 0) stream << ", "; + stream << "fd_" << name; + } + stream << ");\n"; + } + + // generate host (post-kernel) + PrintIndent(stream, indent); + stream << "// compute after kernel function\n"; + stream << post_kernel; + + // copy to shared mem + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + PrintCopyBack(arr, arg_info, stream, indent, i); + PrintIndent(stream, indent); + stream << "shmdt("; + stream << "arg_" << i << ");\n"; + } + } + + stream << "\n\n"; + PrintIndent(stream, indent); + stream << "}\n"; + stream.close(); + +} +} // namespace runtime +} // namespace TVM diff --git a/tvm/src/codegen/build_util.h b/tvm/src/codegen/build_util.h new file mode 100644 index 000000000..ca95364c1 --- /dev/null +++ b/tvm/src/codegen/build_util.h @@ -0,0 +1,70 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * Common build utilities + * \file build_util.h + */ +#ifndef TVM_CODEGEN_BUILD_HELPER_H_ +#define TVM_CODEGEN_BUILD_HELPER_H_ + +#include +#include +#include +#include "../runtime/meta_data.h" + +namespace TVM { +namespace runtime { + +using argInfo = + std::vector>>; + +// get current work directory +std::string getpath(void); +void PrintIndent(std::ofstream& stream, int indent); +inline size_t GetTypeSize(TVMType t); +inline size_t GetDataSize(TVMArray* arr); +inline TVMType Type2TVMType(Type t); +inline std::string PrintHalideType(Type t); +inline std::string Type2Str(TVMType t); +inline std::string Type2ExtStr(TVMType t); +inline std::string Type2WrapStr(TVMType t); +inline std::string Type2Byte(TVMType t); + +void CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types); + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes); + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes); + +void PrintCopy(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr); + +void PrintCopyBack(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr); + +void GenKernelCode(std::string test_file); + +void GenWrapperCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + argInfo& arg_info, + LoweredFunc func); + +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc func, + std::string platform, + std::string host_code, + argInfo& arg_info); +} // namespace runtime +} // namespace TVM +#endif // TVM_CODEGEN_BUILD_HELPER_H_ diff --git a/tvm/src/codegen/codegen_c.cc b/tvm/src/codegen/codegen_c.cc index 7373711f4..006edf933 100644 --- a/tvm/src/codegen/codegen_c.cc +++ b/tvm/src/codegen/codegen_c.cc @@ -2,9 +2,12 @@ * Copyright (c) 2017 by Contributors * \file codegen_c.cc */ +#include +#include #include #include #include "./codegen_c.h" +#include "./merlinc/codeanalys_merlinc.h" #include "../arithmetic/compute_expr.h" namespace TVM { @@ -12,6 +15,123 @@ namespace codegen { using namespace ir; +Type String2Type(std::string& s) { + if (s.front() == '\"' && s.back() == '\"') { + s.erase(0, 1); + s.pop_back(); + } + std::istringstream is(s); + halideir_type_code_t code = Type::Int; + if (s.substr(0, 3) == "int") { + code = Type::Int; s = s.substr(3); + } else if (s.substr(0, 4) == "uint") { + code = Type::UInt; s = s.substr(4); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s == "handle") { + return Handle(); + } else { + LOG(FATAL) << "unknown type " << s; + } + int bits = 32, lanes = 1; + if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) { + LOG(FATAL) << "unknown type " << s; + } + return Type(code, bits, lanes); +} + +// generate row major index +std::string getIndex(std::vector shape) { + std::string str; + int mul = 1; + for (size_t i = shape.size(); i > 0; i--) { + mul = mul * shape[i-1]; + str += "i" + std::to_string(i-1) + + "*" + std::to_string(mul); + if (i != 1) str += "+ "; + } + return str; +} + +// collect type info for vars +void TypeCollector::Visit_(const Allocate *op) { + auto v = op->buffer_var.get(); + if (top_args_.count(v)) { + std::vector shape; + for (size_t i = 0; i < op->extents.size(); i++) + shape.push_back(op->extents[i].as()->value); + top_args_[v] = std::make_tuple(std::get<0>(top_args_[v]), 
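`String2Type` reverses `PrintType`: it strips optional surrounding quotes, matches the type-code prefix, then parses the bit width (and an optional `x<lanes>` suffix) with `sscanf("%dx%d")`. Expected results for a few inputs, assuming the `Type(code, bits, lanes)` constructor shown above:

```cpp
// "int32"    -> Type(Int,   32, 1)
// "uint1"    -> Type(UInt,   1, 1)
// "float32"  -> Type(Float, 32, 1)
// "int32x4"  -> Type(Int,   32, 4)   // lanes picked up by sscanf("%dx%d")
// "\"int8\"" -> quotes removed first, then Type(Int, 8, 1)
// "handle"   -> Handle()
```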
op->type, shape); + } + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const Allocate *op) { + this->HandleDef(op->buffer_var.get()); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const Load *op) { + this->HandleUse(op->buffer_var); + IRVisitor::Visit_(op); +} + +// update placeholder status +void StreamCollector::Visit_(const Store* op) { + if (auto val = op->value.as()) { + this->HandleDef(op->buffer_var.get()); + } + this->HandleUse(op->buffer_var); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const StreamStmt* op) { + this->HandleDef(op->buffer_var.get()); + IRVisitor::Visit_(op); +} + +void StreamCollector::Visit_(const AttrStmt* op) { + if (op->attr_key == attr::device_scope) { + if (op->value.as()->value != scope_) + switch_on = true; + else switch_on = false; + } + IRVisitor::Visit_(op); +} + +// additional data saved into stream table +void StreamCollector::HandleDef(const Variable* v) { + if (!switch_on) { // def on host scope + CHECK(!host_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!host_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + host_use_count_[v] = 0; + host_def_count_[v] = 1; + } +} + +void StreamCollector::HandleUse(const Expr& v) { + CHECK(v.as()); + Var var(v.node_); + auto it = host_use_count_.find(var.get()); + if (!switch_on) { // def on host scope + if (it != host_use_count_.end()) { + if (it->second >= 0) { + ++it->second; + } + } else { + if (!stream_table_.count(var.get())) { + host_undefined_.push_back(var); + host_use_count_[var.get()] = -1; + } + } + } +} + void CodeGenC::Init(bool output_ssa) { print_ssa_form_ = output_ssa; } @@ -19,44 +139,50 @@ void CodeGenC::Init(bool output_ssa) { void CodeGenC::InitFuncState(LoweredFunc f) { alloc_storage_scope_.clear(); handle_data_type_.clear(); + var_shape_map_.clear(); + range_.clear(); CodeGenSourceBase::ClearFuncState(); } -void CodeGenC::AddFunction(LoweredFunc f) { + +void CodeGenC::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { // clear previous generated state. this->InitFuncState(f); - // skip the first underscore, so SSA variable starts from _1 - GetUniqueName("_"); + map_arg_type_ = map_arg_type; // add to alloc buffer type. 
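The collector only tracks definitions and uses while on the host side: the `device_scope` attribute flips `switch_on`, and any variable used on the host without a prior host definition (and absent from the stream table) lands in `host_undefined_`, which later becomes the set of streamed channels. A toy version of the same bookkeeping over a flat event list, in place of the real IR visitor:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (event, name): "def"/"use" on variables, "scope" switches cpu/fpga.
  std::vector<std::pair<std::string, std::string>> events = {
    {"def", "a"}, {"scope", "fpga"}, {"def", "b"},
    {"scope", "cpu"}, {"use", "a"}, {"use", "b"}};

  std::set<std::string> host_defs, host_undefined;
  bool switch_on = false;  // true while inside the device scope
  for (auto& e : events) {
    if (e.first == "scope") switch_on = (e.second == "fpga");
    else if (e.first == "def" && !switch_on) host_defs.insert(e.second);
    else if (e.first == "use" && !switch_on && !host_defs.count(e.second))
      host_undefined.insert(e.second);  // needs a stream channel
  }
  for (auto& v : host_undefined) std::cout << v << "\n";  // prints "b"
}
```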
for (const auto & kv : f->handle_data_type) { RegisterHandleType(kv.first.get(), kv.second.type()); } + // generate function signature this->stream << "void " << f->name << "("; for (size_t i = 0; i < f->args.size(); ++i) { Var v = f->args[i]; std::string vid = AllocVarID(v.get()); if (i != 0) stream << ", "; - if (v.type().is_handle()) { - auto it = alloc_storage_scope_.find(v.get()); - if (it != alloc_storage_scope_.end()) - PrintStorageScope(it->second, stream); - stream << ' '; - - if (handle_data_type_.count(v.get())) { - PrintType(handle_data_type_.at(v.get()), stream); - } else { - stream << "void"; - } - stream << "*"; - - if (f->is_restricted && restrict_keyword_.length() != 0) { - stream << ' ' << restrict_keyword_; - } + // check type in the arg map + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; } else { - PrintType(v.type(), stream); + auto arg = map_arg_type[vid]; + PrintType(std::get<1>(arg), this->stream); + this->stream << "* " << std::get<0>(arg); + const BufferNode* buf = f->api_args[i].as(); + if (v.type().is_handle() && buf) { + std::vector shape; + for (size_t i = 0; i < buf->shape.size(); i++) + shape.push_back(buf->shape[i].as()->value); + arg_shapes.push_back(shape); + var_shape_map_[buf->data.get()] = buf->shape; + auto it = alloc_storage_scope_.find(v.get()); + if (it != alloc_storage_scope_.end()) + PrintStorageScope(it->second, stream); + } } - stream << ' ' << vid; } + stream << ") {\n"; int func_scope = this->BeginScope(); this->PrintStmt(f->body); @@ -65,8 +191,49 @@ void CodeGenC::AddFunction(LoweredFunc f) { this->stream << "}\n\n"; } +std::string CodeGenC::GetHost() { + if (!fpga_scope_) + host_stream << stream.str(); + std::string postproc = host_stream.str(); + postproc.erase(postproc.rfind("}") - 1, + postproc.length() - 1); + postproc.erase(0, postproc.find("{") + 1); + return postproc + "\n\n"; +} + +std::string CodeGenC::GetDevice() { + std::ostringstream device; + device << "void top(" << arg_stream.str() << "){\n"; + + // process device code + PreProcess(device); + // remove the kernel name alloc + auto text = device_stream.str(); + for (auto const& m : stream_arg_pos) { + std::string alloc = m.first + ";"; + size_t nFPos = text.find(alloc); + size_t secondNL = text.find('\n', nFPos); + size_t firstNL = text.rfind('\n', nFPos); + text.erase(firstNL, secondNL - firstNL); + } + device << text; + PostProcess(device); + + if (fpga_scope_) device << stream.str(); + return decl_stream.str() + module_stream.str() + + device.str() + "}\n\n"; +} + std::string CodeGenC::Finish() { - return decl_stream.str() + stream.str(); + std::ostringstream device; + device << "void top(" << arg_stream.str() + << "){\n" << device_stream.str(); + if (fpga_scope_) device << stream.str(); + else host_stream << stream.str(); + device << "}\n"; + return decl_stream.str() + "\n{device}\n" + + module_stream.str() + device.str() + "\n{device}\n" + + "\n{host}\n" + host_stream.str() + "\n{host}\n"; } void CodeGenC::PrintExpr(const Expr& n, std::ostream& os) { // NOLINT(*) @@ -286,7 +453,7 @@ void CodeGenC::PrintStorageScope(const std::string& scope, std::ostream& os) { / void CodeGenC::PrintType(Type t, std::ostream& os) { // NOLINT(*) CHECK_EQ(t.lanes(), 1) - << "do not yet support vector types"; + << "do not yet support vector types"; if (t.is_handle()) { os << "void*"; return; } @@ -314,7 +481,6 @@ void CodeGenC::PrintType(Type t, std::ostream& os) { // 
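After the scope split, `Finish` no longer returns one flat stream: declarations, module-level kernels, the device `top` body, and the host statements are concatenated with `{device}` / `{host}` sentinel strings bracketing the two halves, so the simulation builder can carve them apart later (`GetHost` / `GetDevice` do the same extraction directly). The returned string is shaped roughly like:

```cpp
// <decl_stream: headers, typedefs>
//
// {device}
// <module_stream: emitted KernelDef functions>
// void top(<arg_stream parameter list>){
//   <device-scope statements>
// }
// {device}
//
// {host}
// <host-scope statements, including the generated top(...) call>
// {host}
```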
NOLINT(*) LOG(FATAL) << "Cannot convert type " << t << " to C type"; } - inline void PrintConst(const IntImm* op, std::ostream& os, CodeGenC* p) { // NOLINT(*) if (op->type == Int(32)) { std::ostringstream temp; @@ -619,7 +785,7 @@ void CodeGenC::VisitStmt_(const Store* op) { Type t = op->value.type(); if (t.lanes() == 1) { std::string value = this->PrintExpr(op->value); - std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index); + std::string ref = this->GetBufferRef(t, op->buffer_var.get(), op->index); this->PrintIndent(); stream << ref << " = " << value << ";\n"; } else { @@ -714,49 +880,92 @@ void CodeGenC::VisitExpr_(const GetSlice *op, std::ostream& os) { // NOLINT(*) } void CodeGenC::VisitExpr_(const SetBit *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "SetBit is not implemented yet"; + LOG(FATAL) << "SetBit is not implemented yet in C"; } void CodeGenC::VisitExpr_(const SetSlice *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "SetSlice is not implemented yet"; + LOG(FATAL) << "SetSlice is not implemented yet in C"; } void CodeGenC::VisitExpr_(const Quantize *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "Quantize is not yet support"; + LOG(FATAL) << "Quantize is not yet support in C"; +} + +void CodeGenC::VisitExpr_(const StreamExpr *op, std::ostream& os) { // NOLINT(*) + auto v = op->buffer_var.get(); + auto it = var_idmap_.find(v); + CHECK(it != var_idmap_.end()) + << "variable " << v->name_hint << " not decalred"; } void CodeGenC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) - LOG(FATAL) << "KernelExpr is not yet support"; + os << op->name << "("; + for (size_t i = 0; i < op->args.size(); ++i) { + PrintExpr(op->args[i], os); + if (i != op->args.size() - 1) os << ", "; + } + os << ")"; +} + +void CodeGenC::VisitStmt_(const StreamStmt *op) { // NOLINT(*) + CHECK(!var_idmap_.count(op->buffer_var.get())); + std::string vid = AllocVarID(op->buffer_var.get()); + vid = GetVarID(op->value.as()->buffer_var.get()); + PrintIndent(); + auto load_op = op->value.as(); + auto v = load_op->buffer_var.as(); + // placeholder args using recv name + if (stream_table.count(v)) { + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(vid, std::get<1>(tuple), + std::get<2>(tuple)); + stream_table[v] = true; + } // else: streamed externop defined in analysis + // PrintExpr(op->value, stream); + // stream << vid << ".write()\n"; } void CodeGenC::VisitStmt_(const LetStmt* op) { std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); if (print_ssa_form_) { CHECK(!var_idmap_.count(op->var.get())); var_idmap_[op->var.get()] = value; } else { PrintIndent(); - if (op->var.type() == Handle() && - handle_data_type_.count(op->var.get())) { - PrintType(handle_data_type_.at(op->var.get()), stream); - stream << "* " - << AllocVarID(op->var.get()) - << " = ("; - PrintType(handle_data_type_.at(op->var.get()), stream); - stream << "*)" << value << ";\n"; - } else { + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); PrintType(op->var.type(), this->stream); this->stream << ' ' - << AllocVarID(op->var.get()) + << vid << " = " << value << ";\n"; + // modify var idmap for passed in args + } else if (value.find("data") != std::string::npos || + value.substr(0, 3) == "arg") { + auto v = op->var.get(); + arg_vars.push_back(v); + stream_table[v] = false; + std::string api_name = "arg" + 
std::to_string(arg_count); + auto arg = map_arg_type_[api_name]; + // PrintType(std::get<1>(arg), arg_stream); + CHECK(arg_count < arg_shapes.size()); + auto shape = arg_shapes[arg_count]; + arg_top_vars[v] = std::make_tuple(vid, std::get<1>(arg), shape); + arg_count += 1; } + PrintStmt(op->body); } - PrintStmt(op->body); } void CodeGenC::VisitStmt_(const Allocate* op) { CHECK(!is_zero(op->condition)); - std::string vid = AllocVarID(op->buffer_var.get()); + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); if (op->new_expr.defined()) { // Prefer global static allocation for the program CHECK_EQ(op->free_function, "nop"); @@ -799,6 +1008,64 @@ void CodeGenC::VisitStmt_(const AttrStmt* op) { const Variable* v = op->node.as(); CHECK(v); volatile_buf_.insert(v); + } else if (op->attr_key == ir::attr::device_scope) { + // print top( ... in host and enter fpga scope + if (op->value.as()->value == "fpga" && !fpga_scope_) { + fpga_scope_ = true; + PrintIndent(); + + // track the stream usage + StreamCollector collector(stream_table, "cpu"); + collector.Visit(op->body); + + // update data type and name + for (auto k : collector.host_undefined_) { + auto v = k.get(); + arg_vars.push_back(v); + stream_table[v] = true; + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(v->name_hint, + std::get<1>(tuple), + std::get<2>(tuple)); + } + TypeCollector visitor(arg_top_vars); + visitor.Visit(op->body); + + // generte function calls + stream << "top("; + int index = 0; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + if (index !=0) stream << ", "; + stream << arg_name; + // print kernel func signature + if (index != 0) arg_stream << ", "; + PrintType(std::get<1>(arg_top_vars[v]), arg_stream); + auto shape = std::get<2>(arg_top_vars[v]); + arg_stream << " " << arg_name; + for (size_t k = 0; k < shape.size(); k++) + arg_stream << "[" << shape[k] << "]"; + index++; + } + stream << ");\n"; + + // switch context to device scope + host_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // swtich from device to host + } else if (op->value.as()->value == "cpu" && + fpga_scope_) { + fpga_scope_ = false; + device_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + } } this->PrintStmt(op->body); } @@ -889,17 +1156,75 @@ void CodeGenC::VisitStmt_(const ProducerConsumer *op) { PrintStmt(op->body); } -void CodeGenC::VisitStmt_(const KernelDef *op) { - LOG(FATAL) << "KernelDef is not yet support"; +void CodeGenC::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + // save func states + SaveFuncState(f); + InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. 
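On entering the `fpga` scope, the host side prints the `top(...)` invocation while `arg_stream` accumulates the matching parameter list, so the device signature is derived from the call site itself. For two 10x10 arguments the two halves would come out roughly as below (the type spelling depends on the backend's `PrintType`; plain `int` is shown here as an assumption for the base C printer):

```cpp
// host side (stream):
top(arg_top_0, arg_top_1);

// device side (arg_stream spliced in by GetDevice):
void top(int arg_top_0[10][10], int arg_top_1[10][10]) {
  // ... device statements ...
}
```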
+ for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + // print function signature + PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + for (size_t k = 0; k < op->channels.size(); k+=2) { + int pos = op->channels[k].as()->value; + stream_arg_pos[op->name].insert(pos); + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) stream << ", "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + PrintType(type, stream); + this->stream << " " << vid << "["; + if (v.type().is_handle()) { + for (size_t j = 0; j < op->api_args[i].size(); j++) { + if (j != 0) stream << "* "; + auto dim = op->api_args[i][j].as()->value; + this->stream << dim; + } + this->stream << ']'; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); } void CodeGenC::VisitStmt_(const KernelStmt *op) { - LOG(FATAL) << "KernelStmt is not yet support"; + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + PrintExpr(op->args[i], stream); + if (i < op->args.size() -1) stream << ", "; + } + stream << ");\n"; } void CodeGenC::VisitStmt_(const Return *op) { this->stream << "return "; - PrintExpr(op->value); + PrintExpr(op->value, stream); this->stream << ";\n"; } @@ -922,5 +1247,28 @@ void CodeGenC::VisitStmt_(const While *op) { void CodeGenC::VisitStmt_(const Partition* op) { } +void CodeGenC::SaveFuncState(LoweredFunc f) { + // clear save info copy + alloc_storage_scope_save.clear(); + handle_data_type_save.clear(); + var_shape_map_save.clear(); + range_save.clear(); + // backup func info and clear + alloc_storage_scope_save = alloc_storage_scope_; + handle_data_type_save = handle_data_type_; + var_shape_map_save = var_shape_map_; + range_save = range_; + CodeGenSourceBase::SaveFuncState(); +} + +void CodeGenC::RestoreFuncState(LoweredFunc f) { + this->InitFuncState(f); + alloc_storage_scope_ = alloc_storage_scope_save; + handle_data_type_ = handle_data_type_save; + var_shape_map_ = var_shape_map_save; + range_ = range_save; + CodeGenSourceBase::RestoreFuncState(); +} + } // namespace codegen } // namespace TVM diff --git a/tvm/src/codegen/codegen_c.h b/tvm/src/codegen/codegen_c.h index f579ca579..d7292b38f 100644 --- a/tvm/src/codegen/codegen_c.h +++ b/tvm/src/codegen/codegen_c.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -15,11 +16,64 @@ #include #include #include "./codegen_source_base.h" +#include "./merlinc/codeanalys_merlinc.h" +#include "../runtime/thread_storage_scope.h" namespace TVM { namespace codegen { using namespace ir; +template +using str2tupleMap = std::unordered_map>; +using var2nameType = std::unordered_map>>; + +Type String2Type(std::string& s); +std::string getIndex(std::vector shape); + +/*! + * \brief A data type collector + * + * CodeGenC TypeCollector gathers information + * of different types of each variable + * + */ +class TypeCollector final : public IRVisitor { + public: + var2nameType& top_args_; + TypeCollector(var2nameType& top_args) : top_args_(top_args) {}; + void Visit_(const Allocate *op); +}; + +/*! 
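A `KernelDef` carries its streamed-argument positions in `channels` as flattened (position, channel-id) pairs; stepping by two recovers just the positions, which is what later lets `KernelStmt` prefix those arguments with `fd_`. A small sketch of that decoding:

```cpp
#include <iostream>
#include <set>
#include <vector>

int main() {
  // Flattened (position, channel id) pairs, as stored in op->channels.
  std::vector<int> channels = {1, 7, 3, 8};

  std::set<int> stream_arg_pos;
  for (size_t k = 0; k < channels.size(); k += 2)
    stream_arg_pos.insert(channels[k]);  // keep every even entry: 1, 3

  for (int pos : stream_arg_pos) std::cout << pos << " ";  // "1 3"
}
```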
+ * \brief An undefined variable collector + * + * CodeGenC stream data collector detects undefined + * variable and create channels for them + * + * */ +class StreamCollector final : public IRVisitor { + public: + Array host_undefined_; + std::unordered_map host_use_count_; + std::unordered_map host_def_count_; + StreamCollector(std::unordered_map& stream_table, + std::string initial_scope) + : stream_table_(stream_table), + scope_(initial_scope) {}; + void Visit_(const Allocate *op); + void Visit_(const Load *op); + void Visit_(const Store *op); + void Visit_(const StreamStmt *op); + void Visit_(const AttrStmt *op); + void HandleDef(const Variable* v); + void HandleUse(const Expr& v); + private: + std::unordered_map& stream_table_; + std::string scope_; + bool switch_on{true}; +}; + /*! * \brief A base class to generate C code. * @@ -44,12 +98,22 @@ class CodeGenC : * \brief Add the function to the generated module. * \param f The function to be compiled. */ - void AddFunction(LoweredFunc f); + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); /*! * \brief Finalize the compilation and return the code. * \return The code. */ std::string Finish(); + /*! + * \brief Finalize the compilation and return the code. + * \return The host code. + */ + std::string GetHost(); + /*! + * \brief Finalize the compilation and return the code. + * \return The device code. + */ + std::string GetDevice(); /*! * \brief Print the Stmt n to CodeGenC->stream * \param n The statement to be printed. @@ -113,6 +177,7 @@ class CodeGenC : void VisitExpr_(const SetSlice* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const Quantize* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const KernelExpr* op, std::ostream& os) override; // NOLINT(*) + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; // NOLINT(*) // statment void VisitStmt_(const LetStmt* op) override; void VisitStmt_(const Store* op) override; @@ -126,6 +191,7 @@ class CodeGenC : void VisitStmt_(const ProducerConsumer* op) override; void VisitStmt_(const KernelDef* op) override; void VisitStmt_(const KernelStmt* op) override; + void VisitStmt_(const StreamStmt* op) override; void VisitStmt_(const Return* op) override; void VisitStmt_(const Break* op) override; void VisitStmt_(const While* op) override; @@ -159,10 +225,38 @@ class CodeGenC : // print store of single element. 
virtual void PrintVecElemStore( const std::string& vec, Type t, int i, const std::string& value); - // Get a cast type from to + // get a cast type from to virtual std::string CastFromTo(std::string value, Type from, Type target); + // map from var to shape, range and type + std::map > var_shape_map_; + std::unordered_map range_; + str2tupleMap map_arg_type_; + + // save for kernel + std::map > var_shape_map_save; + std::unordered_map range_save; + + // index into ap_arg_type + size_t arg_count{0}; + // map {var : (vid, Type, shape)} + var2nameType arg_top_vars; + // vector {vars} in top function + std::vector arg_vars; + // vector of top function arg dimension + std::vector> arg_shapes; + // whether the function arg is streamed + std::unordered_map stream_table; + // map from kernel name to set of streamed arg position index + std::unordered_map> stream_arg_pos; + // pre and post processing device code + virtual void PreProcess(std::ostringstream& os) {}; + virtual void PostProcess(std::ostringstream& os) {}; + protected: + void SaveFuncState(LoweredFunc f); + void RestoreFuncState(LoweredFunc f); + // Print reference to struct location std::string GetStructRef( Type t, const Expr& buffer, const Expr& index, int kind); @@ -186,12 +280,22 @@ class CodeGenC : const std::string& target, const std::string& src, Type t) final; /*! \brief restrict keyword */ std::string restrict_keyword_{""}; + /*! \brief the func arg decl stream */ + std::ostringstream arg_stream; /*! \brief the storage scope of allocation */ std::unordered_map alloc_storage_scope_; /*! \brief the data type of allocated buffers */ std::unordered_map handle_data_type_; std::unordered_map buf_length_map_; + // save for kernel gen + std::unordered_map alloc_storage_scope_save; + std::unordered_map handle_data_type_save; + std::unordered_map var_idmap_save; + std::unordered_map name_alloc_map_save; + std::unordered_map ssa_assign_map_save; + std::vector scope_mark_save; + private: /*! \brief whether to print in SSA form */ bool print_ssa_form_{false}; diff --git a/tvm/src/codegen/codegen_cuda.cc b/tvm/src/codegen/codegen_cuda.cc index badbf2849..3c675ad06 100644 --- a/tvm/src/codegen/codegen_cuda.cc +++ b/tvm/src/codegen/codegen_cuda.cc @@ -25,9 +25,10 @@ void CodeGenCUDA::Init(bool output_ssa) { CHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state); } -void CodeGenCUDA::AddFunction(LoweredFunc f) { +void CodeGenCUDA::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { this->stream << "extern \"C\" __global__ "; - CodeGenC::AddFunction(f); + CodeGenC::AddFunction(f, map_arg_type); } void CodeGenCUDA::VisitStmt_(const ir::For* op) { diff --git a/tvm/src/codegen/codegen_cuda.h b/tvm/src/codegen/codegen_cuda.h index e49a47ae3..e0c4f1a41 100644 --- a/tvm/src/codegen/codegen_cuda.h +++ b/tvm/src/codegen/codegen_cuda.h @@ -10,6 +10,7 @@ #include #include #include "./codegen_c.h" +#include "./merlinc/codeanalys_merlinc.h" namespace TVM { namespace codegen { @@ -18,7 +19,8 @@ class CodeGenCUDA final : public CodeGenC { public: CodeGenCUDA(); void Init(bool output_ssa); - void AddFunction(LoweredFunc f); + void AddFunction(LoweredFunc f, + str2tupleMap map_arg_type); // override behavior void VisitStmt_(const ir::For* op) final; void PrintStorageSync(const Call* op) final; diff --git a/tvm/src/codegen/codegen_opencl.h b/tvm/src/codegen/codegen_opencl.h deleted file mode 100644 index 088ab089a..000000000 --- a/tvm/src/codegen/codegen_opencl.h +++ /dev/null @@ -1,51 +0,0 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file codegen_opencl.h - * \brief Generate OpenCL device code. - */ -#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_ -#define TVM_CODEGEN_CODEGEN_OPENCL_H_ - -#include -#include -#include -#include "./codegen_c.h" - -namespace TVM { -namespace codegen { - -class CodeGenOpenCL final : public CodeGenC { - public: - CodeGenOpenCL(); - void AddFunction(LoweredFunc f); - std::string Finish(); - - // override print thread tag. - void InitFuncState(LoweredFunc f) final; - void BindThreadIndex(const IterVar& iv) final; // NOLINT(*) - void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) - void PrintStorageSync(const Call* op) final; // NOLINT(*) - void PrintType(Type t, std::ostream& os) final; // NOLINT(*) - std::string GetVecLoad(Type t, const Variable* buffer, - Expr base) final; - void PrintVecStore(const Variable* buffer, - Type t, Expr base, - const std::string& value) final; // NOLINT(*) - // the address of load/store - void PrintVecAddr(const Variable* buffer, Type t, - Expr base, std::ostream& os); // NOLINT(*) - std::string CastFromTo(std::string value, Type from, Type target); // NOLINT(*) - - // overload visitor - void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) - - private: - // whether enable fp16 and fp64 extension - bool enable_fp16_{false}; - bool enable_fp64_{false}; -}; - -} // namespace codegen -} // namespace TVM - -#endif // TVM_CODEGEN_CODEGEN_OPENCL_H_ diff --git a/tvm/src/codegen/codegen_source_base.cc b/tvm/src/codegen/codegen_source_base.cc index 0df1ad276..9fc6fc706 100644 --- a/tvm/src/codegen/codegen_source_base.cc +++ b/tvm/src/codegen/codegen_source_base.cc @@ -8,34 +8,79 @@ namespace TVM { namespace codegen { void CodeGenSourceBase::ClearFuncState() { - name_alloc_map_.clear(); + host_name_alloc_map_.clear(); + device_name_alloc_map_.clear(); ssa_assign_map_.clear(); var_idmap_.clear(); scope_mark_.clear(); } +void CodeGenSourceBase::SaveFuncState() { + host_name_alloc_map_save.clear(); + device_name_alloc_map_save.clear(); + ssa_assign_map_save.clear(); + var_idmap_save.clear(); + scope_mark_save.clear(); + // save state into private member + host_name_alloc_map_save = host_name_alloc_map_; + device_name_alloc_map_save = device_name_alloc_map_; + ssa_assign_map_save = ssa_assign_map_; + var_idmap_save = var_idmap_; + scope_mark_save = scope_mark_; +} + +void CodeGenSourceBase::RestoreFuncState() { + this->ClearFuncState(); + host_name_alloc_map_ = host_name_alloc_map_save; + device_name_alloc_map_ = device_name_alloc_map_save; + ssa_assign_map_ = ssa_assign_map_save; + var_idmap_ = var_idmap_save; + scope_mark_ = scope_mark_save; +} + std::string CodeGenSourceBase::GetUniqueName(std::string prefix) { for (size_t i = 0; i < prefix.size(); ++i) { if (prefix[i] == '.') prefix[i] = '_'; } - auto it = name_alloc_map_.find(prefix); - if (it != name_alloc_map_.end()) { - while (true) { - std::ostringstream os; - os << prefix << (++it->second); - std::string name = os.str(); - if (name_alloc_map_.count(name) == 0) { - prefix = name; - break; + if (fpga_scope_) { + auto it = device_name_alloc_map_.find(prefix); + if (it != device_name_alloc_map_.end()) { + while (true) { + std::ostringstream os; + os << prefix << (++it->second); + std::string name = os.str(); + if (device_name_alloc_map_.count(name) == 0) { + prefix = name; + break; + } } } + device_name_alloc_map_[prefix] = 0; + return prefix; + } else { + auto it = host_name_alloc_map_.find(prefix); + if (it != 
host_name_alloc_map_.end()) { + while (true) { + std::ostringstream os; + os << prefix << (++it->second); + std::string name = os.str(); + if (host_name_alloc_map_.count(name) == 0) { + prefix = name; + break; + } + } + } + host_name_alloc_map_[prefix] = 0; + return prefix; } - name_alloc_map_[prefix] = 0; - return prefix; } std::string CodeGenSourceBase::SSAGetID(std::string src, Type t) { - if (name_alloc_map_.count(src)) return src; + if (fpga_scope_) { + if (device_name_alloc_map_.count(src)) return src; + } else { + if (host_name_alloc_map_.count(src)) return src; + } auto it = ssa_assign_map_.find(src); if (it != ssa_assign_map_.end()) { if (scope_mark_.at(it->second.scope_id)) { diff --git a/tvm/src/codegen/codegen_source_base.h b/tvm/src/codegen/codegen_source_base.h index e140662c1..befc3f8ec 100644 --- a/tvm/src/codegen/codegen_source_base.h +++ b/tvm/src/codegen/codegen_source_base.h @@ -39,6 +39,10 @@ class CodeGenSourceBase { }; /*! \brief Clear the states that might relates to function generation */ void ClearFuncState(); + /*! \brief Save the states that might relates to function generation */ + void SaveFuncState(); + /*! \brief Restore the states that might relates to function generation */ + void RestoreFuncState(); /*! \brief print the current indented value */ void PrintIndent(); /*! @@ -89,18 +93,36 @@ class CodeGenSourceBase { std::ostringstream decl_stream; /*! \brief the stream to be printed */ std::ostringstream stream; + /*! \brief the stream for mocule */ + std::ostringstream module_stream; + /*! \brief the stream host */ + std::ostringstream host_stream; + /*! \brief the stream device */ + std::ostringstream device_stream; /*! \brief name of each variable */ std::unordered_map var_idmap_; + /*! \brief save states as copy */ + std::unordered_map var_idmap_save; + /*! \brief whether generate code for fpga */ + bool fpga_scope_{false}; + /*! \brief name allocation map for host */ + std::unordered_map host_name_alloc_map_; + /*! \brief name allocation map for device */ + std::unordered_map device_name_alloc_map_; private: /*! \brief assignment map of ssa */ std::unordered_map ssa_assign_map_; - /*! \brief name allocation map */ - std::unordered_map name_alloc_map_; /*! \brief array to check whether we are inside certain scope */ std::vector scope_mark_; /*! \brief The current indentation value */ int indent_{0}; + /*! \brief Save states as copy */ + std::unordered_map ssa_assign_map_save; + std::unordered_map host_name_alloc_map_save; + std::unordered_map device_name_alloc_map_save; + std::vector scope_mark_save; + }; /*! 
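Splitting the name-allocation map by scope lets the host and device halves each own a prefix without colliding, since they are emitted into separate function bodies of the merged source. A simplified toy of the two-map lookup (the real version loops until it finds an unused suffix):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, int> host_names, device_names;
bool fpga_scope = false;

// Simplified GetUniqueName: suffix a counter only on repeat allocations.
std::string GetUniqueName(std::string prefix) {
  auto& m = fpga_scope ? device_names : host_names;
  auto it = m.find(prefix);
  if (it != m.end()) return prefix + std::to_string(++it->second);
  m[prefix] = 0;
  return prefix;
}

int main() {
  std::cout << GetUniqueName("buf") << "\n";  // host:   "buf"
  fpga_scope = true;
  std::cout << GetUniqueName("buf") << "\n";  // device: "buf" again, separate map
  std::cout << GetUniqueName("buf") << "\n";  // device: "buf1"
}
```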
diff --git a/tvm/src/codegen/hlsc/build_hlsc.cc b/tvm/src/codegen/hlsc/build_hlsc.cc index 42fb68089..2494ee66f 100644 --- a/tvm/src/codegen/hlsc/build_hlsc.cc +++ b/tvm/src/codegen/hlsc/build_hlsc.cc @@ -24,7 +24,6 @@ runtime::Module BuildVivadoHLSCSim(Array funcs) { cg.AddFunction(f, map_arg_type); } std::string code = cg.Finish(); - return runtime::CreateVivadoHLSModule(funcs[0], code); } @@ -47,7 +46,6 @@ std::string BuildHLSC(Array funcs) { cg.AddFunction(f, map_arg_type); } std::string code = cg.Finish(); - LOG(WARNING) << "HLS C doesn't have runtime, return kernel code"; return code; } diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.cc b/tvm/src/codegen/hlsc/codegen_hlsc.cc index 3e8696fba..d7fc610d7 100644 --- a/tvm/src/codegen/hlsc/codegen_hlsc.cc +++ b/tvm/src/codegen/hlsc/codegen_hlsc.cc @@ -15,49 +15,50 @@ namespace codegen { void CodeGenHLSC::AddFunction(LoweredFunc f, str2tupleMap map_arg_type) { - // Write header files - // TODO: Insert header files here - // Clear previous generated state - this->InitFuncState(f); - // Register alloc buffer type - for (const auto & kv : f->handle_data_type) { - RegisterHandleType(kv.first.get(), kv.second.type()); - } - // Write entry function name - this->stream << "void " << f->name << "("; - // Write arguments - for (size_t i = 0; i < f->args.size(); ++i) { - Var v = f->args[i]; - std::string vid = AllocVarID(v.get()); - if (i != 0) this->stream << ", "; - if (map_arg_type.find(vid) == map_arg_type.end()) { - LOG(WARNING) << vid << " type not found\n"; - PrintType(v.type(), this->stream); - this->stream << ' ' << vid; - } - else { - auto arg = map_arg_type[vid]; - PrintType(std::get<1>(arg), this->stream); - this->stream << ' ' << std::get<0>(arg); - const BufferNode* buf = f->api_args[i].as(); - if (v.type().is_handle() && buf) { - var_shape_map_[buf->data.get()] = buf->shape; - for (size_t i = 0; i < buf->shape.size(); i++) { - this->stream << '['; - this->PrintExpr(buf->shape[i], this->stream); - this->stream << ']'; - } - } - // this->stream << "*"; TODO: create an option for this - } - } - stream << ") {\n"; - int func_scope = this->BeginScope(); - range_ = CollectIterRange(f->body); - this->PrintStmt(f->body); - this->EndScope(func_scope); - this->PrintIndent(); - this->stream << "}\n\n"; + CodeGenC::AddFunction(f, map_arg_type); + // // Write header files + // // TODO: Insert header files here + // // Clear previous generated state + // this->InitFuncState(f); + // // Register alloc buffer type + // for (const auto & kv : f->handle_data_type) { + // RegisterHandleType(kv.first.get(), kv.second.type()); + // } + // // Write entry function name + // this->stream << "void " << f->name << "("; + // // Write arguments + // for (size_t i = 0; i < f->args.size(); ++i) { + // Var v = f->args[i]; + // std::string vid = AllocVarID(v.get()); + // if (i != 0) this->stream << ", "; + // if (map_arg_type.find(vid) == map_arg_type.end()) { + // LOG(WARNING) << vid << " type not found\n"; + // PrintType(v.type(), this->stream); + // this->stream << ' ' << vid; + // } + // else { + // auto arg = map_arg_type[vid]; + // PrintType(std::get<1>(arg), this->stream); + // this->stream << ' ' << std::get<0>(arg); + // const BufferNode* buf = f->api_args[i].as(); + // if (v.type().is_handle() && buf) { + // var_shape_map_[buf->data.get()] = buf->shape; + // for (size_t i = 0; i < buf->shape.size(); i++) { + // this->stream << '['; + // this->PrintExpr(buf->shape[i], this->stream); + // this->stream << ']'; + // } + // } + // // this->stream << "*"; TODO: 
create an option for this + // } + // } + // stream << ") {\n"; + // int func_scope = this->BeginScope(); + // range_ = CollectIterRange(f->body); + // this->PrintStmt(f->body); + // this->EndScope(func_scope); + // this->PrintIndent(); + // this->stream << "}\n\n"; } std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index) { @@ -68,14 +69,16 @@ std::string CodeGenHLSC::GetBufferRef(Type t, const Variable* buffer, Expr index buf_length_map_[buffer] == 1); if (is_scalar) { os << vid; - } else { - os << vid; - std::vector indices = ExtractIndices(index, var_shape_map_[buffer], range_); - for (size_t i = 0; i < indices.size(); i++) { - os << '['; - PrintExpr(indices[i], os); - os << ']'; - } + } else { + os << vid << "["; + PrintExpr(index, os); + os << "]"; + // std::vector indices = ExtractIndices(index, var_shape_map_[buffer], range_); + // for (size_t i = 0; i < indices.size(); i++) { + // os << '['; + // PrintExpr(indices[i], os); + // os << ']'; + // } } } return os.str(); @@ -88,6 +91,7 @@ void CodeGenHLSC::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) PrintExpr(op->b, os); os << ")"; } + void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) os << "std::max("; PrintExpr(op->a, os); @@ -97,19 +101,20 @@ void CodeGenHLSC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) } void CodeGenHLSC::VisitStmt_(const LetStmt* op) { - std::string value = PrintExpr(op->value); - // Skip the argument retrieving assign statement - std::string vid = AllocVarID(op->var.get()); - if (op->var.type() != Handle() && - value.find("TVMArray") == std::string::npos && - value.find("arg") != 0) { - PrintIndent(); - PrintType(op->var.type(), this->stream); - this->stream << ' ' - << vid - << " = " << value << ";\n"; - } - PrintStmt(op->body); + CodeGenC::VisitStmt_(op); + // std::string value = PrintExpr(op->value); + // // Skip the argument retrieving assign statement + // std::string vid = AllocVarID(op->var.get()); + // if (op->var.type() != Handle() && + // value.find("TVMArray") == std::string::npos && + // value.find("arg") != 0) { + // PrintIndent(); + // PrintType(op->var.type(), this->stream); + // this->stream << ' ' + // << vid + // << " = " << value << ";\n"; + // } + // PrintStmt(op->body); } void CodeGenHLSC::GenForStmt(const For* op, std::string pragma, bool before) { @@ -164,7 +169,10 @@ void CodeGenHLSC::VisitStmt_(const IfThenElse* op) { void CodeGenHLSC::VisitStmt_(const Allocate* op) { CHECK(!is_zero(op->condition)); - std::string vid = AllocVarID(op->buffer_var.get()); + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); this->PrintIndent(); int32_t constant_size = op->constant_allocation_size(); CHECK_GT(constant_size, 0) @@ -173,16 +181,22 @@ void CodeGenHLSC::VisitStmt_(const Allocate* op) { var_shape_map_[buffer] = op->extents; std::string scope = alloc_storage_scope_.at(buffer); PrintStorageScope(scope, stream); - PrintType(op->type, stream); - stream << ' '<< vid; - if (constant_size > 1) {// Transfer length one array to scalar - for (size_t i = 0; i < op->extents.size(); i++) { - stream << '['; - PrintExpr(op->extents[i], stream); + + if (vid.find("stream_") != std::string::npos) { + void(0); // alloc stream channel in pre-processing + } else { + PrintType(op->type, stream); + stream << ' '<< vid; + if (constant_size > 1) {// Transfer length one array to scalar + stream << "["; + for (size_t i = 0; i < 
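`GetBufferRef` now prints the lowered one-dimensional index as-is instead of re-expanding it with `ExtractIndices`, matching the HISTORY note about switching to row-major single-dimension access. For an illustrative 4x4 buffer, the generated access changes shape like this:

```cpp
// before: multi-dimensional reference reconstructed from the flat index
B[i][j] = ...;
// after: the row-major flat index is printed directly
B[(i * 4) + j] = ...;
```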
op->extents.size(); i++) { + PrintExpr(op->extents[i], stream); + if (i != op->extents.size()-1) stream << "*"; + } stream << "]"; } + stream << ";\n"; } - stream << ";\n"; buf_length_map_[buffer] = constant_size; RegisterHandleType(op->buffer_var.get(), op->type); for (size_t i = 0; i < op->attrs.size(); i++) { diff --git a/tvm/src/codegen/hlsc/codegen_hlsc.h b/tvm/src/codegen/hlsc/codegen_hlsc.h index c85cbc699..fdd1747fa 100644 --- a/tvm/src/codegen/hlsc/codegen_hlsc.h +++ b/tvm/src/codegen/hlsc/codegen_hlsc.h @@ -27,9 +27,7 @@ class CodeGenHLSC : public CodeGenC { void VisitStmt_(const Allocate* op) override; void GenForStmt(const For* op, std::string pragma, bool before); - - std::map > var_shape_map_; - std::unordered_map range_; + protected: std::string GetBufferRef(Type t, const Variable* buffer, Expr index); }; diff --git a/tvm/src/codegen/hlsc/codegen_vhls.cc b/tvm/src/codegen/hlsc/codegen_vhls.cc index 6a0977e40..f944bef83 100644 --- a/tvm/src/codegen/hlsc/codegen_vhls.cc +++ b/tvm/src/codegen/hlsc/codegen_vhls.cc @@ -21,12 +21,83 @@ namespace TVM { namespace codegen { +void CodeGenVivadoHLS::PreProcess(std::ostringstream& os) { + os << "\n"; + int indent = 2; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + + // create local buffer saving result + auto shape = std::get<2>(arg_top_vars[v]); + auto dtype = std::get<1>(arg_top_vars[v]); + if (!stream_table[v]) { // unstreamed args + // allocate local buffer + for (int k = 0; k < indent; k++) os << ' '; + PrintType(dtype, os); + os << " " << arg_name << "["; + for (size_t n = 0; n < shape.size(); n++) { + os << shape[n]; + if (n != shape.size() - 1) os << "* "; + } + os << "];\n"; + + for (size_t j = 0; j < shape.size(); j++) { + for (int k = 0; k < indent; k++) os << ' '; + os << "for (int i" << j << " = 0; i" + << j << "< " << shape[j] << "; i" + << j << "++) {\n"; + // pass stream reference + if (j == shape.size() - 1) { + for (int k = 0; k < indent; k++) os << ' '; + os << " " << arg_name << "[" + << getIndex(shape) << "] = " + << "fd_" << arg_name << ".read();\n"; + } + indent += 2; + } + for (size_t m = 0; m < shape.size(); m++) { + indent -= 2; + for (int k = 0; k < indent; k++) os << ' '; + os << "}\n"; + } + } else if (i == arg_vars.size() - 1 || true) { + // allocate for return variable + for (int k = 0; k < indent; k++) os << ' '; + PrintType(dtype, os); + os << " " << arg_name << "["; + for (size_t n = 0; n < shape.size(); n++) { + os << shape[n]; + if (n != shape.size() - 1) os << "* "; + } + os << "];\n"; + } + } +} + +void CodeGenVivadoHLS::PostProcess(std::ostringstream& os) { +// os << "\n"; +// int indent = 2; +// for (size_t i = 0; i < arg_vars.size(); i++) { +// auto v = arg_vars[i]; +// std::string arg_name; +// if (stream_table[v]) +// arg_name = std::get<0>(arg_top_vars[v]); +// else arg_name = GetVarID(v); +// os << arg_name << " = " << "fd_" +// << arg_name << ".write();\n"; +} + void CodeGenVivadoHLS::AddFunction(LoweredFunc f, str2tupleMap map_arg_type) { // Write header files - this->stream << "#include \n"; - this->stream << "#include \n"; - this->stream << "#include \n\n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n"; + this->decl_stream << "#include \n\n"; CodeGenHLSC::AddFunction(f, map_arg_type); if (soda_header_.is_open()) soda_header_.close(); @@ -77,6 +148,13 @@ void 
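// A hedged sketch (tensor name "a" and index "i" are illustrative, not taken
// from a real run) of the line the Store handler below prints when a store's
// value is a StreamExpr: the "_stream_send" suffix is stripped from the SSA
// name and the element is pulled from the matching stream handle, e.g.
//   a[i] = fd_a.read();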
CodeGenVivadoHLS::VisitStmt_(const Store* op) { this->stream << ref << "[" << PrintExpr(sb->index) << "] = " << PrintExpr(sb->value) << ";\n"; + } else if (const StreamExpr* se = op->value.as()) { + std::string vid = GetVarID(se->buffer_var.get()); + vid = vid.substr(0, vid.find("_stream_send")); + PrintIndent(); + this->stream << vid << "[" + << op->index << "] = " + << "fd_" << vid << ".read();\n"; } else { CodeGenC::VisitStmt_(op); } @@ -143,6 +221,30 @@ void CodeGenVivadoHLS::VisitStmt_(const Partition* op) { stream << "\n"; } +void CodeGenVivadoHLS::VisitExpr_(const StreamExpr* op, std::ostream& os) { + CodeGenC::VisitExpr_(op, os); + std::string vid = GetVarID(op->buffer_var.get()); + vid = vid.substr(0, vid.find("_stream_send")); + os << vid << ".read()"; +} + +void CodeGenVivadoHLS::VisitStmt_(const StreamStmt* op) { + CodeGenC::VisitStmt_(op); + std::string vid = GetVarID(op->buffer_var.get()); + switch (op->stream_type) { + case StreamType::Channel: + break; + case StreamType::FIFO: + break; + case StreamType::Pipe: + break; + } + vid = vid.substr(0, vid.find("_stream_send")); + auto load = op->value.as(); + stream << "fd_" << vid << ".write(" + << vid << "["<< load->index << "]);\n"; +} + class AllocateCollector final : public IRVisitor { public: AllocateCollector(std::vector& alloc_list, @@ -160,6 +262,144 @@ class AllocateCollector final : public IRVisitor { VarExprUnorderedSet& outputs_; }; +void CodeGenVivadoHLS::VisitStmt_(const AttrStmt* op) { + if (op->attr_key == ir::attr::device_scope) { + // print top( ... in host and enter fpga scope + if (op->value.as()->value == "fpga" && !fpga_scope_) { + fpga_scope_ = true; + PrintIndent(); + + // track the stream usage + StreamCollector collector(stream_table, "cpu"); + collector.Visit(op->body); + + // update data type and name + for (auto k : collector.host_undefined_) { + auto v = k.get(); + arg_vars.push_back(v); + stream_table[v] = true; + auto tuple = arg_top_vars[v]; + arg_top_vars[v] = std::make_tuple(v->name_hint, + std::get<1>(tuple), + std::get<2>(tuple)); + } + TypeCollector visitor(arg_top_vars); + visitor.Visit(op->body); + + // generate function calls + stream << "top("; + for (size_t i = 0; i < arg_vars.size(); i++) { + auto v = arg_vars[i]; + std::string arg_name; + if (stream_table[v]) + arg_name = std::get<0>(arg_top_vars[v]); + else arg_name = GetVarID(v); + if (i != 0) stream << ", "; + stream << "fd_" << arg_name; + + // generate kernel func definition + if (i != 0) arg_stream << ", "; + arg_stream << "hls::stream<"; + PrintType(std::get<1>(arg_top_vars[v]), arg_stream); + auto shape = std::get<2>(arg_top_vars[v]); + arg_stream << ">& fd_" << arg_name; + } + stream << ");\n"; + + // switch context to device scope + host_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // switch from device to host + } else if (op->value.as()->value == "cpu" && + fpga_scope_) { + fpga_scope_ = false; + device_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + } + this->PrintStmt(op->body); + } else { + CodeGenC::VisitStmt_(op); + } +} + +void CodeGenVivadoHLS::VisitStmt_(const KernelStmt *op) { + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + if (stream_arg_pos[op->name].count(i)) + stream << "fd_"; + PrintExpr(op->args[i], stream); + if (i < op->args.size() -1) stream << ", "; + } + stream << ");\n"; +}
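// A minimal sketch (kernel name, element type, and extents are assumed for
// illustration, not taken from a real run) of the signature the KernelDef
// handler below prints: argument positions listed in op->channels become
// hls::stream references, while ordinary buffers collapse into row-major
// one-dimensional arrays, e.g.
//   void my_kernel(hls::stream<int>& a, int b[256]) { ... }
+ +void CodeGenVivadoHLS::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + // save func states + 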
CodeGenC::SaveFuncState(f); + CodeGenC::InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. + for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + // print function signature + PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + for (size_t k = 0; k < op->channels.size(); k+=2) { + int pos = op->channels[k].as()->value; + stream_arg_pos[op->name].insert(pos); + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) stream << ", "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + + // pass the stream channel reference + // TODO: broadcast in hlsc (one wr multi read) + if (stream_arg_pos[op->name].count(i)) { + stream << "hls::stream<"; + PrintType(type, stream); + stream << ">& " << vid; + } else { + PrintType(type, stream); + this->stream << " " << vid << "["; + int mul = 1; + for (size_t j = 0; j < op->api_args[i].size(); j++) { + auto dim = op->api_args[i][j].as()->value; + mul = mul * dim; + } + this->stream << mul << "]"; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); +} + void CodeGenVivadoHLS::VisitStmt_(const Stencil* op) { // Use SODA codegen for stencil analysis CodeGenSODA cg_soda; diff --git a/tvm/src/codegen/hlsc/codegen_vhls.h b/tvm/src/codegen/hlsc/codegen_vhls.h index 5486be1dc..6462251db 100644 --- a/tvm/src/codegen/hlsc/codegen_vhls.h +++ b/tvm/src/codegen/hlsc/codegen_vhls.h @@ -23,11 +23,19 @@ class CodeGenVivadoHLS final : public CodeGenHLSC { void VisitExpr_(const GetBit* op, std::ostream& os) override; void VisitExpr_(const GetSlice* op, std::ostream& os) override; + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; void VisitStmt_(const Store* op) override; void VisitStmt_(const For* op) override; void VisitStmt_(const Partition* op) override; void VisitStmt_(const Stencil* op) override; + void VisitStmt_(const StreamStmt* op) override; + void VisitStmt_(const AttrStmt* op) override; + void VisitStmt_(const KernelDef* op) override; + void VisitStmt_(const KernelStmt* op) override; + + void PreProcess(std::ostringstream& os); + void PostProcess(std::ostringstream& os); private: std::ofstream soda_header_; }; diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc index 56b4e1d97..d6fa1c6ba 100644 --- a/tvm/src/codegen/merlinc/codeanalys_merlinc.cc +++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.cc @@ -652,6 +652,9 @@ void CodeAnalysMerlinC::VisitExpr_(const Broadcast* op, std::ostream& os) { // LOG(FATAL) << "Broadcast: not supported "; } +void CodeAnalysMerlinC::VisitExpr_(const StreamExpr* op, std::ostream& os) { // NOLINT(*) +} + void CodeAnalysMerlinC::VisitExpr_(const Select* op, std::ostream& os) { // NOLINT(*) os << "("; PrintExpr(op->condition, os); @@ -716,10 +719,8 @@ void CodeAnalysMerlinC::VisitExpr_(const Quantize *op, std::ostream& os) { // NO } void CodeAnalysMerlinC::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) - 
LOG(FATAL) << "KernelExpr is not yet support"; } - void CodeAnalysMerlinC::VisitStmt_(const LetStmt* op) { // TODO comaniac //std::vector vec_var = GetNodesByType(op->value); @@ -882,11 +883,9 @@ void CodeAnalysMerlinC::VisitStmt_(const ProducerConsumer *op) { } void CodeAnalysMerlinC::VisitStmt_(const KernelDef *op) { - LOG(FATAL) << "KernelDef is not yet support"; } void CodeAnalysMerlinC::VisitStmt_(const KernelStmt *op) { - LOG(FATAL) << "KernelStmt is not yet support"; } void CodeAnalysMerlinC::VisitStmt_(const Return *op) { @@ -917,6 +916,8 @@ void CodeAnalysMerlinC::VisitStmt_(const Reuse *op) { void CodeAnalysMerlinC::VisitStmt_(const Partition *op) {} +void CodeAnalysMerlinC::VisitStmt_(const StreamStmt *op) {} + void CodeAnalysMerlinC::VisitStmt_(const Stencil *op) { PrintStmt(op->body); } diff --git a/tvm/src/codegen/merlinc/codeanalys_merlinc.h b/tvm/src/codegen/merlinc/codeanalys_merlinc.h index 6ba082f09..421f0d96f 100644 --- a/tvm/src/codegen/merlinc/codeanalys_merlinc.h +++ b/tvm/src/codegen/merlinc/codeanalys_merlinc.h @@ -112,6 +112,7 @@ class CodeAnalysMerlinC : void VisitExpr_(const SetSlice* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const Quantize* op, std::ostream& os) override; // NOLINT(*) void VisitExpr_(const KernelExpr* op, std::ostream& os) override; // NOLINT(*) + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; // NOLINT(*) // statment void VisitStmt_(const LetStmt* op) override; void VisitStmt_(const Store* op) override; @@ -131,6 +132,7 @@ class CodeAnalysMerlinC : void VisitStmt_(const Reuse* op) override; void VisitStmt_(const Partition* op) override; void VisitStmt_(const Stencil* op) override; + void VisitStmt_(const StreamStmt* op) override; /*! * Print Type represetnation of type t. * \param t The type representation. 
diff --git a/tvm/src/codegen/opencl/build_opencl.cc b/tvm/src/codegen/opencl/build_opencl.cc new file mode 100755 index 000000000..f5b1352a7 --- /dev/null +++ b/tvm/src/codegen/opencl/build_opencl.cc @@ -0,0 +1,61 @@ +#include "./codegen_aocl.h" +#include "./codegen_sdaccel.h" +#include "../build_common.h" +#include "./sdaccel_module.h" +#include "../merlinc/codeanalys_merlinc.h" + +namespace TVM { +namespace codegen { + +#if HCL_SDACCEL_RUNTIME +runtime::Module BuildSDAccelSim(Array funcs) { + CodeAnalysMerlinC ca; + CodeGenSDACCEL cg; + for (LoweredFunc f : funcs) { + // 1st pass: Analyze AST and collect necessary information + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + // 2nd pass: Generate kernel code + cg.AddFunction(f, map_arg_type); + } + std::string code = cg.Finish(); + return runtime::CreateSDAccelModule(funcs[0], code); +} + +TVM_REGISTER_API("codegen.build_sdaccel_csim") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildSDAccelSim(args[0]); + }); +#endif + + +template <class CodeGen> +std::string BuildOpenCL(Array funcs){ + using TVM::runtime::Registry; + CodeAnalysMerlinC ca; + CodeGen cg; + for(LoweredFunc f: funcs){ + ca.AddFunction(f); + str2tupleMap map_arg_type; + map_arg_type = ca.Finish(); + cg.AddFunction(f, map_arg_type); + } + std::string code = cg.Finish(); + + LOG(WARNING) << "OpenCL doesn't have runtime, return kernel code"; + return code; +} + + +TVM_REGISTER_API("codegen.build_sdaccel") +.set_body([]( TVMArgs args, TVMRetValue * rv ) { + * rv = BuildOpenCL<CodeGenSDACCEL>(args[0]); + }); + +TVM_REGISTER_API("codegen.build_aocl") +.set_body([]( TVMArgs args, TVMRetValue * rv ) { + * rv = BuildOpenCL<CodeGenAOCL>(args[0]); + }); +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_aocl.cc b/tvm/src/codegen/opencl/codegen_aocl.cc new file mode 100644 index 000000000..6d3247d02 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_aocl.cc @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include +#include "./codegen_aocl.h" +#include "../../runtime/thread_storage_scope.h" + +namespace TVM { +namespace codegen { + +inline Type String2Type(std::string& s) { + if (s.front() == '\"' && s.back() == '\"') { + s.erase(0, 1); + s.pop_back(); + } + std::istringstream is(s); + halideir_type_code_t code = Type::Int; + if (s.substr(0, 3) == "int") { + code = Type::Int; s = s.substr(3); + } else if (s.substr(0, 4) == "uint") { + code = Type::UInt; s = s.substr(4); + } else if (s.substr(0, 5) == "float") { + code = Type::Float; s = s.substr(5); + } else if (s == "handle") { + return Handle(); + } else { + LOG(FATAL) << "unknown type " << s; + } + int bits = 32, lanes = 1; + if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) { + LOG(FATAL) << "unknown type " << s; + } + return Type(code, bits, lanes); +} + +void CodeGenAOCL::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { + // Clear previous generated state + this->InitFuncState(f); + for (Var arg: f->args) { + if (arg.type().is_handle()) { + alloc_storage_scope_[arg.get()] = "global"; + } + } + + // Skip the first underscore, so SSA variable starts from _1 + GetUniqueName("_"); + + // Register alloc buffer type + for (const auto & kv : f->handle_data_type) { + RegisterHandleType(kv.first.get(), kv.second.type()); + } + + this->decl_stream << "#include \"ihc_apint.h\"" << "\n"; + this->decl_stream << "#pragma OPENCL EXTENSION cl_intel_arbitrary_precision_integers : enable\n"; + this->stream << 
"__kernel " << "void " << f->name << "("; + + // Write arguments + for (size_t i = 0; i < f->args.size(); ++i) { + // alloc or get var name + Var v = f->args[i]; + std::string vid; + if (!var_idmap_.count(v.get())) + vid = AllocVarID(v.get()); + else vid = GetVarID(v.get()); + + if (i != 0) this->stream << ", "; + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; + } + else { + auto arg = map_arg_type[vid]; + this->stream << "__global "; + PrintType(std::get<1>(arg), this->stream); + if (v.type().is_handle()) + this->stream << "*"; + this->stream << ' ' << "restrict "; + this->stream << std::get<0>(arg); + } + } + stream << ") {\n"; + int func_scope = this->BeginScope(); + this->PrintStmt(f->body); + this->EndScope(func_scope); + this->PrintIndent(); + // this->stream << ' '<< ' ' << "return;\n"; + this->stream << "}\n\n"; +} + +void CodeGenAOCL::PrintType(Type t, std::ostream &os) +{ + int lanes = t.lanes(); + if(t.is_handle()) { + os << "void*";return; + } + if(t == Bool()) { + os <<"bool"; return; + } + CHECK_EQ(lanes, 1) + << "do not yet support vector types"; + + bool fail = false; + if(t.is_float()) { + switch(t.bits()) + { + case 16: + os<<"half"; + // enable_fp16_ = true; + break; + case 32: + os<<"float"; + break; + case 64: + os<< "double"; + // enable_fp64_ = true; + break; + default: + fail = true; + break; + } + if(!fail && lanes ==1) return; + if(!fail&&(lanes >= 2 && lanes <=16)) + { + os<=2 && lanes <= 16)) { + os << lanes; return; + } + if(fail && lanes==1) { + if(t.is_uint()) { + if (t.bits() > 64) { + os << "uint" << "64" << "_t"; return; + } else { + os<< "ap_uint<"<< t.bits() <<"> uintd_t"; return; + } + } + if(t.is_int()) { + if (t.bits() > 64) { + os << "int" << "64" << "_t"; return; + } else { + os << "ap_int<" << t.bits() << "> intd_t"; return; + } + } + } + } + + LOG(FATAL) << "Cannot convert type"<for_type == ForType::Unrolled) { + int unroll_factor = 0, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto factor = op->annotate_values[i].as(); + if (str->value == "factor" && factor != nullptr && factor->value > 1) { + unroll_factor = factor->value; + break; + } + } + i++; + } + os << "#pragma unroll"; + if (unroll_factor > 0) os << " " << unroll_factor << "\n"; + else os << "\n"; + } + else if (op->for_type == ForType::Pipelined) { + int II = 1, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto initiation_interval = op->annotate_values[i].as(); + if (str->value == "initiation_interval" && + initiation_interval != nullptr && + initiation_interval->value > 1) { + II = initiation_interval->value; + break; + } + } + i++; + } + os << "#pragma"; + os << " ii " << II << "\n"; + } + CodeGenAOCL::GenForStmt(op, os.str(), true); +} + +void CodeGenAOCL::VisitExpr_(const StreamExpr* op, std::ostream& os) { + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); + int i = 0; + for (auto key : op->annotate_keys) { + auto str = key.as(); + auto val = op->annotate_values[i].as(); + if (str->value == "name" && val != nullptr) { + vid = val->value; + decl_stream << "channel "; + PrintType(op->type, decl_stream); + decl_stream << " " << vid << ";\n"; + } + i++; + } + switch (op->stream_type) { + case StreamType::Channel: + os << "read_channel_intel("; + os << vid << ")"; + break; + case StreamType::Pipe: + os << 
"read_pipe("; + break; + case StreamType::FIFO: + // buffered channel + os << "fifo"; + break; + } +} + +void CodeGenAOCL::VisitStmt_(const KernelDef* op) { + LoweredFunc f; + SaveFuncState(f); + InitFuncState(f); + std::ostringstream save; + save << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + + // skip the first underscore + GetUniqueName("_"); + // add to alloc buffer : type. + for (const auto & k : op->args) { + RegisterHandleType(k.get(), k.get()->type); + } + stream << "__kernel "; + const UIntImm* is_void = op->ret_void.as(); + if (is_void) stream << "void"; + else PrintType(op->ret_type, stream); + stream << " " << op->name << "("; + + // streamed arg position to channel index + std::unordered_map stream_args; + for (size_t j = 0; j < op->channels.size(); j=j+2) { + int pos = op->channels[j].as()->value; + int idx = op->channels[j+1].as()->value; + stream_args[pos] = idx; + } + for (size_t i = 0; i < op->args.size(); ++i) { + VarExpr v = op->args[i]; + var_shape_map_[v.get()] = op->api_args[i]; + std::string vid = AllocVarID(v.get()); + if (stream_args.count(i)) { + stream_arg_pos[op->name].insert(i); + if (!stream_pragma) { + decl_stream << "#pragma OPENCL EXTENSION cl_intel_channels : enable\n"; + stream_pragma = true; + } + } else { + if (i != 0) { + if (stream_args.count(i-1)) void(0); + else stream << ", "; + } // un-streamed argument + this->stream << "__global "; + std::string str = PrintExpr(op->api_types[i]); + Type type = String2Type(str); + PrintType(type, stream); + this->stream << "* restrict " << vid; + } + } + stream << ") {\n"; + int func_scope = BeginScope(); + range_ = CollectIterRange(op->body); + PrintStmt(op->body); + EndScope(func_scope); + stream << "}\n\n"; + + // restore default stream + module_stream << this->stream.str(); + this->stream.str(""); + this->stream.clear(); + this->stream << save.str(); + RestoreFuncState(f); +} + +void CodeGenAOCL::VisitStmt_(const KernelStmt *op) { + PrintIndent(); + stream << op->name << "("; + for (size_t i = 0; i < op->args.size(); i++) { + std::string str = op->name + "." 
+ PrintExpr(op->args[i]); + if (!stream_arg_pos[op->name].count(i)) { + if (i != 0) { + if (stream_arg_pos[op->name].count(i-1)) void(0); + else stream << ", "; + } + PrintExpr(op->args[i], stream); + } + } + stream << ");\n"; +} + +void CodeGenAOCL::VisitExpr_(const KernelExpr *op, std::ostream& os) { // NOLINT(*) + os << op->name << "("; + for (size_t i = 0; i < op->args.size(); ++i) { + if (!stream_arg_pos[op->name].count(i)) { + if (i != 0) { + if (stream_arg_pos[op->name].count(i-1)) void(0); + else os << ", "; + } + PrintExpr(op->args[i], os); + } + } + os << ")"; +} + +void CodeGenAOCL::VisitStmt_(const StreamStmt* op) { + std::string vid; + if (!var_idmap_.count(op->buffer_var.get())) + vid = AllocVarID(op->buffer_var.get()); + else vid = GetVarID(op->buffer_var.get()); + PrintIndent(); + int i = 0; + for (auto key : op->annotate_keys) { + auto str = key.as(); + auto val = op->annotate_values[i].as(); + if (str->value == "name" && val != nullptr) vid = val->value; + i++; + } + switch (op->stream_type) { + case StreamType::Channel: + stream << "write_channel_intel("; + stream << vid << ", "; + break; + case StreamType::Pipe: + stream << "write_pipe("; + stream << vid << ", "; + break; + case StreamType::FIFO: + stream << "fifo("; + break; + } + PrintExpr(op->value, stream); + stream << ");\n"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_aocl.h b/tvm/src/codegen/opencl/codegen_aocl.h new file mode 100755 index 000000000..5778b70ec --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_aocl.h @@ -0,0 +1,34 @@ +#ifndef TVM_CODEGEN_CODEGEN_AOCL_H_ +#define TVM_CODEGEN_CODEGEN_AOCL_H_ + +# include +# include +# include "./codegen_opencl.h" + +namespace TVM { +namespace codegen { + +class CodeGenAOCL : public CodeGenOpenCL { + public: + CodeGenAOCL(){} + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + + void VisitStmt_(const For* op) override; //NOLINT(*) + void VisitStmt_(const StreamStmt* op) override; //NOLINT(*) + void VisitStmt_(const KernelDef* op) override; //NOLINT(*) + void VisitStmt_(const KernelStmt* op) override; //NOLINT(*) + + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const KernelExpr* op, std::ostream& os) override; //NOLINT(*) + + private: + // whether to enable streaming + bool stream_pragma{false}; + // map from kernel name to set of streamed arg position index + std::unordered_map> stream_arg_pos; +}; +} // namespace codegen +} // namespace TVM + +#endif // TVM_CODEGEN_CODEGEN_AOCL_H_ diff --git a/tvm/src/codegen/codegen_opencl.cc b/tvm/src/codegen/opencl/codegen_opencl.cc old mode 100644 new mode 100755 similarity index 53% rename from tvm/src/codegen/codegen_opencl.cc rename to tvm/src/codegen/opencl/codegen_opencl.cc index d0297a1d9..979a19e0f --- a/tvm/src/codegen/codegen_opencl.cc +++ b/tvm/src/codegen/opencl/codegen_opencl.cc @@ -1,206 +1,239 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file codegen_opencl.cc - */ -#include -#include -#include -#include -#include "./codegen_opencl.h" -#include "../runtime/thread_storage_scope.h" - -namespace TVM { -namespace codegen { - -CodeGenOpenCL::CodeGenOpenCL() { - restrict_keyword_ = "restrict"; -} - -void CodeGenOpenCL::InitFuncState(LoweredFunc f) { - CodeGenC::InitFuncState(f); - for (Var arg : f->args) { - if (arg.type().is_handle()) { - alloc_storage_scope_[arg.get()] = "global"; - } - } -} - -void CodeGenOpenCL::AddFunction(LoweredFunc f) { - this->stream << "__kernel "; - CodeGenC::AddFunction(f); -} - -std::string CodeGenOpenCL::Finish() { - // inject extension enable pragma for fp16 and fp64 - if (enable_fp16_) { - decl_stream - << "#ifdef cl_khr_fp16\n" - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#elif defined(cl_amd_fp16)\n" - "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" - "#else\n" - "#error \"Half precision floating point not supported" - "by OpenCL implementation on your device.\" \n" - "#endif\n\n"; - } - - if (enable_fp64_) { - decl_stream - << "#ifdef cl_khr_fp64\n" - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" - "#elif defined(cl_amd_fp64)\n" - "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" - "#else\n" - "#error \"Double precision floating point not supported" - "by OpenCL implementation on your device.\" \n" - "#endif\n\n"; - } - - return CodeGenC::Finish(); -} - -void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { - CHECK(!var_idmap_.count(iv->var.get())); - runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); - std::ostringstream os; - if (ts.rank == 1) { - os << "get_local_id(" << ts.dim_index << ")"; - } else { - os << "get_group_id(" << ts.dim_index << ")"; - } - var_idmap_[iv->var.get()] = - CastFromTo(os.str(), UInt(64), iv->var.type()); -} - -void CodeGenOpenCL::PrintType(Type t, std::ostream& os) { // NOLINT(*) - int lanes = t.lanes(); - if (t.is_handle()) { - CHECK_EQ(lanes, 1) - << "do not yet support vector types"; - os << "void*"; return; - } - bool fail = false; - if (t.is_float()) { - switch (t.bits()) { - case 16: - os << "half"; - enable_fp16_ = true; - break; - case 32: os << "float"; break; - case 64: - os << "double"; - enable_fp64_ = true; - break; - default: fail = true; break; - } - if (!fail && lanes == 1) return; - if (!fail && (lanes >= 2 && lanes <= 16)) { - os << lanes; return; - } - } else if (t.is_uint() || t.is_int()) { - if (t.is_uint()) { - os << 'u'; - } - if (t.bits() == 8 && t.lanes() == 4) { - // directly 4 8 bit int in integer. 
- os << "int"; return; - } - switch (t.bits()) { - case 8: os << "char"; break; - case 16: os << "short"; break; - case 32: os << "int"; break; - case 64: os << "long"; break; - case 1: os << "int"; break; - default: fail = true; break; - } - if (!fail && lanes == 1) return; - if (!fail && (lanes >= 2 && lanes <= 16)) { - os << lanes; return; - } - } - LOG(FATAL) << "Cannot convert type " << t << " to OpenCL type"; -} - -void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t, - Expr base, std::ostream& os) { // NOLINT(*) - if (!HandleTypeMatch(buffer, t.element_of())) { - os << '('; - auto it = alloc_storage_scope_.find(buffer); - if (it != alloc_storage_scope_.end()) { - PrintStorageScope(it->second, os); - } - os << ' '; - PrintType(t.element_of(), os); - os << "*)"; - } - os << GetVarID(buffer) << " + "; - PrintExpr(base, os); -} -std::string CodeGenOpenCL::GetVecLoad( - Type t, const Variable* buffer, Expr base) { - std::ostringstream os; - os << "vload" << t.lanes() << "(0, "; - PrintVecAddr(buffer, t, base, os); - os << ")"; - return os.str(); -} - -void CodeGenOpenCL::PrintVecStore(const Variable* buffer, - Type t, Expr base, - const std::string& value) { - this->PrintIndent(); - stream << "vstore" << t.lanes() << "(" << value << ", 0, "; - PrintVecAddr(buffer, t, base, stream); - stream << ");\n"; -} - -void CodeGenOpenCL::PrintStorageSync(const Call* op) { - const std::string& sync = op->args[0].as()->value; - if (sync == "warp") { - LOG(FATAL) << "warp sync not supported in opencl"; - } else if (sync == "shared") { - this->PrintIndent(); - this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n"; - } else if (sync == "global") { - LOG(FATAL) << "not supported"; - } -} - -void CodeGenOpenCL::PrintStorageScope( - const std::string& scope, std::ostream& os) { // NOLINT(*) - if (scope == "global") { - os << "__global"; - } else if (scope == "shared") { - os << "__local"; - } -} - -std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) { - if (from == target) return value; - std::ostringstream os; - if (target.lanes() == 1) { - os << "(("; - this->PrintType(target, os); - os << ")" << value << ")"; - } else { // convert vector type - os << "("; - os << "convert_"; - this->PrintType(target, os); - os << "(" << value << "))"; - } - return os.str(); -} - -void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) - std::string v = PrintExpr(op->value); - os << "(("; - PrintType(op->type, os); - os << ")("; - for (int i = 0; i < op->lanes; ++i) { - if (i != 0) os << ", "; - os << v; - } - os << "))"; -} -} // namespace codegen -} // namespace TVM +# include +# include +# include +# include +# include +# include +# include "./codegen_opencl.h" +# include "../../runtime/thread_storage_scope.h" + +namespace TVM{ +namespace codegen{ + +CodeGenOpenCL::CodeGenOpenCL(){ + restrict_keyword_ = "restrict"; +} + +std::string CodeGenOpenCL::Finish() { + // inject extension enable pragma for fp16 and fp64 + if (enable_fp16_) { + decl_stream + << "#ifdef cl_khr_fp16\n" + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#elif defined(cl_amd_fp16)\n" + "#pragma OPENCL EXTENSION cl_amd_fp16 : enable\n" + "#else\n" + "#error \"Half precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + if (enable_fp64_) { + decl_stream + << "#ifdef cl_khr_fp64\n" + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + "#elif defined(cl_amd_fp64)\n" + "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" + 
"#else\n" + "#error \"Double precision floating point not supported" + "by OpenCL implementation on your device.\" \n" + "#endif\n\n"; + } + + return CodeGenC::Finish(); +} + +void CodeGenOpenCL::BindThreadIndex(const IterVar& iv) { + CHECK(!var_idmap_.count(iv->var.get())); + runtime::ThreadScope ts = runtime::ThreadScope::make(iv->thread_tag); + std::ostringstream os; + if (ts.rank == 1) { + os << "get_local_id(" << ts.dim_index << ")"; + } else { + os << "get_group_id(" << ts.dim_index << ")"; + } + var_idmap_[iv->var.get()] = + CastFromTo(os.str(), UInt(64), iv->var.type()); +} + + +void CodeGenOpenCL::PrintVecAddr(const Variable* buffer, Type t, + Expr base, std::ostream& os) { // NOLINT(*) + if (!HandleTypeMatch(buffer, t.element_of())) { + os << '('; + auto it = alloc_storage_scope_.find(buffer); + if (it != alloc_storage_scope_.end()) { + PrintStorageScope(it->second, os); + } + os << ' '; + PrintType(t.element_of(), os); + os << "*)"; + } + os << GetVarID(buffer) << " + "; + PrintExpr(base, os); +} +std::string CodeGenOpenCL::GetVecLoad( + Type t, const Variable* buffer, Expr base) { + std::ostringstream os; + os << "vload" << t.lanes() << "(0, "; + PrintVecAddr(buffer, t, base, os); + os << ")"; + return os.str(); +} + +void CodeGenOpenCL::PrintVecStore(const Variable* buffer, + Type t, Expr base, + const std::string& value) { + this->PrintIndent(); + stream << "vstore" << t.lanes() << "(" << value << ", 0, "; + PrintVecAddr(buffer, t, base, stream); + stream << ");\n"; +} + +void CodeGenOpenCL::PrintStorageSync(const Call* op) { + const std::string& sync = op->args[0].as()->value; + if (sync == "warp") { + LOG(FATAL) << "warp sync not supported in opencl"; + } else if (sync == "shared") { + this->PrintIndent(); + this->stream << "barrier(CLK_LOCAL_MEM_FENCE);\n"; + } else if (sync == "global") { + LOG(FATAL) << "not supported"; + } +} + +void CodeGenOpenCL::PrintStorageScope( + const std::string& scope, std::ostream& os) { // NOLINT(*) + if (scope == "global") { + // os << "global "; + } else if (scope == "shared") { + // os << "local "; + } +} + +std::string CodeGenOpenCL::CastFromTo(std::string value, Type from, Type target) { + if (from == target) return value; + std::ostringstream os; + if (target.lanes() == 1) { + os << "(("; + this->PrintType(target, os); + os << ")" << value << ")"; + } else { // convert vector type + os << "("; + os << "convert_"; + this->PrintType(target, os); + os << "(" << value << "))"; + } + return os.str(); +} + +void CodeGenOpenCL::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLINT(*) + std::string v = PrintExpr(op->value); + os << "(("; + PrintType(op->type, os); + os << ")("; + for (int i = 0; i < op->lanes; ++i) { + if (i != 0) os << ", "; + os << v; + } + os << "))"; +} + +void CodeGenOpenCL::VisitExpr_(const Call * op, std::ostream& os) { // NOLINT(*) + if (op->is_intrinsic(intrinsic::tvm_if_then_else)) { + os << "("; + PrintType(op->args[2].type(), os); + os << ")"; + } + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenCL::VisitStmt_(const LetStmt* op) { + std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); + PrintType(op->var.type(), this->stream); + this->stream << ' ' + << vid + << " = " << value << ";\n"; + } + PrintStmt(op->body); +} + + +void CodeGenOpenCL::VisitExpr_(const FloatImm * op, std::ostream& os) 
{ // NOLINT(*) + if (std::isinf(op->value)) { + if ( op->value < 0) { + os << "-"; + } + os << "INFINITY"; + } else if (std::isnan(op->value)) { + os << "NAN"; + } else { + CodeGenC::VisitExpr_(op, os); + } +} + +void CodeGenOpenCL::VisitExpr_(const Select * op, std::ostream& os) { // NOLINT(*) + os << "("; + PrintType(op->true_value.type(), os); + os << ")"; + CodeGenC::VisitExpr_(op, os); +} + +void CodeGenOpenCL::VisitStmt_(const IfThenElse* op) { + std::string cond = PrintExpr(op->condition); + // Skip the buffer data checking + if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)"))) + return; + PrintIndent(); + if (cond[0] == '(' && cond[cond.length() - 1] == ')') { + stream << "if " << cond << " {\n"; + } else { + stream << "if (" << cond << ") {\n"; + } + int then_scope = BeginScope(); + PrintStmt(op->then_case); + this->EndScope(then_scope); + if (op->else_case.defined()) { + PrintIndent(); + stream << "} else {\n"; + int else_scope = BeginScope(); + PrintStmt(op->else_case); + this->EndScope(else_scope); + } + PrintIndent(); + stream << "}\n"; +} + +void CodeGenOpenCL::GenForStmt(const For* op, std::string pragma, bool before) { + std::string extent = PrintExpr(op->extent); + std::string vid = AllocVarID(op->loop_var.get()); + CHECK(is_zero(op->min)); + if (before && pragma.length() > 0) { + PrintIndent(); + stream << pragma; + } + PrintIndent(); + stream << "for ("; + PrintType(op->loop_var.type(), stream); + stream << ' ' << vid << " = 0; " + << vid << " < " << extent + << "; ++" << vid << ") {\n"; + if (!before && pragma.length() > 0) { + PrintIndent(); + stream << pragma; + } + int for_scope = BeginScope(); + PrintStmt(op->body); + this->EndScope(for_scope); + PrintIndent(); + stream << "}\n"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_opencl.h b/tvm/src/codegen/opencl/codegen_opencl.h new file mode 100755 index 000000000..4f9a15fe5 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_opencl.h @@ -0,0 +1,50 @@ +#ifndef TVM_CODEGEN_CODEGEN_OPENCL_H_ +#define TVM_CODEGEN_CODEGEN_OPENCL_H_ + +# include +# include +# include +# include "../codegen_c.h" + +namespace TVM{ +namespace codegen{ + +class CodeGenOpenCL : public CodeGenC{ + public: + // void AddFunction(LoweredFunc f); + CodeGenOpenCL(); + virtual void AddFunction(LoweredFunc f, str2tupleMap map_arg_type) = 0; + std::string Finish(); + void BindThreadIndex(const IterVar& iv) override; // NOLINT(*) + void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*) + void PrintStorageSync(const Call* op) override; //NOLINT(*) + // void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + virtual void PrintType(Type t, std::ostream& os) = 0; //NOLINT + std::string GetVecLoad(Type t, const Variable * buffer, + Expr base) override; // NOLINT(*) + void PrintVecStore(const Variable * buffer, Type t, + Expr base, const std::string& value) override; //NOLINT(*) + void PrintVecAddr(const Variable * buffer, Type t, + Expr base, std::ostream& os); //NOLINT(*) + std::string CastFromTo(std::string value, Type from, Type target) override; //NOLINT(*) + + //overload visitor + void VisitExpr_(const Broadcast * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const Call * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const Select * op, std::ostream& os) override; //NOLINT(*) + void VisitExpr_(const FloatImm * op, std::ostream& os) override; //NOLINT(*) + void VisitStmt_(const IfThenElse* op) override; //NOLINT(*) + void 
VisitStmt_(const LetStmt* op) override; // NOLINT + void GenForStmt(const For* op, std::string pragma, bool before); + virtual void VisitStmt_(const For* op) = 0; + +protected: + // fp16 and fp64 extension + bool enable_fp16_{false}; + bool enable_fp64_{false}; +}; + +} // namespace codegen +} // namespace TVM + +#endif diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.cc b/tvm/src/codegen/opencl/codegen_sdaccel.cc new file mode 100644 index 000000000..cba08fa2d --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_sdaccel.cc @@ -0,0 +1,219 @@ +# include +# include +# include +# include +# include "./codegen_sdaccel.h" +# include "../../runtime/thread_storage_scope.h" + +namespace TVM { +namespace codegen { + +void CodeGenSDACCEL::AddFunction(LoweredFunc f, + str2tupleMap map_arg_type) { + // Clear previous generated state + this->InitFuncState(f); + for (Var arg: f->args) { + if (arg.type().is_handle()) { + alloc_storage_scope_[arg.get()] = "global"; + } + } + + // Skip the first underscore, so SSA variable starts from _1 + GetUniqueName("_"); + + // Register alloc buffer type + for (const auto & kv : f->handle_data_type) { + RegisterHandleType(kv.first.get(), kv.second.type()); + } + + this->stream << "__kernel " << "void " << f->name << "("; + + // Write arguments + for (size_t i = 0; i < f->args.size(); ++i) { + Var v = f->args[i]; + std::string vid = AllocVarID(v.get()); + if (i != 0) this->stream << ", "; + if (map_arg_type.find(vid) == map_arg_type.end()) { + LOG(WARNING) << vid << " type not found\n"; + PrintType(v.type(), this->stream); + this->stream << ' ' << vid; + } + else { + auto arg = map_arg_type[vid]; + this->stream << "__global "; + // this->stream << "global "; + PrintType(std::get<1>(arg), this->stream); + if (v.type().is_handle()) + this->stream << "*"; + this->stream << ' ' << std::get<0>(arg); + } + } + stream << ") {\n"; + int func_scope = this->BeginScope(); + this->PrintStmt(f->body); + this->EndScope(func_scope); + this->PrintIndent(); + // this->stream << ' '<< ' ' << "return;\n"; + this->stream << "}\n\n"; +} + +void CodeGenSDACCEL::PrintType(Type t, std::ostream& os) { // NOLINT(*) + int lanes = t.lanes(); + if (t.is_handle()) { + //LOG(FATAL) << "The buffer shouldn't call PrintType for printing type"; + os << "void*"; + return ; + } + bool fail = false; + if (t.is_float()) { + switch (t.bits()) { + case 16: os << "half"; break; + case 32: os << "float"; break; + case 64: os << "double"; break; + // case 128: os << "double double"; break; + default: fail = true; break; + } + if (!fail && lanes == 1) return; + if (!fail && (lanes >= 2 && lanes <= 16)) { + os << lanes; return; + } + } else if (t.is_uint() || t.is_int()) { + if (t.is_uint()) { + os << "unsigned "; + } + if (t.bits() == 8 && t.lanes() == 4) { + // directly 4 8 bit int in integer. 
+ os << "int"; return; + } + + int target_bit = 1; + while (target_bit < t.bits()) + target_bit <<= 1; + + switch (target_bit) { + case 1: os << "int"; break; + case 2: os << "char"; break; + case 4: os << "char"; break; + case 8: os << "char"; break; + case 16: os << "short"; break; + case 32: os << "int"; break; + case 64: os << "long"; break; + case 128: os << "long"; break; // FIXME: Should use long long + default: fail = true; break; + } + if (!fail && lanes == 1) return; + // FIXME: Not yet support multiple lanes + //if (!fail && (lanes >= 2 && lanes <= 16)) { + // os << lanes; return; + //} + } + os << t; + LOG(WARNING) << "Cannot convert type " << t ; + return ; +} + +void CodeGenSDACCEL::PrintStorageScope( + const std::string& scope, std::ostream& os) { // NOLINT(*) + if (scope == "global" || scope == "shared") { + os << "__local "; + } +} + +void CodeGenSDACCEL::VisitStmt_(const For* op) { + std::ostringstream os; + if (op->for_type == ForType::Unrolled) { + int unroll_factor = 0, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto factor = op->annotate_values[i].as(); + if (str->value == "factor" && factor != nullptr && factor->value > 1) { + unroll_factor = factor->value; + break; + } + } + i++; + } + if (unroll_factor > 0) { + os << "__attribute__((opencl_unroll_hint("; + os << unroll_factor << ")))\n"; + } else { + os << "\n"; + } + } + else if (op->for_type == ForType::Pipelined) { + int II = 1, i = 0; + for (auto key : op->annotate_keys) { + if (auto str = key.as()) { + auto initiation_interval = op->annotate_values[i].as(); + if (str->value == "initiation_interval" && + initiation_interval != nullptr && + initiation_interval->value > 1) { + II = initiation_interval->value; + break; + } + } + i++; + } + os << "__attribute__((xcl_pipeline_loop("; + os << II << ")))\n"; + } + CodeGenSDACCEL::GenForStmt(op, os.str(), true); +} + +void CodeGenSDACCEL::VisitStmt_(const Partition* op) { + std::string vid = GetVarID(op->buffer_var.get()); + stream << vid << " "; + if (op->partition_type != PartitionType::Complete) { + stream << "__attribute__((xcl_array_partition("; + switch (op->partition_type) { + // case PartitionType::Complete: + // stream << "complete,"; + // break; + case PartitionType::Block: + stream << "block,"; + break; + case PartitionType::Cyclic: + stream << "cyclic,"; + break; + } + stream << op->factor << ","; + stream << op->dim << ")))\n"; + } else { + if (op->dim == 0) { + stream << "__attribute__((xcl_array_partition))\n"; + } else { + stream << "__attribute__((xcl_array_partition("; + stream << "complete,"; + stream << op->factor << ","; + stream << op->dim << ")))\n"; + } + } +} + +void CodeGenSDACCEL::VisitStmt_(const StreamStmt* op) { + std::string vid = GetVarID(op->buffer_var.get()); + PrintIndent(); + stream << vid; + switch (op->stream_type) { + case StreamType::Channel: + stream << "[channel]"; + break; + case StreamType::FIFO: + stream << "[fifo]"; + break; + case StreamType::Pipe: + stream << "[pipe]"; + break; + } + stream << ".write"; + PrintExpr(op->value, stream); + stream << ";\n"; +} + +void CodeGenSDACCEL::VisitExpr_(const StreamExpr* op, std::ostream& os) { + std::string vid = GetVarID(op->buffer_var.get()); + os << vid << ".read()"; +} + +} // namespace codegen +} // namespace TVM diff --git a/tvm/src/codegen/opencl/codegen_sdaccel.h b/tvm/src/codegen/opencl/codegen_sdaccel.h new file mode 100755 index 000000000..4f1cfa053 --- /dev/null +++ b/tvm/src/codegen/opencl/codegen_sdaccel.h @@ -0,0 +1,29 @@ +#ifndef 
TVM_CODEGEN_CODEGEN_SDACCEL_H_ +#define TVM_CODEGEN_CODEGEN_SDACCEL_H_ + +# include +# include +# include "./codegen_opencl.h" + +namespace TVM { +namespace codegen { + +class CodeGenSDACCEL : public CodeGenOpenCL { + public: + CodeGenSDACCEL(){} + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + + void PrintType(Type t, std::ostream& os) override; //NOLINT(*) + void PrintStorageScope(const std::string& scope, std::ostream& os) override; //NOLINT(*) + + void VisitStmt_(const For* op) override; //NOLINT(*) + void VisitStmt_(const Partition* op) override; //NOLINT(*) + void VisitStmt_(const StreamStmt* op) override; //NOLINT(*) + + void VisitExpr_(const StreamExpr* op, std::ostream& os) override; //NOLINT(*) + +}; +} // namespace codegen +} // namespace TVM + +#endif // TVM_CODEGEN_CODEGEN_SDACCEL_H_ diff --git a/tvm/src/codegen/opencl/sdaccel_module.cc b/tvm/src/codegen/opencl/sdaccel_module.cc new file mode 100644 index 000000000..63f12e86b --- /dev/null +++ b/tvm/src/codegen/opencl/sdaccel_module.cc @@ -0,0 +1,645 @@ +#include "./sdaccel_module.h" +#include +#include +#include +#include +#include +#include +#include + +namespace TVM { +namespace runtime { + +namespace { + +void PrintIndent(std::ofstream& stream, int indent) { + for (int i = 0; i < indent; i++) + stream << ' '; +} + +inline size_t GetTypeSize(TVMType t) { + size_t byte = (t.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + return byte; +} + +inline size_t GetDataSize(TVMArray* arr) { + size_t size = 1; + for (tvm_index_t i = 0; i < arr->ndim; ++i) { + size *= arr->shape[i]; + } + size_t byte = (arr->dtype.bits + 7) / 8; + if (byte > 2){ + if (byte <= 4) byte = 4; + else if (byte <= 8) byte = 8; + else byte = 16; + } + size *= (byte * 8 * arr->dtype.lanes + 7) / 8; + return size; +} + +inline TVMType Type2TVMType(Type t) { + TVMType tt; + if (t.is_int()) tt.code = kDLInt; + else if (t.is_uint()) tt.code = kDLUInt; + else if (t.is_float()) tt.code = kDLFloat; + else LOG(FATAL) << "Unacceptable type: " << t; + tt.bits = static_cast(t.bits()); + tt.fracs = static_cast(t.fracs()); + return tt; +} + +inline std::string Type2Str(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + str += "int"; + } else if (t.code == kDLUInt) { + str += "unsigned int"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2ExtStr(TVMType t) { + std::string str = ""; + if (t.code == kDLInt) { + if (t.fracs > 0) str += "ap_fixed<"; + else str += "ap_int<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLUInt) { + if (t.fracs > 0) str += "ap_ufixed<"; + else str += "ap_uint<"; + str += std::to_string(static_cast(t.bits + t.fracs)); + if (t.fracs > 0) str += ", " + std::to_string(static_cast(t.bits)) + ">"; + else str += ">"; + } else if (t.code == kDLFloat) { + str += "float"; + } else { + LOG(FATAL) << "Unknown type"; + } + return str; +} + +inline std::string Type2Byte(TVMType t) { + std::string str = ""; + if (t.code == kDLFloat) { + str += "float"; + } else if (t.code == kDLInt || t.code == kDLUInt) { + if (t.code == kDLUInt) str += "unsigned"; + str += "int"; + if (t.bits <= 8) str += "8"; + else if (t.bits <= 16) str += "16"; + else if (t.bits <= 32) str += "32"; + else str += "64"; + // str += "_t"; + } + return str; +} + +void 
CollectArgInfo(TVMArgs& args, + LoweredFunc func, + std::vector& arg_sizes, + std::vector& arg_types) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + arg_sizes.push_back(GetDataSize(arr)); + arg_types.push_back(arr->dtype); + } else { + const Variable* var = func->api_args[i].as(); + TVMType t = Type2TVMType(var->type); + arg_sizes.push_back(GetTypeSize(t)); + arg_types.push_back(t); + } + } +} + +void GenSharedMem(TVMArgs& args, + std::vector& shmids, + std::vector& arg_sizes) { + for (int i = 0; i < args.size(); i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + // generate shared memory key and id + // TODO: maybe get the current path?? + key_t key = ftok("/", i+1); + int shmid = shmget(key, arg_sizes[i], 0666|IPC_CREAT); + shmids.push_back(shmid); + // copy mem from TVM args to the shared memory + void* mem = shmat(shmid, nullptr, 0); + memcpy(mem, arr->data, arg_sizes[i]); + } else { + shmids.push_back(0); + } + } +} + +void FreeSharedMem(TVMArgs& args, + const std::vector& shmids, + std::vector& arg_sizes) { + for (size_t i = 0; i < shmids.size(); i++) { + TVMArray* arr = args[i]; + int shmid = shmids[i]; + void* mem = shmat(shmid, nullptr, 0); + memcpy(arr->data, mem, arg_sizes[i]); + shmdt(mem); + shmctl(shmid, IPC_RMID, nullptr); + } +} + +// copy values from the shared mem to local mem +void PrintCopy(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "source_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2;j >= 0;j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = "; + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim - 1; + + int mul2 = 1; + for (int j = arr->ndim-2;j >= 0;j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + if (arr->dtype.fracs > 0) + stream << " >> " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +// copy values from local mem back to shared mem +void PrintCopyBack(TVMArray* arr, + std::ofstream& stream, + int indent, size_t nth_arr) { + for (int i = 0; i < arr->ndim; i++) { + PrintIndent(stream, indent); + stream << "for (size_t i" << i << " = 0; "; + stream << "i" << i << " < " << arr->shape[i] << "; "; + stream << "i" << i << "++) {\n"; + indent += 2; + if (i == arr->ndim-1) { + PrintIndent(stream, indent); + stream << "arg_" << nth_arr; + stream << "[i" << arr->ndim-1; + int mul = 1; + for (int j = arr->ndim-2; j >= 0; j--) { + mul *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul; + } + stream << "] = "; + // stream << Type2ExtStr(arr->dtype); + stream << "source_" << nth_arr; + stream << "[i" << arr->ndim - 1; + int mul2 = 1; + for (int j = arr->ndim-2;j >=0;j--) { + mul2 *= arr->shape[j+1]; + stream << " + i" << j << "*" << mul2; + } + stream << "]"; + if (arr->dtype.fracs > 0) + stream << " << " << static_cast(arr->dtype.fracs); + stream << ";\n"; + } + } + for (int i = 0; i < arr->ndim; i++) { + indent -= 2; + PrintIndent(stream, indent); + stream << "}\n"; + } +} + +void 
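// Taken together, GenSharedMem/FreeSharedMem above implement a System V
// shared-memory round trip with the generated host binary; a condensed
// sketch of the protocol (the size variable and ftok key seed are
// illustrative):
//   int shmid = shmget(ftok("/", 1), size, 0666 | IPC_CREAT);
//   void* mem = shmat(shmid, nullptr, 0);
//   memcpy(mem, arr->data, size);              // stage the input
//   // ... generated host executable runs and overwrites mem ...
//   memcpy(arr->data, mem, size);              // collect the result
//   shmdt(mem); shmctl(shmid, IPC_RMID, nullptr);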
GenMakFile() { + int indent = 0; + std::ofstream stream; + stream.open("sdaccel.mk"); + indent += 4; + + stream << "ifndef XILINX_SDX\n"; + stream << "$(error Environment variable XILINX_SDX is required and should point to SDAccel install area)\n"; + stream << "endif\n"; + + stream << "SDA_FLOW = cpu_emu\n"; + stream << "HOST_SRCS = host.cpp\n"; + stream << "HOST_EXE_DIR=.\n"; + stream << "HOST_EXE = host\n"; + stream << "HOST_CFLAGS = -g -Wall -DFPGA_DEVICE -DC_KERNEL\n"; + stream << "HOST_LFLAGS = \n"; + stream << "KERNEL_SRCS = default_function.cl\n"; + stream << "KERNEL_NAME = default_function\n"; + stream << "KERNEL_DEFS =\n"; + stream << "KERNEL_INCS =\n"; + stream << "XDEVICE=xilinx:adm-pcie-7v3:1ddr:3.0\n"; + stream << "XDEVICE_REPO_PATH=\n"; + stream << "KEEP_TEMP=1\n"; + stream << "KERNEL_DEBUG=\n"; + stream << "XCLBIN_NAME=bin_krnl\n"; + stream << "HOST_CFLAGS+=-DTARGET_DEVICE=\\\"${XDEVICE}\\\"\n"; + stream << "BOARD_SETUP_FILE=setup.sh\n"; + stream << "ifeq (${SDA_FLOW},cpu_emu)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -t sw_emu\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_cpu_emu.xclbin\n"; + stream << "else ifeq (${SDA_FLOW},hw_emu)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -t hw_emu\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_hw_emu.xclbin\n"; + stream << "else ifeq (${SDA_FLOW},hw)\n"; + PrintIndent(stream, indent); + stream << "XCLBIN = ${XCLBIN_NAME}_hw.xclbin\n"; + stream << "CLCC_OPT += -t hw\n"; + stream << "endif\n"; + + stream << "HOST_ARGS = ${XCLBIN}\n"; + stream << "COMMON_DIR = ./common\n"; + stream << "include ${COMMON_DIR}/common.mk\n"; + + stream.close(); +} + +void GenCommonFile() { + int indent = 0; + std::ofstream stream; + stream.open("./common/common.mk"); + indent += 4; + stream << "SHELL = /bin/bash\n"; + stream << "VPATH = ./\n"; + stream << "CC = xcpp\n"; + stream << "CLCC = xocc\n"; + stream << "ifeq ($(XDEVICE_REPO_PATH),)\n"; + PrintIndent(stream, indent); + stream << "DEVICE_REPO_OPT = \n"; + stream << "else\n"; + stream << "DEVICE_REPO_OPT = --xp prop:solution.device_repo_paths=${XDEVICE_REPO_PATH}\n"; + stream << "endif\n"; + stream << "HOST_CFLAGS += -I${XILINX_SDX}/runtime/include/1_2\n"; + stream << "HOST_LFLAGS += -L${XILINX_SDX}/runtime/lib/x86_64 -lxilinxopencl -lrt -pthread\n"; + stream << "CLCC_OPT += $(CLCC_OPT_LEVEL) ${DEVICE_REPO_OPT} --xdevice ${XDEVICE} -o ${XCLBIN} ${KERNEL_DEFS} ${KERNEL_INCS}\n"; + stream << "ifeq (${KEEP_TEMP},1)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -s\n"; + stream << "endif\n"; + stream << "ifeq (${KERNEL_DEBUG},1)\n"; + PrintIndent(stream, indent); + stream << "CLCC_OPT += -g\n"; + stream << "endif\n"; + stream << "CLCC_OPT += --kernel ${KERNEL_NAME}\n"; + stream << "OBJECTS := $(HOST_SRCS:.cpp=.o)\n"; + stream << ".PHONY: all\n"; + stream << "all: run\n"; + + stream << "host: ${HOST_EXE_DIR}/${HOST_EXE}\n"; + stream << "xbin_cpu_em:\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=cpu_emu xbin -f sdaccel.mk\n"; + stream << "xbin_hw_em:\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw_emu xbin -f sdaccel.mk\n"; + stream << "xbin_hw :\n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw xbin -f sdaccel.mk\n"; + stream << "xbin: ${XCLBIN}\n"; + stream << "run_cpu_em: \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=cpu_emu run_em -f sdaccel.mk\n"; + stream << "run_hw_em: \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw_emu 
run_em -f sdaccel.mk\n"; + stream << "run_hw : \n"; + PrintIndent(stream, indent); + stream << "make SDA_FLOW=hw run_hw_int -f sdaccel.mk\n"; + stream << "run_em: xconfig host xbin\n"; + PrintIndent(stream, indent); + stream << "XCL_EMULATION_MODE=true ${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n"; + stream << "run_hw_int : host xbin_hw\n"; + PrintIndent(stream, indent); + stream << "source ${BOARD_SETUP_FILE};${HOST_EXE_DIR}/${HOST_EXE} ${HOST_ARGS}\n"; + stream << "estimate : \n"; + PrintIndent(stream, indent); + stream << "${CLCC} -c -t hw_emu --xdevice ${XDEVICE} --report estimate ${KERNEL_SRCS}\n"; + stream << "xconfig : emconfig.json\n"; + stream << "emconfig.json :\n"; + PrintIndent(stream, indent); + stream << "emconfigutil --xdevice ${XDEVICE} ${DEVICE_REPO_OPT} --od .\n"; + stream << "${HOST_EXE_DIR}/${HOST_EXE} : ${OBJECTS}\n"; + PrintIndent(stream, indent); + stream << "${CC} ${HOST_LFLAGS} ${OBJECTS} -o $@\n"; + stream << "${XCLBIN}:\n"; + PrintIndent(stream, indent); + stream << "${CLCC} ${CLCC_OPT} ${KERNEL_SRCS}\n"; + stream << "%.o: %.cpp\n"; + PrintIndent(stream, indent); + stream << "${CC} ${HOST_CFLAGS} -c $< -o $@\n"; + stream << "clean:\n"; + PrintIndent(stream, indent); + stream << "${RM} -rf ${HOST_EXE} ${OBJECTS} ${XCLBIN} emconfig.json _xocc_${XCLBIN_NAME}_*.dir .Xil\n"; + stream << "cleanall: clean\n"; + PrintIndent(stream, indent); + stream << "${RM} -rf *.xclbin sdaccel_profile_summary.* _xocc_* TempConfig *.log *.jou\n"; + + stream.close(); +} + +void GenHostCode(TVMArgs& args, + const std::vector& shmids, + const std::vector& arg_types, + LoweredFunc func, + std::string test_file) { + int indent = 0; + std::ofstream stream; + stream.open("host.cpp"); + indent += 2; + + stream << "#define CL_HPP_CL_1_2_DEFAULT_BUILD\n"; + stream << "#define CL_HPP_TARGET_OPENCL_VERSION 120\n"; + stream << "#define CL_HPP_MINIMUM_OPENCL_VERSION 120\n"; + stream << "#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1\n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + // stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#include \n"; + stream << "#pragma once\n"; + stream << "\n\n"; + + // stream << test_file; + stream << "\n\n"; + + stream << "int main(int argc, char** argv) { \n"; + + stream << "#if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)\n"; + indent += 2; + stream << " #define STR_VALUE(arg) #arg\n"; + stream << " #define GET_STRING(name) STR_VALUE(name)\n"; + stream << " #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)\n"; + stream << "#endif\n"; + + // get the krnl code + PrintIndent(stream, indent); + stream << "char* xclbinFilename = argv[1];\n"; + stream << "\n"; + + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "std::vector<" << Type2Str(arg_types[i]); + stream << "> "; + stream << "source_" << i << "("; + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + if (j == arr->ndim-1) { + stream << arr->shape[j] << ")"; + } else { + // stream << " * " << arr->shape[j] << ")"; + stream << arr->shape[j] << " * "; + } + } + stream << ";\n"; + } + stream << "\n"; + + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "size_t vector_size_bytes_" << i; + stream << " = sizeof(" << Type2Str(arg_types[i]); + 
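// For a 64x64 float32 argument, the loop around this point emits a host-side
// line of the form (shape and index are illustrative):
//   size_t vector_size_bytes_0 = sizeof(float) * 64 * 64;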
stream << ")"; + TVMArray* arr = args[i]; + for (int j = 0;j < arr->ndim;j++) { + stream << " * " << arr->shape[j]; + } + stream << ";\n"; + } + stream << "\n"; + + for (int i = 0;i < args.size();i++ ) { + // read from the shared memory + PrintIndent(stream, indent); + stream << Type2Str(arg_types[i]) << "* "; + stream << "arg_" << i << " = "; + stream << "(" << Type2Str(arg_types[i]) << "*)"; + stream << "shmat(" << shmids[i] << ", nullptr, 0);\n"; + TVMArray* arr = args[i]; + // copy from shared mem + PrintCopy(arr, stream, indent, i); + } + + // Getting First Platform + PrintIndent(stream, indent); + stream << "std::vector platforms;\n"; + PrintIndent(stream, indent); + stream << "cl::Platform::get(&platforms);\n"; + PrintIndent(stream, indent); + stream << "cl::Platform platform = platforms[0];\n"; + stream << "\n"; + + // Getting ACCELERATOR Devices and selecting 1st such device + PrintIndent(stream, indent); + stream << "std::vector devices;\n"; + PrintIndent(stream, indent); + stream << "platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);\n"; + PrintIndent(stream, indent); + stream << "cl::Device device = devices[0];\n"; + stream << "\n"; + + // Creating Context and Command Queue for selected Device + PrintIndent(stream, indent); + stream << "cl::Context context(device);\n"; + PrintIndent(stream, indent); + stream << "cl::CommandQueue q(context, device);\n"; + stream << "\n"; + + // Loading XCL Bin into char buffer + PrintIndent(stream, indent); + stream << "std::ifstream bin_file(xclbinFilename, std::ifstream::binary);\n"; + PrintIndent(stream, indent); + stream << "bin_file.seekg (0, bin_file.end);\n"; + PrintIndent(stream, indent); + stream << "unsigned nb = bin_file.tellg();\n"; + PrintIndent(stream, indent); + stream << "bin_file.seekg (0, bin_file.beg);\n"; + PrintIndent(stream, indent); + stream << "char *buf = new char [nb];\n"; + PrintIndent(stream, indent); + stream << "bin_file.read(buf, nb);\n"; + stream << "\n"; + + // Creating Program from Binary File + PrintIndent(stream, indent); + stream << "cl::Program::Binaries bins;\n"; + PrintIndent(stream, indent); + stream << "bins.push_back({buf,nb});\n"; + PrintIndent(stream, indent); + stream << "devices.resize(1);\n"; + PrintIndent(stream, indent); + stream << "cl::Program program(context, devices, bins);\n"; + stream << "\n"; + + // Creating Kernel and Functor of Kernel + PrintIndent(stream, indent); + stream << "int err1;\n"; + PrintIndent(stream, indent); + stream << "cl::Kernel kernel(program, \"default_function\", &err1);\n"; + PrintIndent(stream, indent); + stream << "auto default_function = cl::KernelFunctor<"; + for (int i = 0;i < args.size();i++) { + if (i == args.size() - 1) { + stream << "cl::Buffer&>(kernel);\n"; + } else { + stream << "cl::Buffer&, "; + } + } + stream << "\n"; + + // Creating Buffers inside Device + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "cl::Buffer buffer_" << i; + stream << "(context, CL_MEM_READ_WRITE, vector_size_bytes_" << i << ");\n"; + } + stream << "\n"; + + // Copying input data to Device buffer from host memory + for (int i = 0;i < args.size();i++) { + PrintIndent(stream, indent); + stream << "q.enqueueWriteBuffer(buffer_" << i; + stream << ", CL_TRUE, 0, vector_size_bytes_" << i; + stream << ", source_" << i << ".data());\n"; + } + stream << "\n"; + + // Running Kernel + PrintIndent(stream, indent); + stream << func->name << "("; + stream << "cl::EnqueueArgs(q, cl::NDRange(1,1,1), cl::NDRange(1,1,1)),"; + for (int i = 0; i < 
args.size(); i++) { + stream << "buffer_" << i; + if (i != args.size()-1) + stream << ", "; + } + stream << ");\n"; + PrintIndent(stream, indent); + stream << "q.finish();\n"; + stream << "\n"; + + // Copying Device result data to Host memory + for (int i = 0;i < args.size(); i++) { + PrintIndent(stream, indent); + stream << "q.enqueueReadBuffer(buffer_" << i; + stream << ", CL_TRUE, 0, vector_size_bytes_" << i; + stream << ", source_" << i << ".data());\n"; + } + stream << "\n"; + + // copy to shared mem + for (int i = 0;i < args.size();i++) { + if (args[i].type_code() == kArrayHandle) { + TVMArray* arr = args[i]; + PrintCopyBack(arr, stream, indent, i); + PrintIndent(stream, indent); + stream << "shmdt("; + stream << "arg_" << i << ");\n"; + } + } + + stream << "}\n"; + stream.close(); +} +} // namespace + + +class SDAccelModuleNode final : public ModuleNode { + public: + SDAccelModuleNode(LoweredFunc func, std::string test_file) + : func_(func), test_file_(test_file) {} + + const char* type_key() const { + return "sdaccel_sw_emu"; + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv){ + + if (args.size() != (int)func_->args.size()) + LOG(FATAL) << "The function should take in " << func_->args.size() + << " inputs but get " << args.size(); + std::vector arg_sizes; + std::vector arg_types; + std::vector shmids; + CollectArgInfo(args, func_, arg_sizes, arg_types); + GenSharedMem(args, shmids, arg_sizes); + LOG(CLEAN) << "Creating a Host file for SDAccel Runtime ..."; + GenHostCode(args, shmids, arg_types, func_, test_file_); + + LOG(CLEAN) << "Creating a Common folder for common.mk ..."; + system("mkdir common"); + GenCommonFile(); + + LOG(CLEAN) << "Creating a Makfile for compling the SDAccel OpenCL Code ..."; + GenMakFile(); + // TODO: find a better way to do the following + LOG(CLEAN) << "Compiling the generated SDAccel OpenCL Code ..."; + // system("make -f ./sdaccel.mk run_cpu_em"); + LOG(CLEAN) << "Running SDAccel OpenCL Software Simulation ..."; + LOG(CLEAN) << "Finished SDAccel OpenCL Software Simulation ..."; + // system("make -f sdaccel.mk cleanall"); + FreeSharedMem(args, shmids, arg_sizes); + }); + } + + private: + LoweredFunc func_; + std::string test_file_; +}; + +Module CreateSDAccelModule(LoweredFunc func, + std::string code) { + std::shared_ptr n = + std::make_shared(func, code); + + return Module(n); +} + +} // namespace runtime +} // namespace TVM diff --git a/tvm/src/codegen/opencl/sdaccel_module.h b/tvm/src/codegen/opencl/sdaccel_module.h new file mode 100644 index 000000000..01f361dba --- /dev/null +++ b/tvm/src/codegen/opencl/sdaccel_module.h @@ -0,0 +1,18 @@ +#ifndef SDACCEL_MODULE_H +#define SDACCEL_MODULE_H + +#include +#include +#include "../build_common.h" + +namespace TVM { +namespace runtime { + +Module CreateSDAccelModule( + LoweredFunc func, + std::string code); + +} // namespace runtime +} // namespace TVM + +#endif diff --git a/tvm/src/codegen/ppac/build_rv64_ppac.cc b/tvm/src/codegen/ppac/build_rv64_ppac.cc new file mode 100644 index 000000000..c14a1cdf3 --- /dev/null +++ b/tvm/src/codegen/ppac/build_rv64_ppac.cc @@ -0,0 +1,32 @@ +/* + * \file build_rv64_ppac.cc + */ + +#include "./codegen_rv64_ppac.h" +#include "../build_common.h" + +namespace TVM{ +namespace codegen{ + +std::string BuildRV64PPAC(Array funcs) { + CodeAnalysMerlinC ca; + CodeGenRV64PPAC cg; + for (LoweredFunc f: funcs) { + ca.AddFunction(f); + str2tupleMap map_arg_type; + 
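+    // ca.Finish() is assumed to hand back the argument name/type tuples
+    // gathered by the MerlinC analysis pass; these drive the pointer-typed
+    // argument printing in CodeGenRV64PPAC::AddFunction below.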
map_arg_type = ca.Finish();
+    cg.AddFunction(f, map_arg_type);
+  }
+  std::string code = cg.Finish();
+
+  LOG(WARNING) << "RV64_PPAC backend has no runtime; returning kernel code";
+  return code;
+}
+
+TVM_REGISTER_API("codegen.build_rv64_ppac")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildRV64PPAC(args[0]);
+  });
+
+} // namespace codegen
+} // namespace TVM
\ No newline at end of file
diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.cc b/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
new file mode 100644
index 000000000..1fd5e2b6e
--- /dev/null
+++ b/tvm/src/codegen/ppac/codegen_rv64_ppac.cc
@@ -0,0 +1,202 @@
+/*
+ * \file codegen_rv64_ppac.cc
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "./codegen_rv64_ppac.h"
+#include "../build_common.h"
+
+namespace TVM {
+namespace codegen {
+
+void CodeGenRV64PPAC::AddFunction(LoweredFunc f,
+                                  str2tupleMap<std::string, Type> map_arg_type) {
+  // Clear previous generated state
+  this->InitFuncState(f);
+  // Register alloc buffer type
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+  // Write entry function name
+  this->stream << "void " << f->name << "(";
+  // Write arguments
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) this->stream << ", ";
+    if (map_arg_type.find(vid) == map_arg_type.end()) {
+      LOG(WARNING) << vid << " type not found\n";
+      PrintType(v.type(), this->stream);
+      this->stream << ' ' << vid;
+    }
+    else {
+      auto arg = map_arg_type[vid];
+      PrintType(std::get<1>(arg), this->stream);
+      this->stream << "*";
+      this->stream << ' ' << std::get<0>(arg);
+    }
+  }
+  stream << ") {\n";
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+}
+
+void CodeGenRV64PPAC::VisitStmt_(const For* op) {
+  std::string func_name;
+  bool is_ppac_func = false;
+  uint8_t i = 0;
+  for (auto key : op->annotate_keys) {
+    if (auto str = key.as<StringImm>()) {
+      if (str->value == "_ppac_func_name") {
+        auto name = op->annotate_values[i].as<StringImm>();
+        func_name = name->value;
+        is_ppac_func = true;
+        break;
+      }
+    }
+    ++i;
+  }
+  if (is_ppac_func) {
+    // scan along the annotate list to find parameters
+    std::string ret, arg0, arg1;
+    int batch_num, in_block_num, out_channel_num;
+    i = 0;
+    uint8_t param_num = 0;
+    for (auto key : op->annotate_keys) {
+      if (auto str = key.as<StringImm>()) {
+        if (str->value == "_ret") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          ret = v->value;
+          ++param_num;
+        } else if (str->value == "_arg0") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          arg0 = v->value;
+          ++param_num;
+        } else if (str->value == "_arg1") {
+          auto v = op->annotate_values[i].as<StringImm>();
+          arg1 = v->value;
+          ++param_num;
+        } else if (str->value == "_batch_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          batch_num = v->value;
+          ++param_num;
+        } else if (str->value == "_in_block_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          in_block_num = v->value;
+          ++param_num;
+        } else if (str->value == "_out_channel_num") {
+          auto v = op->annotate_values[i].as<IntImm>();
+          out_channel_num = v->value;
+          ++param_num;
+        }
+      }
+      ++i;
+    }
+    if (param_num != 6) {
+      LOG(FATAL) << "PPAC function call needs exactly 6 parameters but found " << param_num;
+    }
+    // print ppac function call
+    PrintIndent();
+    stream << func_name << "("
+           << ret << ", "
+           << arg0 << ", "
+           << arg1 << ", "
+           << batch_num << ", "
+           << in_block_num << ", "
+           << out_channel_num
+           << ");\n";
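+    // For reference, the statement above prints one runtime call of the
+    // form (names and values illustrative, not taken from the patch):
+    //   ppac_gemm(ret_buf, in_buf0, in_buf1, 8, 4, 16);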
return; + } + CodeGenC::VisitStmt_(op); +} + +void CodeGenRV64PPAC::VisitStmt_(const LetStmt* op) { + std::string value = PrintExpr(op->value); + // Skip the argument retrieving assign statement + std::string vid = AllocVarID(op->var.get()); + if (op->var.type() != Handle() && + value.find("TVMArray") == std::string::npos && + value.find("arg") != 0) { + PrintIndent(); + PrintType(op->var.type(), this->stream); + this->stream << ' ' + << vid + << " = " << value << ";\n"; + } + PrintStmt(op->body); +} + +void CodeGenRV64PPAC::VisitStmt_(const IfThenElse* op) { + std::string cond = PrintExpr(op->condition); + // Skip the buffer data checking + if (std::regex_match(cond, std::regex("!\\((arg)(.+)(== NULL)\\)"))) + return ; + PrintIndent(); + if (cond[0] == '(' && cond[cond.length() - 1] == ')') { + stream << "if " << cond << " {\n"; + } else { + stream << "if (" << cond << ") {\n"; + } + int then_scope = BeginScope(); + PrintStmt(op->then_case); + this->EndScope(then_scope); + if (op->else_case.defined()) { + PrintIndent(); + stream << "} else {\n"; + int else_scope = BeginScope(); + PrintStmt(op->else_case); + this->EndScope(else_scope); + } + PrintIndent(); + stream << "}\n"; +} + +void CodeGenRV64PPAC::PrintType(Type t, std::ostream& os) { + CHECK_EQ(t.lanes(), 1) + << "do not support vector types"; + if (t.is_uint() || t.is_int()) { + if (t.is_uint()) { + if (t.bits() <= 8) { + os << "uint8_t"; return; + } else if (t.bits() <= 16) { + os << "uint16_t"; return; + } else if (t.bits() <= 32) { + os << "uint32_t"; return; + } else if (t.bits() <= 64) { + os << "uint64_t"; return; + } else { + LOG(WARNING) << "Casting type " << t << " to uint64_t"; + os << "uint64_t"; + return; + } + } + else if (t.is_int()) { + if (t.bits() <= 8) { + os << "int8_t"; return; + } else if (t.bits() <= 16) { + os << "int16_t"; return; + } else if (t.bits() <= 32) { + os << "int32_t"; return; + } else if (t.bits() <= 64) { + os << "int64_t"; return; + } else { + LOG(WARNING) << "Casting type " << t << " to int64_t"; + os << "int64_t"; + return; + } + } + } + os << t; +} + +} //namespace codegen +} //namespace TVM \ No newline at end of file diff --git a/tvm/src/codegen/ppac/codegen_rv64_ppac.h b/tvm/src/codegen/ppac/codegen_rv64_ppac.h new file mode 100644 index 000000000..881bdea05 --- /dev/null +++ b/tvm/src/codegen/ppac/codegen_rv64_ppac.h @@ -0,0 +1,28 @@ +/* + * \file codegen_rv64_ppac.h + */ + +#ifndef TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ +#define TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ + +#include +#include +#include "../codegen_c.h" +#include "../merlinc/codeanalys_merlinc.h" + +namespace TVM { +namespace codegen { + +class CodeGenRV64PPAC : public CodeGenC { + public: + void AddFunction(LoweredFunc f, str2tupleMap map_arg_type); + void PrintType(Type t, std::ostream& os) override; + void VisitStmt_(const LetStmt* op) override; + void VisitStmt_(const IfThenElse* op) override; + void VisitStmt_(const For* op) override; +}; + +} // namespace codegen +} // namespace TVM + +#endif //TVM_CODEGEN_CODEGEN_RV64_PPAC_H_ \ No newline at end of file diff --git a/tvm/src/lang/ir.cc b/tvm/src/lang/ir.cc index 3589de195..c88f8ea94 100644 --- a/tvm/src/lang/ir.cc +++ b/tvm/src/lang/ir.cc @@ -149,6 +149,8 @@ TVM_REGISTER_NODE_TYPE(Quantize); TVM_REGISTER_NODE_TYPE(KernelDef); TVM_REGISTER_NODE_TYPE(KernelExpr); TVM_REGISTER_NODE_TYPE(KernelStmt); +TVM_REGISTER_NODE_TYPE(StreamStmt); +TVM_REGISTER_NODE_TYPE(StreamExpr); TVM_REGISTER_NODE_TYPE(Return); TVM_REGISTER_NODE_TYPE(Break); TVM_REGISTER_NODE_TYPE(While); diff --git 
a/tvm/src/pass/ir_mutator.cc b/tvm/src/pass/ir_mutator.cc index ec67aa314..89485e723 100644 --- a/tvm/src/pass/ir_mutator.cc +++ b/tvm/src/pass/ir_mutator.cc @@ -202,6 +202,15 @@ Stmt IRMutator::Mutate_(const Store *op, const Stmt& s) { } } +Stmt IRMutator::Mutate_(const StreamStmt *op, const Stmt& s) { + Expr value = this->Mutate(op->value); + if (value.same_as(op->value)) { + return s; + } else { + return StreamStmt::make(op->buffer_var, value, op->stream_type, op->depth); + } +} + Stmt IRMutator::Mutate_(const Provide* op, const Stmt& s) { auto new_args = MutateArray(op->args, this); auto new_value = this->Mutate(op->value); @@ -321,7 +330,8 @@ Stmt IRMutator::Mutate_(const KernelDef *op, const Stmt &s) { if (body.same_as(op->body) && ret_void.same_as(op->ret_void)) { return s; } else { - return KernelDef::make(op->args, body, ret_void, op->ret_type, op->name); + return KernelDef::make(op->args, op->api_args, op->api_types, + body, ret_void, op->ret_type, op->name, op->channels); } } @@ -402,6 +412,7 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_stmt) .DISPATCH_TO_MUTATE_STMT(Prefetch) .DISPATCH_TO_MUTATE_STMT(KernelDef) .DISPATCH_TO_MUTATE_STMT(KernelStmt) +.DISPATCH_TO_MUTATE_STMT(StreamStmt) .DISPATCH_TO_MUTATE_STMT(Return) .DISPATCH_TO_MUTATE_STMT(Break) .DISPATCH_TO_MUTATE_STMT(While) @@ -430,6 +441,10 @@ Expr IRMutator::Mutate_(const Load *op, const Expr& e) { } } +Expr IRMutator::Mutate_(const StreamExpr *op, const Expr& e) { + return e; +} + Expr IRMutator::Mutate_(const Let *op, const Expr& e) { Expr value = this->Mutate(op->value); Expr body = this->Mutate(op->body); @@ -665,6 +680,7 @@ TVM_STATIC_IR_FUNCTOR(IRMutator, vtable_expr) .DISPATCH_TO_MUTATE_EXPR(SetBit) .DISPATCH_TO_MUTATE_EXPR(SetSlice) .DISPATCH_TO_MUTATE_EXPR(Quantize) +.DISPATCH_TO_MUTATE_EXPR(StreamExpr) .DISPATCH_TO_MUTATE_EXPR(KernelExpr); } // namespace ir diff --git a/tvm/src/pass/ir_visitor.cc b/tvm/src/pass/ir_visitor.cc index 160cb906e..6346c6262 100644 --- a/tvm/src/pass/ir_visitor.cc +++ b/tvm/src/pass/ir_visitor.cc @@ -252,6 +252,13 @@ void IRVisitor::Visit_(const KernelStmt *op) { } } +void IRVisitor::Visit_(const StreamStmt *op) { + this->Visit(op->value); +} + +void IRVisitor::Visit_(const StreamExpr *op) { +} + void IRVisitor::Visit_(const Return *op) { this->Visit(op->value); } @@ -338,6 +345,8 @@ TVM_STATIC_IR_FUNCTOR(IRVisitor, vtable) .DISPATCH_TO_VISIT(KernelDef) .DISPATCH_TO_VISIT(KernelExpr) .DISPATCH_TO_VISIT(KernelStmt) +.DISPATCH_TO_VISIT(StreamStmt) +.DISPATCH_TO_VISIT(StreamExpr) .DISPATCH_TO_VISIT(Return) .DISPATCH_TO_VISIT(Break) .DISPATCH_TO_VISIT(While) diff --git a/tvm/src/pass/split_host_device.cc b/tvm/src/pass/split_host_device.cc index 534e0b695..fdcd0c56f 100644 --- a/tvm/src/pass/split_host_device.cc +++ b/tvm/src/pass/split_host_device.cc @@ -81,6 +81,14 @@ class IRUseDefAnalysis : public IRMutator { return IRMutator::Mutate_(op, s); } + Stmt Mutate_(const StreamStmt *op, const Stmt& s) final { + if (!def_count_.count(op->buffer_var.get())) { + def_count_[op->buffer_var.get()] = 0; + use_count_[op->buffer_var.get()] = 0; + } + return IRMutator::Mutate_(op, s); + } + Expr Mutate_(const Let *op, const Expr& e) final { this->HandleDef(op->var.get()); Expr body = this->Mutate(op->body); @@ -109,6 +117,14 @@ class IRUseDefAnalysis : public IRMutator { return IRMutator::Mutate_(op, e); } + Expr Mutate_(const StreamExpr *op, const Expr& e) final { + if (!def_count_.count(op->buffer_var.get())) { + def_count_[op->buffer_var.get()] = 0; + use_count_[op->buffer_var.get()] = 0; + } + 
return IRMutator::Mutate_(op, e); + } + Stmt Mutate_(const KernelDef *op, const Stmt& s) { for (auto arg : op->args) { this->HandleDef(arg.get()); diff --git a/tvm/src/pass/stream_inference.cc b/tvm/src/pass/stream_inference.cc new file mode 100644 index 000000000..ec18b1871 --- /dev/null +++ b/tvm/src/pass/stream_inference.cc @@ -0,0 +1,345 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file remove_no_op.cc + * \brief Remove no op from the stmt + */ +#include +#include +#include +#include + +namespace TVM { +namespace ir { + +// use/def analysis to capture host xcel deps +class StreamUseDefAnalysis : public IRMutator { + public: + Stmt Mutate_(const AttrStmt *op, const Stmt& s) final { + if (op->attr_key == attr::device_scope) { + if (op->value.as()->value == "fpga") + host_scope_ = false; + return IRMutator::Mutate_(op, s); + } else { + return IRMutator::Mutate_(op, s); + } + } + + Stmt Mutate_(const LetStmt *op, const Stmt& s) final { + this->HandleDef(op->var.get()); + Stmt body = this->Mutate(op->body); + Expr value = this->Mutate(op->value); + if (body.same_as(op->body) && + value.same_as(op->value)) { + return s; + } else { + return LetStmt::make(op->var, value, body); + } + } + + Stmt Mutate_(const For *op, const Stmt& s) final { + this->HandleDef(op->loop_var.get()); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const Allocate *op, const Stmt& s) final { + this->HandleDef(op->buffer_var.get()); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const Store *op, const Stmt& s) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, s); + } + + Stmt Mutate_(const StreamStmt *op, const Stmt& s) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, s); + } + + Expr Mutate_(const Let *op, const Expr& e) final { + this->HandleDef(op->var.get()); + Expr body = this->Mutate(op->body); + Expr value = this->Mutate(op->value); + if (body.same_as(op->body) && + value.same_as(op->value)) { + return e; + } else { + return Let::make(op->var, value, body); + } + } + + Expr Mutate_(const Variable *op, const Expr& e) final { + this->HandleUse(e); + return IRMutator::Mutate_(op, e); + } + + Expr Mutate_(const Load *op, const Expr& e) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, e); + } + + Expr Mutate_(const StreamExpr *op, const Expr& e) final { + this->HandleUse(op->buffer_var); + return IRMutator::Mutate_(op, e); + } + + Stmt Mutate_(const KernelDef *op, const Stmt& s) { + for (auto arg : op->args) { + this->HandleDef(arg.get()); + } + Stmt body = this->Mutate(op->body); + for (auto arg : op->args) { + xcel_def_count_[arg.get()] = 0; + } + return s; + } + + void HandleDef(const Variable* v) { + if (host_scope_) { + CHECK(!host_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!host_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + host_use_count_[v] = 0; + host_def_count_[v] = 1; + } else { + CHECK(!xcel_def_count_.count(v)) + << "variable " << v->name_hint + << " has already been defined, the Stmt is not SSA"; + CHECK(!xcel_use_count_.count(v)) + << "variable " << v->name_hint + << " has been used before definition!"; + xcel_use_count_[v] = 0; + xcel_def_count_[v] = 1; + } + } + + void HandleUse(const Expr& v) { + CHECK(v.as()); + Var var(v.node_); + if (host_scope_) { + auto it = host_use_count_.find(var.get()); + if (it != host_use_count_.end()) { + if (it->second >= 0) { + ++it->second; + 
}
+      } else {
+        host_undefined_.push_back(var);
+        host_use_count_[var.get()] = -1;
+      }
+    } else {
+      auto it = xcel_use_count_.find(var.get());
+      if (it != xcel_use_count_.end()) {
+        if (it->second >= 0) {
+          ++it->second;
+        }
+      } else {
+        xcel_undefined_.push_back(var);
+        xcel_use_count_[var.get()] = -1;
+      }
+    }
+  }
+
+  bool host_scope_{true};
+  Array<Var> host_undefined_;
+  Array<Var> xcel_undefined_;
+  std::unordered_map<const Variable*, int> host_use_count_;
+  std::unordered_map<const Variable*, int> host_def_count_;
+  std::unordered_map<const Variable*, int> xcel_use_count_;
+  std::unordered_map<const Variable*, int> xcel_def_count_;
+};
+
+
+class StreamMutator : public IRMutator {
+ public:
+  explicit StreamMutator(int bus_bandwidth) {
+    bus_bandwidth_ = bus_bandwidth;
+  }
+  // move device attr to allocate level
+  Stmt Mutate_(const AttrStmt* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    // if (op->attr_key == attr::device_scope)
+    //   return stmt.as<AttrStmt>()->body;
+    return stmt;
+  }
+
+  Stmt Mutate_(const For* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<For>();
+    auto extent = op->extent.as<IntImm>()->value;
+    auto min = op->min.as<IntImm>()->value;
+    // mutate sender: split and block inner loop
+    if (auto stream_op = op->body.as<StreamStmt>()) {
+      if (extent - min > bus_bandwidth_) {
+        LOG(WARNING) << "loop extent exceeds the bus bandwidth";
+      } else {
+      }
+    // mutate receiver: (StreamExpr + For(Store = GetSlice))
+    } else if (auto store_op = op->body.as<Store>()) {
+      if (store_op->value.as<StreamExpr>() == nullptr) return stmt;
+      if (extent - min > bus_bandwidth_) {
+        LOG(WARNING) << "loop extent exceeds the bus bandwidth";
+      } else {
+        return stmt;
+        // allocate intermediate buffer
+        VarExpr new_var(store_op->buffer_var.get()->name_hint + "_save");
+        Expr new_load = Load::make(store_op->buffer_var.type(), new_var, 0, const_true());
+        Stmt new_store = Store::make(store_op->buffer_var, new_load,
+                                     store_op->index, store_op->predicate);
+        Stmt new_for = For::make(op->loop_var, op->min, op->extent, op->for_type,
+                                 op->device_api, new_store);
+        // save stream data into intermediate buffer
+        Stmt read_in = Store::make(new_var, store_op->value,
+                                   Expr(0), const_true());
+        // allocate intermediate buffer
+        return Allocate::make(new_var,
+                              store_op->value.type(),
+                              {make_const(Int(bus_bandwidth_), 1)},
+                              const_true(), Block::make(read_in, new_for));
+      }
+    }
+    return stmt;
+  }
+
+  Stmt Mutate_(const StreamStmt* op, const Stmt& s) final {
+    Stmt stmt = IRMutator::Mutate_(op, s);
+    op = stmt.as<StreamStmt>();
+    const Variable* v = op->buffer_var.get();
+    stream_type_map_[v] = op->buffer_var.type();
+    return stmt;
+  }
+
+  Expr Mutate_(const StreamExpr* op, const Expr& e) final {
+    Expr expr = IRMutator::Mutate_(op, e);
+    op = expr.as<StreamExpr>();
+    const Variable* v = op->buffer_var.get();
+    stream_type_map_[v] = op->buffer_var.type();
+    return expr;
+  }
+ private:
+  int bus_bandwidth_;
+  bool is_host_{true};
+  std::unordered_map<const Variable*, Type> stream_type_map_;
+};
+
+// Mark the statement scope of each stage.
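+// StreamInferer (below) pattern-matches an Allocate whose producer block
+// carries a device_scope attribute; it then runs StreamMutator over the
+// allocate body and re-attaches the attribute above the rebuilt Allocate.
+// Roughly (IR sketch, shapes illustrative):
+//   allocate buf { produce { attr device_scope ... } }
+//     becomes  attr device_scope { allocate buf { ... } }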
+class StreamInferer : public IRMutator { + public: + explicit StreamInferer(int bus_bandwidth) { + bus_bandwidth_ = bus_bandwidth; + } + + Stmt Mutate_(const Allocate* op, const Stmt& s) final { + Stmt stmt = IRMutator::Mutate_(op, s); + op = stmt.as(); + if (auto block = op->body.as()) { + if (auto producer = block->first.as()){ + if (const AttrStmt* attr_stmt = producer->body.as()) { + if (const AttrStmt* device_attr = attr_stmt->body.as()) { + if (device_attr->attr_key == attr::device_scope) { + // mutate allocate body + StreamMutator mutator(bus_bandwidth_); + // allocate stream for host + Stmt new_body = mutator.Mutate(op->body); + Stmt new_stmt = Allocate::make(op->buffer_var, + op->type, + op->extents, + op->condition, + new_body); + return AttrStmt::make(device_attr->node, + attr::device_scope, + device_attr->value, + new_stmt); + } + } + } + } + } + return stmt; + } + + // Stmt Mutate_(const ProducerConsumer* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // return is_no_op(op->body) ? op->body : stmt; + // } + + // Stmt Mutate_(const Store* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // auto it = var_remap_.find(op->buffer_var.get()); + // if (it != var_remap_.end() && + // !it->second.same_as(op->buffer_var)) { + // CHECK(it->second.as()); + // VarExpr buf_var(it->second.node_); + // if (has_stencil_) outputs_.insert(buf_var); + // return Store::make(buf_var, op->value, op->index, op->predicate); + // } else { + // return stmt; + // } + // } + + // Stmt Mutate_(const AttrStmt* op, const Stmt& s) final { + // if (op->attr_key == attr::realize_scope) { + // storage_scope_[op->node.get()] = op->value.as()->value; + // return this->Mutate(op->body); + // } else if (op->attr_key == attr::double_buffer_scope) { + // Operation func(op->node.node_); + // Stmt body = Mutate(op->body); + // for (int i = 0; i < func->num_outputs(); ++i) { + // TensorKey key{func, i}; + // auto it = buf_map_.find(key); + // CHECK(it != buf_map_.end()) + // << "Cannot find allocated buffer for " << key.f; + // body = AttrStmt::make( + // it->second.buffer->data, op->attr_key, op->value, body); + // } + // return body; + // } else if (op->attr_key == attr::thread_extent) { + // IterVar iv(op->node.node_); + // ThreadScope ts = ThreadScope::make(iv->thread_tag); + // curr_thread_scope_.push_back(ts); + // Stmt stmt = IRMutator::Mutate_(op, s); + // curr_thread_scope_.pop_back(); + // return stmt; + // } else if (op->attr_key == attr::buffer_bind_scope) { + + // Stmt Mutate_(const For* op, const Stmt& s) final { + // Stmt stmt = IRMutator::Mutate_(op, s); + // op = stmt.as(); + // return is_no_op(op->body) ? MakeEvaluate({op->min, op->extent}) : stmt; + // } + + private: + int bus_bandwidth_; + Stmt MakeEvaluate(Expr value) { + if (HasSideEffect(value)) { + return Evaluate::make(value); + } else { + return Evaluate::make(0); + } + } + Stmt MakeEvaluate(const Array& values) { + Stmt stmt; + for (Expr e : values) { + if (HasSideEffect(e)) { + if (stmt.defined()) { + stmt = Block::make(stmt, Evaluate::make(e)); + } else { + stmt = Evaluate::make(e); + } + } + } + return stmt.defined() ? 
stmt : Evaluate::make(0); + } +}; + +Stmt InferStream(Stmt stmt, + int bus_bandwidth) { + return StreamInferer(bus_bandwidth).Mutate(stmt); +} + +} // namespace ir +} // namespace TVM diff --git a/tvm/src/schedule/compute_primitive.h b/tvm/src/schedule/compute_primitive.h index e65885462..e7167257c 100644 --- a/tvm/src/schedule/compute_primitive.h +++ b/tvm/src/schedule/compute_primitive.h @@ -33,6 +33,14 @@ Stmt PerformComputeAt(Stmt& producer, size_t& attach_level, std::unordered_map& sub); +Stmt StreamFromProducer(Stmt& stmt, + Buffer& producer_buf, + ir::StreamType& type); + +Stmt StreamToConsumer(Stmt& stmt, + Buffer& producer_buf, + ir::StreamType& type); + Stmt UpdateIterVarAttr(Stmt& stmt, const IterVar& var, const IterVarAttrNode* node); diff --git a/tvm/src/schedule/schedule_dataflow_rewrite.cc b/tvm/src/schedule/schedule_dataflow_rewrite.cc index b2bd520e7..a7fc8ee72 100644 --- a/tvm/src/schedule/schedule_dataflow_rewrite.cc +++ b/tvm/src/schedule/schedule_dataflow_rewrite.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "./message_passing.h" #include "../pass/ir_util.h" @@ -27,12 +28,33 @@ size_t FindNodeRef(ArrayNode* array_node, const T& v) { return array_node->data.size(); } +// The replacer of cache. +class LoadReplacer : public ir::IRMutator { + public: + explicit LoadReplacer( + const std::unordered_map& vsub) + : vsub_(vsub) {} + + Expr Mutate_(const Load* op, const Expr& e) { + const Variable* var = op->buffer_var.as(); + auto it = vsub_.find(var); + if (it != vsub_.end()) + return Load::make(op->type, it->second, + op->index, op->predicate); + return e; + } + + private: + const std::unordered_map& vsub_; +}; + // The replacer of cache. class VarReplacer : public ir::IRMutator { public: explicit VarReplacer( const std::unordered_map& vsub) : vsub_(vsub) {} + Expr Mutate_(const Variable* op, const Expr& e) { auto it = vsub_.find(op); if (it != vsub_.end()) return it->second; @@ -43,6 +65,17 @@ class VarReplacer : public ir::IRMutator { const std::unordered_map& vsub_; }; +// create indices for store +Expr getIndex(std::vector indices, const Array shape) { + Expr ret = indices[0]; + Expr mul = 1; + for (size_t i = 1; i < indices.size(); i++) { + ret = Simplify(ret + indices[i] * mul); + mul = Simplify(mul * shape[i]); + } + return ret; +} + Expr InjectPredicate(const Array& predicates, Expr body) { using ir::Reduce; @@ -74,6 +107,120 @@ void ReplaceDataFlow(const Array& stages, } } +class StreamConsumer final : public IRMutator { + public: + StreamConsumer( + const std::string& target, + const ir::StreamType& type, + int channel_index) + : target_(target), type_(type), + channel_index_(channel_index) {} + + Expr Mutate_(const Load* op, const Expr& e) { + Expr index = op->index; + std::string target_name = op->buffer_var.get()->name_hint; + if (target_ == target_name) { + Array keys, values; + keys.push_back(StringImm::make("index")); + values.push_back(IntImm::make(Int(32), channel_index_)); + return StreamExpr::make(op->type, op->buffer_var, + type_, 10, keys, values); + } else { + return Load::make(op->type, op->buffer_var, + index, op->predicate); + } + } + + private: + const std::string target_; + const ir::StreamType type_; + const int channel_index_; +}; + +class StreamProducer final : public IRMutator { + public: + StreamProducer( + const std::string& target, + const ir::StreamType& type, + int channel_index) + : target_(target), type_(type), + channel_index_(channel_index) {} + + Stmt Mutate_(const Store* op, const Stmt& s) { + Expr index 
= op->index; + Expr value = this->Mutate(op->value); + std::string target_name = op->buffer_var.get()->name_hint; + if (target_name == target_) { + Array keys, values; + keys.push_back(StringImm::make("index")); + values.push_back(IntImm::make(Int(32), channel_index_)); + return StreamStmt::make(op->buffer_var, value, + type_, 10, keys, values); + } else { + return Store::make(op->buffer_var, value, + index, op->predicate); + } + } + + private: + const std::string target_; + const ir::StreamType type_; + const int channel_index_; +}; + +class KernelUpdater final : public IRMutator { + public: + static int channelCount; + KernelUpdater( + const int arg_pos, + const ir::StreamType& type, + const bool is_producer, + const bool kernel_channel) + : arg_pos_(arg_pos), type_(type), + is_producer_(is_producer), + kernel_channel_(kernel_channel) { + if (kernel_channel_) channel_index_ = getIndex(); + } + + Stmt Mutate_(const KernelDef* op, const Stmt& s) { + Stmt stmt = op->body; + // arr saves arg_pos and common channel idx + Array arr = op->channels; + CHECK(op->channels.size() % 2 == 0) + << "arg_pos, index pair number mismatch"; + arr.push_back(IntImm::make(Int(32), arg_pos_)); + arr.push_back(IntImm::make(Int(32), channel_index_)); + std::string target_ = op->args[arg_pos_].get()->name_hint; + if (is_producer_) { // mutate target load + StreamProducer mutator(target_, type_, channel_index_); + stmt = mutator.Mutate(stmt); + } else { // replace load consumer + StreamConsumer mutator(target_, type_, channel_index_); + stmt = mutator.Mutate(stmt); + } + // update kernel arg signature + return KernelDef::make(op->args, op->api_args, + op->api_types, stmt, op->ret_void, + op->ret_type, op->name, arr); + } + private: + const int arg_pos_; + const ir::StreamType type_; + const bool is_producer_; + const bool kernel_channel_; + int channel_index_{0}; + int getIndex() { + channelCount += 1; + int channel_num = channelCount; + if (channelCount % 2 == 0) + channel_num = channelCount - 1; + return channel_num; + } +}; + +// Initialize static channel count +int KernelUpdater::channelCount = 0; + class ParentStmtCollector final : public IRMutator { public: ParentStmtCollector( @@ -117,6 +264,369 @@ class ParentStmtCollector final : public IRMutator { const IterVar& axis_; }; +// initialize static split bound +int Schedule::split_bound = 0; + +// stream buffer data to kernel stage +void Schedule::to_stage(const Tensor& target, + /*kernel def stage*/ Stage dest, + /*position index*/int arg_pos, + StreamType stream_type, + int channel_depth, + std::string name) { + Stage target_stage = (*this)[target]; + Buffer target_buffer; + + // target stage as kernel def operator + if (const ExternOpNode* op = target_stage->op.as()) { + target_buffer = op->output_placeholders[0]; + // remove the receiver buffer (keep the device scope) + const AttrStmt* attr = op->body.as(); + Stmt scope_attr = AttrStmt::make(attr->node, attr->attr_key, + attr->value, Evaluate::make(0)); + target_stage->op = ExternOpNode::make(op->name, + "", + Array(), + op->inputs, + op->input_placeholders, + op->output_placeholders, + scope_attr); + // update dest stage body for data stream in + const ExternOpNode* destOp = dest->op.as(); + KernelUpdater mutator(arg_pos, stream_type, + /*is producer*/false, + /*inter module channel*/false); + auto new_body = mutator.Mutate(destOp->body); + dest->op = ExternOpNode::make(destOp->name, destOp->tag, + destOp->axis, destOp->inputs, + destOp->input_placeholders, + Array(), + new_body); + } +} + +// stream 
data between hardware modules
+void Schedule::stream_to(const Tensor& target,
+                         Stage dest,
+                         Stage source,
+                         StreamType stream_type,
+                         int channel_depth,
+                         std::string new_name) {
+  Stage target_stage = (*this)[target];
+  std::vector<Stage> consumers;
+  size_t num_stage = (*this)->stages.size();
+  Buffer target_buffer;
+  std::unordered_map<Stage, size_t> pos;
+  const ExternOpNode* destOp = dest->op.as<ExternOpNode>();
+  const ExternOpNode* srcOp = source->op.as<ExternOpNode>();
+
+  // update kernel def and scope
+  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
+  bool is_placeholder = op ? true : false;
+  if (is_placeholder) {
+    for (size_t i = 0; i < num_stage; i++) {
+      Stage s = (*this)->stages[i];
+      // name matching to locate kernels
+      if (const ExternOpNode* op = s->op.as<ExternOpNode>()) {
+        for (size_t j = 0; j < op->inputs.size(); j++) {
+          if (target == op->inputs[j]) {
+            target_buffer = op->input_placeholders[j];
+            consumers.push_back(s);
+            // record streamed data pos in kernel call
+            if (std::regex_match(op->name,
+                    std::regex(destOp->name + "(\\d)")))
+              pos[dest] = j;
+            else if (std::regex_match(op->name,
+                         std::regex(srcOp->name + "(\\d)")))
+              pos[source] = j;
+            break;
+          }
+        }
+      }
+    }
+  } else { // only consumed by self stage
+    const ExternOpNode* op = target_stage->op.as<ExternOpNode>();
+    target_buffer = op->output_placeholders[0];
+    consumers.push_back(target_stage);
+  }
+  // mutator (is_producer false, kernel_channel true)
+  KernelUpdater destMutator(0, //target_buffer->name,
+                            stream_type, false, true);
+  // mutate kernel def and replace loads/stores on the target
+  dest->op = ExternOpNode::make(destOp->name,
+                                destOp->tag,
+                                destOp->axis,
+                                destOp->inputs,
+                                destOp->input_placeholders,
+                                Array<Buffer>(),
+                                destMutator.Mutate(destOp->body));
+  // mutator (is_producer true, kernel_channel true)
+  KernelUpdater srcMutator(0, //target_buffer->name,
+                           stream_type, true, true);
+  source->op = ExternOpNode::make(srcOp->name,
+                                  srcOp->tag,
+                                  srcOp->axis,
+                                  srcOp->inputs,
+                                  srcOp->input_placeholders,
+                                  Array<Buffer>(),
+                                  srcMutator.Mutate(srcOp->body));
+  // update kernel call ops
+  for (auto s : consumers) {
+    const ExternOpNode* op = s->op.as<ExternOpNode>();
+    Stmt body = AttrStmt::make(VarExpr(),
+                               "device_scope",
+                               StringImm::make("fpga"),
+                               op->body);
+    // do not allocate a buffer for the kernel call
+    s->op = ExternOpNode::make(op->name,
+                               op->tag,
+                               op->axis,
+                               op->inputs,
+                               op->input_placeholders,
+                               Array<Buffer>(),
+                               body);
+  }
+}
+
+// move data to device
+Tensor Schedule::move_to(const Tensor& target,
+                         DeviceType device_type,
+                         StreamType stream_type,
+                         int channel_depth,
+                         std::string new_name) {
+  Stage target_stage = (*this)[target];
+  std::vector<Stage> consumers;
+  size_t num_stage = (*this)->stages.size();
+  size_t min_pos = num_stage;
+  ArrayNode* stages = (*this)->stages.CopyOnWrite();
+  Buffer target_buffer;
+
+  // create producer and consumer stages for placeholder
+  const PlaceholderOpNode* op = target_stage->op.as<PlaceholderOpNode>();
+  bool is_placeholder = op ?
true : false; + if (is_placeholder) { + min_pos = 0; + for (size_t i = 0; i < num_stage; i++) { + Stage s = (*this)->stages[i]; + if (const ExternOpNode* op = s->op.as()) { + for (size_t j = 0; j < op->inputs.size(); j++) { + if (target == op->inputs[j]) { + target_buffer = op->input_placeholders[j]; + consumers.push_back(s); + break; + } + } + } + } + } else { // move data generated by extern op + min_pos = FindNodeRef(stages, target_stage) + 1; + const ExternOpNode* op = target_stage->op.as(); + target_buffer = op->output_placeholders[0]; + for (size_t i = 0; i < num_stage; i++) { + Stage s = (*this)->stages[i]; + if (const ExternOpNode* stage_op = s->op.as()) { + for (size_t j = 0; j < stage_op->inputs.size(); j++) { + if (op->output_placeholders[0] == stage_op->input_placeholders[j]) { + consumers.push_back(s); + break; + } + } + } + } + } + + // create sender and write into streaming channel + Array consumer_inputs; + Array consumer_input_placeholders; + Array consumer_output_placeholders; + std::string consumer_name = target_buffer->name + ".stream_send"; + Buffer consumer_buffer = BufferNode::make(Var(consumer_name, Handle()), + target->dtype, + target->shape, + Array(), + Expr(), + consumer_name, + "", 0, 0); + consumer_inputs.push_back(target); + consumer_input_placeholders.push_back(target_buffer); + consumer_output_placeholders.push_back(consumer_buffer); + + // create statement index + std::vector csm_indices; + std::vector csm_loop_vars; + for (size_t i = 0; i < target->shape.size(); i++) { + VarExpr iter(target_buffer->name + std::to_string(i)); + csm_indices.push_back(iter); + csm_loop_vars.push_back(iter); + } + Expr csm_index = getIndex(csm_indices, target->shape); + Expr load_expr = Load::make(target->dtype, + target_buffer->data, + csm_index, + UIntImm::make(UInt(1), 1)); + Stmt consumer_body = StreamStmt::make(consumer_buffer->data, + load_expr, + stream_type, + channel_depth); + + Expr sender_scope, receiver_scope; + size_t consumer_pos = min_pos; + switch (device_type) { + case DeviceType::CPU: + consumer_pos = num_stage; + sender_scope = StringImm::make("fpga"); + receiver_scope = StringImm::make("cpu"); + break; + case DeviceType::FPGA: + sender_scope = StringImm::make("cpu"); + receiver_scope = StringImm::make("fpga"); + break; + case DeviceType::GPU: + sender_scope = StringImm::make("cpu"); + receiver_scope = StringImm::make("gpu"); + break; + } + + for (size_t j = 0; j < target->shape.size(); j++) { + consumer_body = For::make( + VarExpr(csm_loop_vars[j]), + 0, target->shape[j], + ForType::Serial, + DeviceAPI::None, + consumer_body); + } + + consumer_body = AttrStmt::make( + consumer_buffer->data, + "device_scope", sender_scope, consumer_body); + + // create new stage and return stream tensors + // auto n = std::make_shared(); + // n->name = consumer_name; + // n->body = consumer_body; + // n->inputs = consumer_inputs; + // n->input_placeholders = consumer_input_placeholders; + // n->output_placeholders = consumer_output_placeholders; + // Operation consumer_op(n); + + Operation consumer_op = ExternOpNode::make(consumer_name, + "", + Array(), + consumer_inputs, + consumer_input_placeholders, + consumer_output_placeholders, + consumer_body); + Stage consumer_stage = Stage(consumer_op); + // insert sender before bound for (host,xcel <- host) case + if (device_type == DeviceType::FPGA) { + if (split_bound == 0) { + split_bound = consumer_pos + 1; + } else { // insert host sender before bound + consumer_pos = split_bound; + split_bound += 1; + } + } + 
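+  // splice the sender stage into the stage list at consumer_pos;
+  // split_bound (maintained above) is read here as marking the host/xcel
+  // boundary, so later move_to() calls keep host-side senders grouped
+  // before it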
stages->data.insert(stages->data.begin() + consumer_pos, consumer_stage.node_);
+  (*this)->stage_map.Set(consumer_op, consumer_stage);
+
+  // build producer (receiver) stage which takes in data from the streaming
+  // channel and provides data to the original consumers
+  Array<Tensor> producer_inputs;
+  Array<Buffer> producer_input_placeholders;
+  Array<Buffer> producer_output_placeholders;
+  std::string producer_name = target_buffer->name + ".stream_recv";
+  Buffer producer_buffer = BufferNode::make(Var(producer_name, Handle()),
+                                            target->dtype,
+                                            target->shape,
+                                            Array<Expr>(),
+                                            Expr(),
+                                            producer_name,
+                                            "", 0, 0);
+  // producer_inputs.push_back(consumer_op.output(0));
+  // producer_input_placeholders.push_back(consumer_buffer);
+  producer_output_placeholders.push_back(producer_buffer);
+  // streaming producer tensor reading from placeholder
+  Expr stream = StreamExpr::make(target->dtype,
+                                 consumer_buffer->data,
+                                 stream_type,
+                                 channel_depth);
+  // create for loops for tensor init
+  std::vector<Expr> indices;
+  std::vector<VarExpr> loop_vars;
+  for (size_t i = 0; i < target->shape.size(); i++) {
+    VarExpr iter(target_buffer->name + std::to_string(i));
+    indices.push_back(iter);
+    loop_vars.push_back(iter);
+  }
+  Expr index = getIndex(indices, target->shape);
+  // store op initialized with variable node
+  Stmt for_stmt = Store::make(producer_buffer->data,
+                              stream, index,
+                              UIntImm::make(UInt(1), 1));
+  for (size_t j = 0; j < target->shape.size(); j++) {
+    for_stmt = For::make(
+        VarExpr(loop_vars[j]),
+        0, target->shape[j],
+        ForType::Serial,
+        DeviceAPI::None,
+        for_stmt);
+  }
+
+  // attr annotates new scope
+  Stmt body = AttrStmt::make(
+      target_buffer->data,
+      "device_scope", receiver_scope, for_stmt);
+  Tensor producer = ExternOpNode::make(producer_buffer->name,
+                                       "",
+                                       Array<IterVar>(),
+                                       producer_inputs,
+                                       producer_input_placeholders,
+                                       producer_output_placeholders,
+                                       body).output(0);
+
+  // recv stage creation + return tensor
+  Stage producer_stage = Stage(producer->op);
+  size_t pos = FindNodeRef(stages, consumer_stage);
+  if (split_bound == 0 || device_type == DeviceType::CPU)
+    pos = pos + 1;
+  else pos = split_bound + 1;
+  stages->data.insert(stages->data.begin() + pos, producer_stage.node_);
+  (*this)->stage_map.Set(producer->op, producer_stage);
+
+  // update consumer stages with new tensor and buffer
+  std::unordered_map<const Variable*, VarExpr> vsub;
+  vsub[target_buffer->data.as<Variable>()] = producer_buffer->data;
+  for (size_t i = 0; i < consumers.size(); i++) {
+    Stage s = consumers[i];
+    Array<Tensor> new_inputs;
+    Array<Buffer> new_input_placeholders;
+    const ExternOpNode* op = s->op.as<ExternOpNode>();
+    new_inputs.push_back(producer);
+    new_input_placeholders.push_back(producer_buffer);
+    for (size_t j = 0; j < op->inputs.size(); j++) {
+      if (target != op->inputs[j]) {
+        new_inputs.push_back(op->inputs[j]);
+        new_input_placeholders.push_back(op->input_placeholders[j]);
+      }
+    }
+    Stmt body = LoadReplacer(vsub).Mutate(op->body);
+    Stmt new_body = AttrStmt::make(
+        target_buffer->data,
+        "device_scope",
+        receiver_scope,
+        body);
+    s->op = ExternOpNode::make(
+        op->name,
+        op->tag,
+        op->axis,
+        new_inputs,
+        new_input_placeholders,
+        op->output_placeholders,
+        new_body);
+  }
+  return producer;
+}
+
 Tensor Schedule::reuse_at(const Tensor& target,
                           Stage parent,
                           IterVar axis,
diff --git a/tvm/src/schedule/schedule_ops.cc b/tvm/src/schedule/schedule_ops.cc
index b4f8e7468..8156844f5 100644
--- a/tvm/src/schedule/schedule_ops.cc
+++ b/tvm/src/schedule/schedule_ops.cc
@@ -349,7 +349,7 @@ Stmt ScheduleOps(
       << "call schedule.normalize before scheduleops";
   CHECK(s->op.defined());
   // no need to
specify place holder op. - if (s->op.as()) continue; + if (auto op = s->op.as()) continue; // Remove grouping sugar, get the real attach spec. Stage attach_spec = s.GetAttachSpec(); diff --git a/tvm/src/template/sdaccel/CLKernel.cpp b/tvm/src/template/sdaccel/CLKernel.cpp new file mode 100644 index 000000000..84cf29465 --- /dev/null +++ b/tvm/src/template/sdaccel/CLKernel.cpp @@ -0,0 +1,67 @@ +/*===============================================================*/ +/* */ +/* CLKernel.cpp */ +/* */ +/* Defines the object class for an OpenCL kernel */ +/* */ +/*===============================================================*/ + +#include "CLKernel.h" +#include + +namespace rosetta +{ + // initialize the kernel from binary file + CLKernel::CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id) + { + printf("Creating kernel %s ... ", kernel_name.c_str()); + + int err; + + // set the name and device ID + this->device_id = device_id; + this->kernel_name = kernel_name; + + // Create the compute kernel in the program we wish to run + kernel = clCreateKernel(program, kernel_name.c_str(), &err); + if (!kernel || err != CL_SUCCESS) + { + printf("Error: Failed to create compute kernel!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + } + + void CLKernel::set_global(int global_work_size[3]) + { + printf("Set global work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), + global_work_size[0], global_work_size[1], global_work_size[2]); + + for (int i = 0; i < 3; i ++ ) + this->global_size[i] = global_work_size[i]; + } + + void CLKernel::set_local(int local_work_size[3]) + { + printf("Set local work size of kernel %s to [%d, %d, %d]\n", kernel_name.c_str(), + local_work_size[0], local_work_size[1], local_work_size[2]); + + for (int i = 0; i < 3; i ++ ) + this->local_size[i] = local_work_size[i]; + } + + std::string CLKernel::get_name() + { + return this->kernel_name; + } + + void CLKernel::releaseKernel() + { + printf("Release kernel %s ... 
", kernel_name.c_str()); + // release kernel + clReleaseKernel(kernel); + printf("Done!\n"); + } +} diff --git a/tvm/src/template/sdaccel/CLKernel.h b/tvm/src/template/sdaccel/CLKernel.h new file mode 100644 index 000000000..2933913c8 --- /dev/null +++ b/tvm/src/template/sdaccel/CLKernel.h @@ -0,0 +1,96 @@ +/*===============================================================*/ +/* */ +/* CLKernel.h */ +/* */ +/* Defines the object class for an OpenCL kernel */ +/* */ +/*===============================================================*/ + + +#ifndef __CLKernel__Harness__ +#define __CLKernel__Harness__ + +// standard headers +#include +#include +#include +// opencl header +#include +// CLMemObj is a member of this class +#include "CLMemObj.h" + +namespace rosetta +{ + + // wrapper class around an OpenCL kernel + class CLKernel + { + + friend class CLWorld; + + public: + + // constructor + // compiles the kernel + CLKernel(cl_context context, cl_program program, std::string kernel_name, cl_device_id device_id); + + // set global/local work group size + void set_global(int global_work_size[3]); + void set_local(int local_work_size[3]); + + // get kernel name + std::string get_name(); + + protected: + + // set cl_mem argument + int set_mem_arg(int id, cl_mem mem_obj) + { + int err; + err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj); + if (err != CL_SUCCESS) + { + printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str()); + printf("Error Code %d\n", err); + return EXIT_FAILURE; + } + + return err; + } + + // set memory arguments for this kernel + template + int set_const_arg(int id, T& mem_obj) + { + int err; + // printf("%d\n", mem_obj); + err = clSetKernelArg(this->kernel, id, sizeof(mem_obj), &mem_obj); + printf("****************\n"); + printf("%d\n", err); + if (err != CL_SUCCESS) + { + printf("Error: Failed to set kernel argument %d for kernel %s!\n", id, (this->kernel_name).c_str()); + printf("Error Code %d\n", err); + return EXIT_FAILURE; + } + + return err; + } + + void releaseKernel(); + + private: + + // global and local work group size + size_t global_size[3]; + size_t local_size[3]; + + // kernel information and objects + std::string kernel_name; + cl_device_id device_id; // target device id + cl_kernel kernel; // compute kernel + + }; + +} +#endif /* defined(__CLKernel__Harness__) */ diff --git a/tvm/src/template/sdaccel/CLMemObj.cpp b/tvm/src/template/sdaccel/CLMemObj.cpp new file mode 100644 index 000000000..a6fdecf4a --- /dev/null +++ b/tvm/src/template/sdaccel/CLMemObj.cpp @@ -0,0 +1,57 @@ +/*===============================================================*/ +/* */ +/* CLMemObj.cpp */ +/* */ +/* Implements the member functions of CLMemObj class */ +/* */ +/*===============================================================*/ + + +#include "CLMemObj.h" + +namespace rosetta +{ + // default constructor, initializes everything to 0 + CLMemObj::CLMemObj() + { + this->mem_data = nullptr; + this->elt_size = 0; + this->length = 0; + this->flags = 0; + this->bank = nullptr; + } + + // meaningful constructor, initialize data info constants + CLMemObj::CLMemObj(void *mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext ) + { + this->mem_data = mem_data; + this->elt_size = elt_size; + this->length = length; + this->flags = flags; + // can use Xilinx mem extensions to specify DDR bank + if (xil_ext != nullptr) + { + this->bank = new cl_mem_ext_ptr_t; + this->bank->flags = xil_ext->flags; + this->bank->obj = 
xil_ext->obj; + this->bank->param = 0; + } + else + this->bank = nullptr; + } + + // return the pointer to data + void * CLMemObj::get_data() { return mem_data; } + + // get size of each element + int CLMemObj::get_element_size() { return elt_size; } + + // get the number of elements in the buffer + int CLMemObj::get_length() { return length; } + + // get OpenCL memory flags + cl_mem_flags CLMemObj::get_flags() { return flags; } + + // get xilinx memory extension pointer + cl_mem_ext_ptr_t* CLMemObj::get_xil_ext_ptr() { return bank; } +} diff --git a/tvm/src/template/sdaccel/CLMemObj.h b/tvm/src/template/sdaccel/CLMemObj.h new file mode 100644 index 000000000..30e564aff --- /dev/null +++ b/tvm/src/template/sdaccel/CLMemObj.h @@ -0,0 +1,57 @@ +/*===============================================================*/ +/* */ +/* CLMemObj.h */ +/* */ +/* Defines the object class for an OpenCL memory buffer */ +/* */ +/*===============================================================*/ + + +#ifndef __CLMemObj__Harness__ +#define __CLMemObj__Harness__ + +// standard header for command line output +#include +// opencl header +#include +// xilinx opencl extension header +#include + +namespace rosetta +{ + // wrapper class around cl_mem + class CLMemObj + { + + friend class CLWorld; + + public: + + // default constructor + CLMemObj (); + // a meaningful constructor + CLMemObj (void* mem_data, int elt_size, int length, cl_mem_flags flags, cl_mem_ext_ptr_t* xil_ext = nullptr); + + // get information about the buffer + void* get_data(); + int get_element_size(); + int get_length(); + cl_mem_flags get_flags(); + cl_mem_ext_ptr_t* get_xil_ext_ptr(); + + private: + + // pointer to data + void *mem_data; + // size of each element + int elt_size; + // number of elements + int length; + // OpenCL memory flag + cl_mem_flags flags; + // Xilinx extension describing bank assignment + cl_mem_ext_ptr_t* bank; + }; +} + +#endif /* defined(__CLMemObj__Harness__) */ diff --git a/tvm/src/template/sdaccel/CLWorld.cpp b/tvm/src/template/sdaccel/CLWorld.cpp new file mode 100644 index 000000000..7be386df2 --- /dev/null +++ b/tvm/src/template/sdaccel/CLWorld.cpp @@ -0,0 +1,401 @@ +/*===============================================================*/ +/* */ +/* CLWorld.cpp */ +/* */ +/* Implementation of the CLWorld class */ +/* */ +/*===============================================================*/ + +#include "CLWorld.h" + +namespace rosetta +{ + // default constructor + // make sure it does something meaningful + CLWorld::CLWorld() + { + // default: run on alpha data 7v3 board + this->target_device_name = "xilinx:adm-pcie-7v3:1ddr:3.0"; + this->device_type = CL_DEVICE_TYPE_ACCELERATOR; + + // configure the OpenCL runtime + createWorld(); + } + + // meaningful constructor + // user specifies device + CLWorld::CLWorld(std::string target_device_name, cl_device_type device_type) + { + this->target_device_name = target_device_name; + this->device_type = device_type; + createWorld(); + } + + // get the compute device + cl_device_id CLWorld::getDevice() + { + return this->device_id; + } + + // get context + cl_context CLWorld::getContext() + { + return this->context; + } + + // get compute program + cl_program CLWorld::getProgram() + { + return this->program; + } + + // insert a new memory object + int CLWorld::addMemObj(CLMemObj &new_mem_obj) + { + int err; + + printf("Adding memory object into the world ... 
"); + + // first push the CLMemObj object into our vector + mem_objs.push_back(new_mem_obj); + + // then create the actual cl_mem buffer, push it into another vector + cl_mem buf; + + buf = clCreateBuffer(context, new_mem_obj.flags, new_mem_obj.elt_size * new_mem_obj.length, new_mem_obj.bank, &err); + if (err != CL_SUCCESS) + { + printf("Error creating buffer for memory object %d!\n", mem_objs.size()-1); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + cl_mem_buffers.push_back(buf); + + // write the buffer onto the device if needed + if ((new_mem_obj.flags != CL_MEM_WRITE_ONLY) && (new_mem_obj.mem_data != nullptr)) + { + err = clEnqueueWriteBuffer(cmd_queue, buf, true, 0, new_mem_obj.elt_size * new_mem_obj.length, + new_mem_obj.mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error writing buffer %d onto the device!\n", mem_objs.size()-1); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + + printf("Done!\n"); + + return (mem_objs.size() - 1); + } + + int CLWorld::updateMemObj(int mem_idx) + { + printf("Updating mem object %d ... ", mem_idx); + + // write the buffer onto the device if needed + if (mem_objs[mem_idx].flags != CL_MEM_WRITE_ONLY) + { + int err = clEnqueueWriteBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0, + mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, + mem_objs[mem_idx].mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error writing buffer %d onto the device!\n", mem_idx); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + else + printf("Buffer %d is write_only! Not updating it ... \n", mem_idx); + + return EXIT_SUCCESS; + } + + int CLWorld::readMemObj(int mem_idx) + { + printf("Reading mem object %d into host buffers ... ", mem_idx); + + int err = clEnqueueReadBuffer(cmd_queue, cl_mem_buffers[mem_idx], true, 0, + mem_objs[mem_idx].elt_size * mem_objs[mem_idx].length, + mem_objs[mem_idx].mem_data, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error reading kernel buffer %d!\n", mem_idx); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + + // create compute program from a file + // return error code + int CLWorld::addProgram(std::string filename) + { + printf("Adding binary program into the world ... "); + + // load the file + size_t code_size = (size_t) load_file_to_memory(filename.c_str()); + + // start to compile + int err; + cl_int create_binary_status; + + // Create the compute program from the source buffer + program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *) &code_size, + (const unsigned char **) &kernel_code, &create_binary_status, &err); + if (!program) + { + printf("Error: Failed to create compute program!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + // Build the program executable + err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + size_t len; + char buffer[2048]; + + printf("Error: Failed to build program executable!\n"); + printf("Error Code %d\n", err); + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); + printf("%s\n", buffer); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // insert a kernel into the world + // return the position of the kernel in the vector + int CLWorld::addKernel(CLKernel &new_kernel) + { + printf("Adding kernel %s into the world ... 
", new_kernel.get_name().c_str()); + + kernels.push_back(new_kernel); + + printf("Done!\n"); + + return (kernels.size() - 1); + } + + // methods to set kernel arguments + // memory argument + int CLWorld::setMemKernelArg(int kernel_id, int pos, int arg_id) + { + printf("Set mem arg %d for kernel %d with mem object %d ... ", pos, kernel_id, arg_id); + + int err = kernels[kernel_id].set_mem_arg(pos, cl_mem_buffers[arg_id]); + if (err != CL_SUCCESS) + { + printf("Error setting kernel argument!\n"); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // run all kernels + // return error code + int CLWorld::runKernels(bool flush) + { + printf("Start kernel execution ... "); + + int err; + + // wait for previous write buffer tasks to finish + printf("Waiting for queue... \n"); + clFinish(cmd_queue); + + // enqueue all the kernels + // temporarily we assume kernels won't have any dependency between them + // or the dependency is handled inside kernels (such as pipes, etc. ) + for (int i = 0; i < kernels.size(); i ++ ) + { + printf("Start kernel %d!\n", i); + err = clEnqueueNDRangeKernel(cmd_queue, kernels[i].kernel, 3, NULL, kernels[i].global_size, kernels[i].local_size, + 0, NULL, NULL); + if (err != CL_SUCCESS) + { + printf("Error enqueuing kernel %d!\n", i); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + } + + // wait for them to finish + printf("Waiting for kernels ... \n"); + clFinish(cmd_queue); + + // remove all of them from the vector + // so that this function can be called multiple times + // at a cost that kernels won't be released automatically + if (flush) + { + int total_size = kernels.size(); + for (int i = 0; i < total_size; i ++ ) + kernels.pop_back(); + } + + printf("Done!\n"); + + return err; + } + + // create runtime environment + int CLWorld::createWorld() + { + printf("Initializing OpenCL runtime environment ... 
"); + + int err; + + // scan the machine for available OpenCL platforms + cl_uint platform_cnt; + cl_platform_id platforms[16]; + err = clGetPlatformIDs(16, platforms, &platform_cnt); + if (err != CL_SUCCESS) + { + printf("Error: Failed to find an OpenCL platform!\n"); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + printf("INFO: Found %d platforms\n", platform_cnt); + + + // find the target device + char device_name[1024]; + cl_device_id devices[16]; + cl_uint device_cnt; + bool found_device = false; + // scan all platforms + for (int p = 0; (p < platform_cnt) & (!found_device); p ++ ) + { + err = clGetDeviceIDs(platforms[p], this->device_type, 16, devices, &device_cnt); + if (err != CL_SUCCESS) + { + printf("Error: Failed to create a device group for platform %d!\n", p); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + // iterate through all devices on the platform + for (int d = 0; (d < device_cnt) & (!found_device); d ++ ) + { + err = clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 1024, device_name, 0); + if (err != CL_SUCCESS) + { + printf("Error: Failed to get device name for device %d on platform %d!\n", d, p); + printf("Error Code %d\n", err); + printf("Test failed\n"); + exit(EXIT_FAILURE); + } + + if (std::string(device_name) == this->target_device_name) + { + this->platform = platforms[p]; + this->device_id = devices[d]; + found_device = true; + printf("Selected device %d on platform %d as target device!\n", d, p); + } + } + } + + if (!found_device) + { + printf("Error: Target device %s is not found!\n", (this->target_device_name).c_str()); + exit(EXIT_FAILURE); + } + + // create context and command queue + this->context = clCreateContext(0, 1, &(this->device_id), 0, 0, &err); + if (!(this->context)) + { + printf("Error: Failed to create a compute context!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + this->cmd_queue = clCreateCommandQueue(this->context, this->device_id, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + &err); + if (!(this->cmd_queue)) + { + printf("Error: Failed to create a command queue!\n"); + printf("Error Code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // read kernel binary file into memory + int CLWorld::load_file_to_memory(const char *filename) + { + int size = 0; + FILE *f = fopen(filename, "rb"); + if (f == NULL) + { + kernel_code = NULL; + printf("Can not open kernel file!\n"); + exit(-1); + } + fseek(f, 0, SEEK_END); + size = ftell(f); + printf("Size of the file is %ld\n", size); + fseek(f, 0, SEEK_SET); + kernel_code = new char[size+1]; + if ((unsigned int) size != fread(kernel_code, sizeof(char), size, f)) + { + delete []kernel_code; + printf("Reading kernel failed!\n"); + exit(-2); + } + fclose(f); + (kernel_code)[size] = 0; + return size; + } + + + // release all runtime constructs + void CLWorld::releaseWorld() + { + printf("Cleaning up OpenCL opjects ... 
"); + + // release memory objects + for (int i = 0; i < cl_mem_buffers.size(); i ++ ) + clReleaseMemObject(cl_mem_buffers[i]); + + // release program + delete []kernel_code; + clReleaseProgram(program); + + // release kernels + for (int i = 0; i < kernels.size(); i ++ ) + kernels[i].releaseKernel(); + + // release device and context + clReleaseCommandQueue(cmd_queue); + clReleaseContext(context); + + printf("Done!\n"); + } + +} + + + + diff --git a/tvm/src/template/sdaccel/CLWorld.h b/tvm/src/template/sdaccel/CLWorld.h new file mode 100644 index 000000000..9624687aa --- /dev/null +++ b/tvm/src/template/sdaccel/CLWorld.h @@ -0,0 +1,129 @@ +/*===============================================================*/ +/* */ +/* CLWorld.h */ +/* */ +/* Defines the object class for OpenCL context */ +/* */ +/*===============================================================*/ + + +#ifndef __CLWorld__Harness__ +#define __CLWorld__Harness__ + +// standard headers +#include +#include +#include +// opencl header +#include +// CLKernel and CLMemObj are members of this class +#include "CLKernel.h" +#include "CLMemObj.h" + +namespace rosetta +{ + + class CLWorld + { + + public: + + // default constructor + CLWorld(); + + // meaningful constructor + CLWorld(std::string target_device_name, cl_device_type device_type); + + // get the compute device associated with this world + cl_device_id getDevice(); + + // get the compute context associated with this world + cl_context getContext(); + + // get the binary program + cl_program getProgram(); + + // insert a compute program + int addProgram(std::string filename); + + // insert a kernel + int addKernel(CLKernel &new_kernel); + + // insert a memory object + int addMemObj(CLMemObj &new_mem_obj); + + // update a memory object (write new value) + int updateMemObj(int mem_id); + + // read a memory object + int readMemObj(int mem_id); + + // set memory kernel argument + int setMemKernelArg(int kernel_id, int pos, int mem_id); + + // set constant kernel argument + template + int setConstKernelArg(int kernel_id, int pos, T& arg) + { + // printf("%lu\n", arg); + printf("Set const arg %d for kernel %d ... 
", pos, kernel_id); + + int err = kernels[kernel_id].set_const_arg(pos, arg); + if (err != CL_SUCCESS) + { + printf("Error setting kernel argument!\n"); + printf("Error code %d\n", err); + exit(EXIT_FAILURE); + } + + printf("Done!\n"); + + return err; + } + + // run kernels + int runKernels(bool flush = false); + + // clean up + void releaseWorld(); + + private: + + // OpenCL runtime variables + + // the platform we will use + cl_platform_id platform; + + // the device we will use + std::string target_device_name; // device name + cl_device_type device_type; // device type + cl_device_id device_id; // device id + + // compute context + cl_context context; + + // command queue + cl_command_queue cmd_queue; + + // binary program for the device + char* kernel_code; + cl_program program; + + // kernels + std::vector kernels; + + // memory objects + std::vector mem_objs; + // actual OpenCL memory buffers + std::vector cl_mem_buffers; + + // function to create the OpenCL runtime + int createWorld(); + + // load binary file into memory + int load_file_to_memory(const char *filename); + }; + +} + +#endif diff --git a/tvm/src/template/sdaccel/Makefile b/tvm/src/template/sdaccel/Makefile new file mode 100644 index 000000000..282f67921 --- /dev/null +++ b/tvm/src/template/sdaccel/Makefile @@ -0,0 +1,33 @@ +# Set kernel name +KERNEL_NAME = App + +# Set host source and headers +# HOST_SRC_CPP = ./src/host/digit_recognition.cpp ./src/host/utils.cpp ./src/host/check_result.cpp +HOST_SRC_CPP = host.cpp utils.cpp +# HOST_SRC_H = ./src/host/utils.h ./src/host/check_result.h ./src/host/typedefs.h ./src/host/testing_data.h \ + ./src/host/training_data.h +HOST_SRC_H = utils.h +# DATA = ./data/*.dat + + +# Set host code include paths +HOST_INC = -I/opt/Xilinx/Vivado/2018.2.op2258646/include/ +HOST_LIB = -L/opt/Xilinx/Vivado/2018.2.op2258646/lib/ + +# Set kernel file +OCL_KERNEL_SRC = interface.cpp +# OCL_KERNEL_H = ./src/host/typedefs.h +# SDSOC_KERNEL_SRC = ./src/sdsoc/digitrec.cpp +# SDSOC_KERNEL_H = ./src/host/typedefs.h +# SW_KERNEL_SRC = ./src/sw/digitrec_sw.cpp +# SW_KERNEL_H = ./src/host/typedefs.h ./src/sw/digitrec_sw.h + +# Set opencl kernel arguments +# log: removed --report system +OCL_KERNEL_ARGS = --max_memory_ports all + +#------------------------- +# Leave the rest to harness +#------------------------- +include harness.mk + diff --git a/tvm/src/template/sdaccel/harness.mk b/tvm/src/template/sdaccel/harness.mk new file mode 100644 index 000000000..23856f9c7 --- /dev/null +++ b/tvm/src/template/sdaccel/harness.mk @@ -0,0 +1,196 @@ +# ======================================== Check Xilinx SDX Environment Settings ================================================== # +ifndef XILINX_SDX + $(error Environment variable XILINX_SDX is required and should point to SDx install area) +endif + +# =============================================== Tools Used in Rosetta =========================================================== # + +# sdaccel tools +OCL_CXX = xcpp +XOCC = xocc + +# sdsoc tools +SDSXX = sds++ + +# default sw compiler +SW_CXX = g++ + +# ============================================= SDAccel Platform and Target Settings ============================================== # + +# Set Default OpenCL device and platform +USR_PLATFORM = n +OCL_DEVICE = xilinx:adm-pcie-7v3:1ddr:3.0 +OCL_PLATFORM = one_of_default_platforms + +# Check if the user specified opencl platform +ifneq ($(OCL_PLATFORM), one_of_default_platforms) + USR_PLATFORM=y +endif + +# Check OCL_TARGET value +OCL_TARGET = sw_emu +ifeq 
diff --git a/tvm/src/template/sdaccel/Makefile b/tvm/src/template/sdaccel/Makefile
new file mode 100644
index 000000000..282f67921
--- /dev/null
+++ b/tvm/src/template/sdaccel/Makefile
@@ -0,0 +1,33 @@
+# Set kernel name
+KERNEL_NAME = App
+
+# Set host source and headers
+# HOST_SRC_CPP = ./src/host/digit_recognition.cpp ./src/host/utils.cpp ./src/host/check_result.cpp
+HOST_SRC_CPP = host.cpp utils.cpp
+# HOST_SRC_H = ./src/host/utils.h ./src/host/check_result.h ./src/host/typedefs.h ./src/host/testing_data.h \
+              ./src/host/training_data.h
+HOST_SRC_H = utils.h
+# DATA = ./data/*.dat
+
+
+# Set host code include paths
+HOST_INC = -I/opt/Xilinx/Vivado/2018.2.op2258646/include/
+HOST_LIB = -L/opt/Xilinx/Vivado/2018.2.op2258646/lib/
+
+# Set kernel file
+OCL_KERNEL_SRC = interface.cpp
+# OCL_KERNEL_H = ./src/host/typedefs.h
+# SDSOC_KERNEL_SRC = ./src/sdsoc/digitrec.cpp
+# SDSOC_KERNEL_H = ./src/host/typedefs.h
+# SW_KERNEL_SRC = ./src/sw/digitrec_sw.cpp
+# SW_KERNEL_H = ./src/host/typedefs.h ./src/sw/digitrec_sw.h
+
+# Set opencl kernel arguments
+# log: removed --report system
+OCL_KERNEL_ARGS = --max_memory_ports all
+
+#-------------------------
+# Leave the rest to harness
+#-------------------------
+include harness.mk
+
diff --git a/tvm/src/template/sdaccel/harness.mk b/tvm/src/template/sdaccel/harness.mk
new file mode 100644
index 000000000..23856f9c7
--- /dev/null
+++ b/tvm/src/template/sdaccel/harness.mk
@@ -0,0 +1,196 @@
+# ======================================== Check Xilinx SDX Environment Settings ================================================== #
+ifndef XILINX_SDX
+    $(error Environment variable XILINX_SDX is required and should point to SDx install area)
+endif
+
+# =============================================== Tools Used in Rosetta =========================================================== #
+
+# sdaccel tools
+OCL_CXX = xcpp
+XOCC    = xocc
+
+# sdsoc tools
+SDSXX = sds++
+
+# default sw compiler
+SW_CXX = g++
+
+# ============================================= SDAccel Platform and Target Settings ============================================== #
+
+# Set default OpenCL device and platform
+USR_PLATFORM = n
+OCL_DEVICE   = xilinx:adm-pcie-7v3:1ddr:3.0
+OCL_PLATFORM = one_of_default_platforms
+
+# Check if the user specified an opencl platform
+ifneq ($(OCL_PLATFORM), one_of_default_platforms)
+    USR_PLATFORM = y
+endif
+
+# Check OCL_TARGET value
+OCL_TARGET = sw_emu
+ifeq ($(OCL_TARGET),sw_emu)
+else ifeq ($(OCL_TARGET),hw_emu)
+else ifeq ($(OCL_TARGET),hw)
+else
+    $(error "Unsupported OCL_TARGET value: $(OCL_TARGET). Supported values are: sw_emu, hw_emu, hw")
+endif
+
+# Check opencl kernel file type
+OCL_KERNEL_TYPE = ocl
+
+ifeq ($(suffix $(OCL_KERNEL_SRC)),.cl)
+    OCL_KERNEL_TYPE = ocl
+else
+    OCL_KERNEL_TYPE = c
+endif
+
+# OpenCL runtime libraries
+OPENCL_INC = $(XILINX_SDX)/runtime/include/1_2
+OPENCL_LIB = $(XILINX_SDX)/runtime/lib/x86_64
+
+# opencl harness files
+OCL_HARNESS_DIR = .
+OCL_HARNESS_SRC_CPP = $(OCL_HARNESS_DIR)/CLKernel.cpp $(OCL_HARNESS_DIR)/CLMemObj.cpp $(OCL_HARNESS_DIR)/CLWorld.cpp
+OCL_HARNESS_SRC_H   = $(OCL_HARNESS_DIR)/CLKernel.h $(OCL_HARNESS_DIR)/CLMemObj.h $(OCL_HARNESS_DIR)/CLWorld.h
+
+# host compilation flags
+OCL_HOST_FLAGS = -DOCL -g -lxilinxopencl -I$(OPENCL_INC) $(HOST_INC) -L$(OPENCL_LIB) $(HOST_LIB) -I$(OCL_HARNESS_DIR) -I$(APPLICATION_DIR)
+
+# xclbin compilation flags
+XCLBIN_FLAGS = -s -t $(OCL_TARGET) -g
+
+# change OCL_HOST_FLAGS
+ifdef K_CONST
+    OCL_HOST_FLAGS += -DK_CONST=$(K_CONST)
+endif
+ifdef NUM_ITER
+    OCL_HOST_FLAGS += -DNUM_ITER=$(NUM_ITER)
+endif
+ifdef FIXED_FLAG
+    OCL_HOST_FLAGS += -DFIXED_TYPE
+endif
+
+
+ifneq ($(OCL_KERNEL_TYPE),ocl)
+    XCLBIN_FLAGS += --kernel $(KERNEL_NAME)
+endif
+
+ifeq ($(USR_PLATFORM),n)
+    XCLBIN_FLAGS += --xdevice $(OCL_DEVICE)
+else
+    XCLBIN_FLAGS += --platform $(OCL_PLATFORM)
+endif
+
+
+# change XCLBIN_FLAGS
+ifdef K_CONST
+    XCLBIN_FLAGS += -DK_CONST=$(K_CONST)
+endif
+ifdef NUM_ITER
+    XCLBIN_FLAGS += -DNUM_ITER=$(NUM_ITER)
+endif
+ifdef FIXED_FLAG
+    XCLBIN_FLAGS += -DFIXED_TYPE
+endif
+
+
+XCLBIN_FLAGS += $(OCL_KERNEL_ARGS)
+
+
+# host exe
+OCL_HOST_EXE = $(KERNEL_NAME)_host.exe
+
+# Kernel XCLBIN file
+XCLBIN = $(KERNEL_NAME).$(OCL_TARGET).xclbin
+XO     = $(KERNEL_NAME).$(OCL_TARGET).xo
+
+# =============================================== SDSoC Platform and Target Settings ============================================== #
+
+# platform
+SDSOC_PLATFORM = zc706
+
+# executable
+SDSOC_EXE = $(KERNEL_NAME).elf
+
+# sds++ flags
+SDSFLAGS  = -sds-pf $(SDSOC_PLATFORM) -sds-hw $(KERNEL_NAME) $(SDSOC_KERNEL_SRC) -sds-end -clkid 3 \
+            -poll-mode 1 -verbose
+SDSCFLAGS += -DSDSOC -Wall -O3 -c
+SDSCFLAGS += -MMD -MP -MF"$(@:%.o=%.d)"
+SDSLFLAGS  = -O3
+
+# objects
+ALL_SDSOC_SRC = $(HOST_SRC_CPP) $(SDSOC_KERNEL_SRC)
+OBJECTS := $(ALL_SDSOC_SRC:.cpp=.o)
+DEPS    := $(OBJECTS:.o=.d)
+
+# =============================================== Pure Software Compilation Settings ============================================== #
+
+# compiler flags
+SW_FLAGS = -DSW -O3
+
+# sw executable
+SW_EXE = $(KERNEL_NAME)_sw.exe
+
+# ========================================================= Rules ================================================================= #
+
+# we will have 4 top-level rules: ocl, sdsoc, sw and clean
+# default to sw
+
+.PHONY: all ocl sdsoc sw clean
+
+all: sw
+
+# ocl rules
+ocl: $(OCL_HOST_EXE) $(XCLBIN)
+
+# ocl secondary rule: host executable
+$(OCL_HOST_EXE): $(HOST_SRC_CPP) $(HOST_SRC_H) $(OCL_HARNESS_SRC_CPP) $(OCL_HARNESS_SRC_H) $(DATA)
+	$(OCL_CXX) $(OCL_HOST_FLAGS) -o $@ $(HOST_SRC_CPP) $(OCL_HARNESS_SRC_CPP)
+
+# ocl secondary rule: xclbin
+$(XCLBIN): $(XO)
+	$(XOCC) -l $(XCLBIN_FLAGS) -o $@ $(XO)
+
+# ocl secondary rule: xo
+$(XO): $(OCL_KERNEL_SRC) $(OCL_KERNEL_H)
+	$(XOCC) -c $(XCLBIN_FLAGS) -o $@ $(OCL_KERNEL_SRC)
+
+# sdsoc rules
+sdsoc: $(SDSOC_EXE)
+
+$(SDSOC_EXE): $(OBJECTS)
+	$(SDSXX) $(SDSFLAGS) $(SDSLFLAGS) ${OBJECTS} -o $@
+
+-include $(DEPS)
+
+%.o: %.cpp
+	$(SDSXX) $(SDSFLAGS) $(SDSCFLAGS) $< -o $@
+
+
+# software rules
+sw: $(HOST_SRC_CPP) $(HOST_SRC_H) $(SW_KERNEL_SRC) $(SW_KERNEL_H) $(DATA)
+	$(SW_CXX) $(SW_FLAGS) -o $(SW_EXE) $(HOST_SRC_CPP) $(SW_KERNEL_SRC)
+
+# cleanup
+clean:
+	@echo "Cleaning old files"
+	rm -rf *.exe
+	rm -rf *.elf
+	rm -rf *.xclbin
+	rm -rf *.bit
+	rm -rf *.rpt
+	rm -rf system_estimate.xtxt
+	rm -rf _xocc*
+	rm -rf _sds
+	rm -rf sd_card
+	rm -rf .Xil
+	rm -rf ./src/host/*.d
+	rm -rf ./src/sdsoc/*.o
+	rm -rf ./src/sdsoc/*.d
+	rm -rf ./src/host/*.o
+	rm -rf *.dat
+	rm -rf *.html
+	rm -rf *.csv
+	rm -rf *.json
diff --git a/tvm/src/template/sdaccel/run.tcl b/tvm/src/template/sdaccel/run.tcl
new file mode 100644
index 000000000..0d6dca4b5
--- /dev/null
+++ b/tvm/src/template/sdaccel/run.tcl
@@ -0,0 +1,14 @@
+set hls_prj digitrec.prj
+open_project ${hls_prj} -reset
+set_top default_function
+add_files -tb main.cpp
+add_files -tb data
+
+open_solution "solution1"
+set_part {xc7z020clg484-1}
+create_clock -period 10
+
+csim_design -O
+csynth_design
+#cosim_design
+exit
diff --git a/tvm/src/template/sdaccel/run_hw.sh b/tvm/src/template/sdaccel/run_hw.sh
new file mode 100755
index 000000000..f65d28e6d
--- /dev/null
+++ b/tvm/src/template/sdaccel/run_hw.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#===============================================================#
+#                                                               #
+#                        run_hw.sh                              #
+#                                                               #
+#     A bash script to synthesize and generate bitstream        #
+#                                                               #
+#                                                               #
+#===============================================================#
+
+make clean
+
+# the k value of KNN, default is 3
+k_value=3
+# the directory of this lab
+app_dir=`pwd`
+
+### COMPILATION
+# create some blank-line space for easy readability
+echo ""; echo ""; echo "" ; echo ""
+echo "####################################################"
+echo " Synthesize and Generate Bitstream with K_CONST=$k_value"
+echo "####################################################"
+make ocl OCL_TARGET=hw OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value
+#export XCL_EMULATION_MODE=hw_emu
+#./DigitRec_host.exe -f DigitRec.hw_emu.xclbin
+
diff --git a/tvm/src/template/sdaccel/run_sw.sh b/tvm/src/template/sdaccel/run_sw.sh
new file mode 100755
index 000000000..80ba00495
--- /dev/null
+++ b/tvm/src/template/sdaccel/run_sw.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#===============================================================#
+#                                                               #
+#                        run_sw.sh                              #
+#                                                               #
+#        A bash script to run the software emulation            #
+#                                                               #
+#                                                               #
+#===============================================================#
+
+make clean
+
+# check env variable setup
+if [ -z "$AWS_PLATFORM" ]; then
+  echo "AWS_PLATFORM not set up; use default"
+  export AWS_PLATFORM=xilinx:adm-pcie-7v3:1ddr:3.0
+fi
+
+# set up emulation configuration
+echo "#################################################"
+echo " Setting emulation configuration..."
+echo "#################################################" +export LC_CTYPE=en_US.UTF-8 +export LC_ALL=en_US.UTF-8 +export XCL_EMULATION_MODE=true +emconfigutil --platform=$AWS_PLATFORM + +# the k value of KNN, default is 3 +k_value=3 +# the directory of this lab +app_dir=`pwd` + +### COMPILATION +# create some blank-line space for easy readability +echo ""; echo ""; echo "" ; echo "" +echo "####################################################" +echo " Compiling project with K_CONST=$k_value" +echo "####################################################" +make ocl OCL_TARGET=sw_emu OCL_PLATFORM=$AWS_PLATFORM APPLICATION_DIR=$app_dir K_CONST=$k_value + + +### EXECUTION +echo ""; echo ""; echo "" ; echo "" +echo "####################################################" +echo " Executing DigitRec with K_CONST=$k_value" +echo "####################################################" +export XCL_EMULATION_MODE=sw_emu +#export XCL_EMULATION_MODE=hw_emu +./App_host.exe -f App.sw_emu.xclbin + diff --git a/tvm/src/template/sdaccel/utils.cpp b/tvm/src/template/sdaccel/utils.cpp new file mode 100644 index 000000000..0e6dd632e --- /dev/null +++ b/tvm/src/template/sdaccel/utils.cpp @@ -0,0 +1,46 @@ +/*===============================================================*/ +/* */ +/* utils.cpp */ +/* */ +/* Utility functions */ +/* */ +/*===============================================================*/ + +#include +#include +#include +#include + +#include "utils.h" + +void print_usage(char* filename) +{ + printf("usage: %s \n", filename); + printf(" -f [kernel file]\n"); +} + +void parse_sdaccel_command_line_args( + int argc, + char** argv, + std::string& kernelFile) +{ + + int c = 0; + + while ((c = getopt(argc, argv, "f:")) != -1) + { + switch (c) + { + case 'f': + kernelFile = optarg; + break; + default: + { + print_usage(argv[0]); + exit(-1); + } + } // matching on arguments + } // while args present +} + + diff --git a/tvm/src/template/sdaccel/utils.h b/tvm/src/template/sdaccel/utils.h new file mode 100644 index 000000000..a3ab77437 --- /dev/null +++ b/tvm/src/template/sdaccel/utils.h @@ -0,0 +1,19 @@ +/*===============================================================*/ +/* */ +/* utils.h */ +/* */ +/* Utility functions */ +/* */ +/*===============================================================*/ + +#include +//target device +const std::string TARGET_DEVICE = "xilinx_aws-vu9p-f1-04261818_dynamic_5_0"; + +void print_usage(char* filename); + +void parse_sdaccel_command_line_args( + int argc, + char** argv, + std::string& kernelFile); + diff --git a/tvm/src/template/vivado/Makefile b/tvm/src/template/vivado/Makefile new file mode 100644 index 000000000..1d84baead --- /dev/null +++ b/tvm/src/template/vivado/Makefile @@ -0,0 +1,31 @@ +#========================================================================== +# Makefile +#========================================================================== +# @brief: A makefile the compiles and synthesizes the program +# +# @desc: 1. "make" runs csim by default +# 2. "make csim" compiles & executes the fixed-point implementation +# 3. "make clean" cleans up the directory + + +# Extract Vivado HLS include path +VHLS_PATH := $(dir $(shell which vivado_hls))/.. +VHLS_INC ?= ${VHLS_PATH}/include + +CFLAGS = -g -I${VHLS_INC} + +all: csim + +csim: host.cpp + @echo "Compiling & simulating on amdpool ..." 
diff --git a/tvm/src/template/vivado/Makefile b/tvm/src/template/vivado/Makefile
new file mode 100644
index 000000000..1d84baead
--- /dev/null
+++ b/tvm/src/template/vivado/Makefile
@@ -0,0 +1,31 @@
+#==========================================================================
+# Makefile
+#==========================================================================
+# @brief: A makefile that compiles and synthesizes the program
+#
+# @desc: 1. "make" runs csim by default
+#        2. "make csim" compiles & executes the fixed-point implementation
+#        3. "make clean" cleans up the directory
+
+
+# Extract Vivado HLS include path
+VHLS_PATH := $(dir $(shell which vivado_hls))/..
+VHLS_INC ?= ${VHLS_PATH}/include
+
+CFLAGS = -g -I${VHLS_INC}
+
+all: csim
+
+csim: host.cpp
+	@echo "Compiling & simulating on amdpool ..."
+	g++ ${CFLAGS} $^ -o out -lrt
+	./out
+
+vivado:
+	@echo "Run Vivado csim and HLS"
+	vivado_hls -f run.tcl
+
+clean:
+	rm -rf out *.txt *.dat *.prj *.log
+	rm -rf zedboard_project* xillydemo.bit
+
diff --git a/tvm/src/template/vivado/run.tcl b/tvm/src/template/vivado/run.tcl
new file mode 100644
index 000000000..d80b865df
--- /dev/null
+++ b/tvm/src/template/vivado/run.tcl
@@ -0,0 +1,36 @@
+#=============================================================================
+# run.tcl
+#=============================================================================
+# @brief: A Tcl script for synthesizing the design.
+
+# Project name
+set hls_prj out.prj
+
+# Open/reset the project
+open_project ${hls_prj} -reset
+
+# Top function of the design is "top"
+set_top top
+
+# Add design and testbench files
+add_files kernel.cpp
+add_files -tb host.cpp
+
+open_solution "solution1"
+# Use Zynq device
+set_part {xc7z020clg484-1}
+
+# Target clock period is 10ns
+create_clock -period 10
+
+# Directives
+
+############################################
+
+# Simulate the C++ design
+csim_design -O
+# Synthesize the design
+csynth_design
+# Co-simulate the design
+#cosim_design
+exit
diff --git a/tvm/src/template/vivado/timer.h b/tvm/src/template/vivado/timer.h
new file mode 100644
index 000000000..77c461b00
--- /dev/null
+++ b/tvm/src/template/vivado/timer.h
@@ -0,0 +1,94 @@
+//---------------------------------------------------------
+// Timer.h
+//---------------------------------------------------------
+#ifndef __TIMER_H__
+#define __TIMER_H__
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#define TIMER_ON
+
+//---------------------------------------------------------
+// Timer is an object which helps profile programs using
+// the gettimeofday() function.
+//  - By default, a timer is stopped when you instantiate it
+//    and must be started manually
+//  - Passing true to the constructor starts the timer when
+//    it is constructed
+//  - When the timer is destructed it prints stats to stdout
+//---------------------------------------------------------
+class Timer {
+
+  #ifdef TIMER_ON
+
+  char binName[50];
+  unsigned nCalls;
+  timeval ts_start;
+  float totalTime;
+
+  public:
+  //------------------------------------------------------------------
+  // constructor
+  //------------------------------------------------------------------
+  Timer (const char* Name="", bool On=false) {
+    if (On) {
+      // record the start time
+      gettimeofday(&ts_start, NULL);
+      nCalls = 1;
+    }
+    else {
+      nCalls = 0;
+    }
+    totalTime = 0;
+    strcpy(binName, Name);
+  }
+
+  //------------------------------------------------------------------
+  // destructor
+  //------------------------------------------------------------------
+  ~Timer () {
+    // on being destroyed, print the total time
+    if (nCalls > 0) {
+      printf ("%-20s: ", binName);
+      printf ("%6d calls; ", nCalls);
+      printf ("%7.3f msecs total time\n", 1000*totalTime);
+      //printf ("%7.4f msecs average time;\n", 1000*totalTime/nCalls);
+    }
+  }
+
+  //------------------------------------------------------------------
+  // start timer
+  //------------------------------------------------------------------
+  void start() {
+    // record start time
+    gettimeofday(&ts_start, NULL);
+    nCalls++;
+  }
+
+  //------------------------------------------------------------------
+  // stop timer
+  //------------------------------------------------------------------
+  void stop() {
+    // get current time, add elapsed time to totalTime
+    timeval ts_curr;
+    gettimeofday(&ts_curr, NULL);
+    totalTime += float(ts_curr.tv_sec - ts_start.tv_sec) +
+                 float(ts_curr.tv_usec)*1e-6 - float(ts_start.tv_usec)*1e-6;
+  }
+
+  #else
+
+  //--------------------------------------------------------------------
+  // all methods do nothing if TIMER_ON is not set
+  //--------------------------------------------------------------------
+  public:
+  Timer (const char* Name="", bool On=false) {}
+  void start() {}
+  void stop() {}
+
+  #endif
+};
+
+#endif
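Since the destructor reports the accumulated totals, the timer is naturally scope-based. A short usage sketch (the bin name and loop body are illustrative):

```cpp
#include "timer.h"

void run_workload() {
  Timer t("kernel_exec");    // stopped by default; named for the report
  for (int i = 0; i < 10; i++) {
    t.start();
    // ... the work being profiled ...
    t.stop();
  }
}  // ~Timer prints: "kernel_exec        :     10 calls;  ... msecs total time"
```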