From ded3b5d0455aa5576f9eca2654ebb7cd7a58f6d8 Mon Sep 17 00:00:00 2001 From: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> Date: Fri, 18 Sep 2020 05:49:09 +0800 Subject: [PATCH] [TIR][Hybrid] Hybrid Script Improvement (#6507) * [TIR][Hybrid] update * [TIR][Hybrid] python formatting --- python/tvm/hybrid/_ffi_api.py | 3 +- python/tvm/hybrid/intrin.py | 108 +- python/tvm/hybrid/parser.py | 318 ++--- python/tvm/hybrid/registry.py | 357 +++-- python/tvm/hybrid/scope_handler.py | 190 ++- python/tvm/hybrid/special_stmt.py | 48 +- python/tvm/hybrid/ty.py | 6 +- src/printer/tir_hybrid_printer.cc | 130 +- .../unittest/test_hybrid_error_report.py | 46 +- .../python/unittest/test_hybrid_roundtrip.py | 1230 ++++++----------- 10 files changed, 1207 insertions(+), 1229 deletions(-) diff --git a/python/tvm/hybrid/_ffi_api.py b/python/tvm/hybrid/_ffi_api.py index d59302a95dd1..929a65c03049 100644 --- a/python/tvm/hybrid/_ffi_api.py +++ b/python/tvm/hybrid/_ffi_api.py @@ -17,5 +17,4 @@ """FFI APIs for tvm.hybrid""" import tvm._ffi - -tvm._ffi._init_api("tir.hybrid", __name__) +tvm._ffi._init_api("hybrid", __name__) diff --git a/python/tvm/hybrid/intrin.py b/python/tvm/hybrid/intrin.py index 3dc46a280b72..fdd48f37d1bd 100644 --- a/python/tvm/hybrid/intrin.py +++ b/python/tvm/hybrid/intrin.py @@ -23,114 +23,146 @@ from .registry import register_intrin -@register_intrin +@register_intrin() def bool(imm): - return tvm.tir.const(imm.value, "bool") + return tvm.tir.const(imm, "bool") -@register_intrin +@register_intrin() def int8(imm): - return tvm.tir.const(imm.value, "int8") + return tvm.tir.const(imm, "int8") -@register_intrin +@register_intrin() def int16(imm): - return tvm.tir.const(imm.value, "int16") + return tvm.tir.const(imm, "int16") -@register_intrin +@register_intrin() def int32(imm): - return tvm.tir.const(imm.value, "int32") + return tvm.tir.const(imm, "int32") -@register_intrin +@register_intrin() def int64(imm): - return tvm.tir.const(imm.value, "int64") + return tvm.tir.const(imm, "int64") -@register_intrin +@register_intrin() def uint8(imm): - return tvm.tir.const(imm.value, "uint8") + return tvm.tir.const(imm, "uint8") -@register_intrin +@register_intrin() def uint16(imm): - return tvm.tir.const(imm.value, "uint16") + return tvm.tir.const(imm, "uint16") -@register_intrin +@register_intrin() def uint32(imm): - return tvm.tir.const(imm.value, "uint32") + return tvm.tir.const(imm, "uint32") -@register_intrin +@register_intrin() def uint64(imm): - return tvm.tir.const(imm.value, "uint64") + return tvm.tir.const(imm, "uint64") -@register_intrin +@register_intrin() def float8(imm): - return tvm.tir.const(imm.value, "float8") + return tvm.tir.const(imm, "float8") -@register_intrin +@register_intrin() def float16(imm): - return tvm.tir.const(imm.value, "float16") + return tvm.tir.const(imm, "float16") -@register_intrin +@register_intrin() def float32(imm): - return tvm.tir.const(imm.value, "float32") + return tvm.tir.const(imm, "float32") -@register_intrin +@register_intrin() def float64(imm): - return tvm.tir.const(imm.value, "float64") + return tvm.tir.const(imm, "float64") -@register_intrin +@register_intrin() def floordiv(x, y): return tvm.tir.floordiv(x, y) -@register_intrin +@register_intrin() def floormod(x, y): return tvm.tir.floormod(x, y) -@register_intrin +@register_intrin() def load(dtype, var, index, predicate=True): return tvm.tir.Load(dtype, var, index, predicate) -@register_intrin -def cast(dtype, value): +@register_intrin() +def cast(value, dtype): return tvm.tir.Cast(dtype, value) -@register_intrin +@register_intrin() def ramp(base, stride, lanes): - lanes = lanes.value if not isinstance(lanes, int) else lanes return tvm.tir.Ramp(base, stride, lanes) -@register_intrin +@register_intrin() def broadcast(value, lanes): - lanes = lanes.value if not isinstance(lanes, int) else lanes return tvm.tir.Broadcast(value, lanes) -@register_intrin +@register_intrin() def evaluate(value): return tvm.tir.Evaluate(value) -@register_intrin +@register_intrin() def store(var, index, value, predicate=True): return tvm.tir.Store(var, value, index, predicate) -@register_intrin +@register_intrin() def iter_var(var, dom, iter_type, thread_tag): iter_type = getattr(tvm.tir.IterVar, iter_type) return tvm.tir.IterVar(dom, var, iter_type, thread_tag) + + +@register_intrin() +def max(a, b): # pylint: disable=redefined-builtin + return tvm.tir.Max(a, b) + + +def get_axis(begin, end, iter_type): + ana = tvm.arith.Analyzer() + extent = ana.simplify(end - begin) + block_var_dom = tvm.ir.Range.from_min_extent(begin, extent) + + iter_type_dict = {"data_par": 0, "reduce": 2, "scan": 3, "opaque": 4} + return tvm.tir.IterVar(block_var_dom, "bv", iter_type_dict[iter_type]) + + +@register_intrin() +def range(begin, end): + return get_axis(begin, end, "data_par") + + +@register_intrin() +def reduce_axis(begin, end): + return get_axis(begin, end, "reduce") + + +@register_intrin() +def scan_axis(begin, end): + return get_axis(begin, end, "scan") + + +@register_intrin() +def opaque_axis(begin, end): + return get_axis(begin, end, "opaque") diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index b3b042a6de7d..a1aa652c79d9 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -19,7 +19,6 @@ # pylint: disable=unnecessary-comprehension, unused-argument, import-outside-toplevel # pylint: disable=unused-import import json -import numbers import operator from typed_ast import ast3 as ast @@ -30,9 +29,10 @@ from tvm.tir import all as _all from tvm.tir import expr as _expr -from . import scope_emitter, special_stmt, scope_handler, intrin +from . import scope_emitter, special_stmt, scope_handler, intrin, ty from .meta_unparser import MetaUnparser from .registry import Registry +from . import _ffi_api class HybridParserError(RuntimeError): @@ -45,13 +45,14 @@ class HybridParser(ast.NodeVisitor): 1. To support new types of AST nodes. Add a function visit_xxx(). 2. To support new functions We divide allowed function calls in hybrid script into 3 categories, - which is scope_handler, intrin and special_stmt. - 1) scope_handler: scope_handler functions correspond to StmtNodes without body, which can be - further classified into 2 categories: with scope handler can for scope handlers - 2) intrin: intrin functions corresponds to the remaining IRNodes (StmtNodes without body, - PrimExprNodes and more) - 3) special_stmt: special_stmt functions don't correspond to an IRNode in the AST directly. - It is usually used for some information that is not suitable to be printed directly. + which is intrin, scope_handler and special_stmt. + 1) intrin functions ought to have return value. + User can also register intrin category function into parser. + 2) scope_handler functions have no return value and accepts parser and AST node + as its arguments, which is used in for scope and with scope. + 3) special_stmt functions have return value and accepts parser and AST node as its arguments + When visiting Call node, we check special_stmt registry at first. If no registered function + is found, we then check intrin. When visiting With node, we check with_scope registry. When visiting For node, we check for_scope registry. """ @@ -83,6 +84,7 @@ def __init__(self, src, base_lienno): self.buffer_map = None self.dict_attr = None self.scope_emitter = None + self.var_env_dict = None self.src = src.split("\n") self.base_lineno = base_lienno @@ -91,9 +93,7 @@ def __init__(self, src, base_lienno): self.meta = None self.functions = {} - - self._in_with_func_arg = False - self._assign_target = None + self.target = None def init_function_parsing_env(self): """Initialize function parsing environment""" @@ -101,6 +101,7 @@ def init_function_parsing_env(self): self.buffer_map = {} # buffer map self.dict_attr = {} # dict attr self.scope_emitter = scope_emitter.ScopeEmitter(self) # scope emitter + self.var_env_dict = {} # map from var to thread env name @staticmethod def is_meta(node): @@ -169,15 +170,6 @@ def report_error(self, message, lineno=None, col_offset=None): col_offset = self.current_col_offset raise HybridParserError(self.wrap_line_col(message, lineno, col_offset)) - def get_type_name(self, vtype): - if ( - isinstance(vtype, ast.Attribute) - and isinstance(vtype.value, ast.Name) - and vtype.value.id == "ty" - ): - return vtype.attr - self.report_error("invalid type annotation") - def get_body(self): body = [] while len(self.scope_emitter.node_stack[-1]) > 0: @@ -186,31 +178,19 @@ def get_body(self): body.append(res) return tvm.tir.SeqStmt(body) if len(body) > 1 else body[0] - def parse_type(self, vtype): - """ Parse type annotation AST into Type object """ - if isinstance(vtype, ast.NameConstant) and vtype.value is None: - return tvm.ir.TupleType([]) - elif isinstance(vtype, ast.Attribute): - return tvm.ir.PrimType(self.get_type_name(vtype)) - elif isinstance(vtype, ast.Subscript) and isinstance(vtype.slice, ast.Index): - type_name = self.get_type_name(vtype.value) - if isinstance(vtype.slice.value, ast.Tuple): - args = [self.parse_type(element) for element in vtype.slice.value.elts] - else: - args = [self.parse_type(vtype.slice.value)] - if type_name == "Ptr": - return tvm.ir.PointerType(*args) - elif type_name == "Tuple": - return tvm.ir.TupleType(args) - - self.report_error("invalid type annotation") + def get_type(self, type_node): + """ Parse type """ + if type_node is None: + self.report_error("missing type annotation") + res_type = self.visit(type_node) + return tvm.ir.TupleType([]) if res_type is None else res_type.evaluate() def generic_visit(self, node): """Override method in ast.NodeVisitor. To directly filter out invalidate type of stmt. """ - self.report_error(type(node).__name__ + " stmt is not supported now") + self.report_error(type(node).__name__ + " AST node is not supported now") def visit_Module(self, node): """Module visitor @@ -304,7 +284,7 @@ def visit_FunctionDef(self, node): self.init_function_parsing_env() # add parameters of function for arg in node.args.args: - arg_var = tvm.te.var(arg.arg, self.parse_type(arg.annotation)) + arg_var = tvm.te.var(arg.arg, self.get_type(arg.annotation)) self.scope_emitter.update_symbol(arg.arg, arg_var) self.params.append(arg_var) @@ -315,7 +295,7 @@ def visit_FunctionDef(self, node): func = tvm.tir.PrimFunc( self.params, self.get_body(), - ret_type=self.parse_type(node.returns), + ret_type=self.get_type(node.returns), buffer_map=self.buffer_map, attrs=tvm.ir.make_node("DictAttrs", **self.dict_attr), ) @@ -326,12 +306,15 @@ def visit_Assign(self, node): """Assign visitor AST abstract grammar: Assign(expr* targets, expr value, string? type_comment) - By now only 2 types of Assign is supported: - 1. special stmts that appear as assign stmt + By now only 3 types of Assign is supported: + 1. special stmts with return value 1.1 Buffer = tir.buffer_bind()/tir.buffer_decl() 1.2 Var = tir.var() + 1.3 Var = tir.env_thread() 2. (BufferStore) Buffer[PrimExpr, PrimExpr, ..., PrimExpr] = PrimExpr - 3. (Store) Var[PrimExpr] = PrimExpr + 3. (Store) Var[PrimExpr] = PrimExpr + 4. with scope handlers with concise scoping and var def + 4.1 var = tir.alloc_with_scope() """ if not len(node.targets) == 1: @@ -339,22 +322,29 @@ def visit_Assign(self, node): target = node.targets[0] if isinstance(target, ast.Name): - # scenario 1 - self._assign_target = target.id - rhs = self.visit(node.value) + # scenario 1&4 + self.target = [target.id] if not isinstance(node.value, ast.Call): - self.report_error("Unsupported Assign stmt") - self.scope_emitter.update_symbol(target.id, rhs) + self.report_error("Unsupported assign stmt") + func = self.visit(node.value.func) + if Registry.is_with_scope(func): + # scenario 4 + return self.visit(node.value) + else: + # scenario 1 + rhs = self.visit(node.value) + self.scope_emitter.update_symbol(target.id, rhs) elif isinstance(target, ast.Subscript): # scenario 2&3 symbol, indexes = self.visit(target) - self._assign_target = (symbol, indexes) rhs = self.visit(node.value) if isinstance(symbol, tvm.tir.Buffer): + # BufferStore return tvm.tir.BufferStore(symbol, tvm.runtime.convert(rhs), indexes) else: if len(indexes) != 1: self.report_error("Invalid Store stmt") + # Store return tvm.tir.Store( symbol, tvm.runtime.convert(rhs), indexes[0], tvm.runtime.convert(True) ) @@ -370,7 +360,7 @@ def visit_AnnAssign(self, node): if isinstance(node.target, ast.Name): value = self.visit(node.value) - var = tvm.te.var(node.target.id, self.parse_type(node.annotation)) + var = tvm.te.var(node.target.id, self.get_type(node.annotation)) self.scope_emitter.update_symbol(var.name, var) return tvm.tir.LetStmt(var, value, self.visit(self.scope_emitter.node_stack[-1].pop())) else: @@ -394,40 +384,26 @@ def visit_For(self, node): AST abstract grammar: For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment) By now only 1 type of For is supported: - 1. for name in tir.range(begin, end, for_type) + 1. for name in tir.serial/parallel/vectorized/unroll(begin, end) """ - if not isinstance(node.target, ast.Name): - self.report_error("The loop variable should be a name variable") - # check node.iter, which is a tir Call + # check node.iter, which is a Call if not isinstance(node.iter, ast.Call): self.report_error("The loop iter should be a Call") - if ( - not isinstance(node.iter.func, ast.Attribute) - or not isinstance(node.iter.func.value, ast.Name) - or node.iter.func.value.id != "tir" - ): - self.report_error("The loop iter Call should be tir.name()") - - func_name = node.iter.func.attr + func = self.visit(node.iter.func) + if not Registry.is_for_scope(func): + self.report_error("Function not allowed in for scope") # collect arguments args = [self.visit(arg) for arg in node.iter.args] kw_args = [self.visit(keyword) for keyword in node.iter.keywords] kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - # All the functions supported in For stmt are registered in scope_handler.ForScope - if func_name not in Registry.for_scope: - self.report_error( - "Function " + func_name + " used in For stmt is not supported now", - self.current_lineno, - node.iter.col_offset, - ) old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno, self.current_col_offset = ( self.base_lineno + node.iter.lineno - 1, node.iter.col_offset, ) - res = Registry.for_scope.get(func_name)(self, node, args, kw_args) + res = func(self, node, args, kw_args) self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -436,37 +412,45 @@ def visit_With(self, node): AST abstract grammar: With(withitem* items, stmt* body, string? type_comment) withitem = (expr context_expr, expr? optional_vars) - By now only 1 type of With is supported: - 1. with tir.let/tir.Assert()/tir.attr()/tir.allocate()/tir.realize() + By now 2 types of With is supported: + 1. with tir.allocate() as targets: + 2. with tir.let()/tir.Assert()/tir.attr()//tir.realize() """ - - if len(node.items) != 1: + if not len(node.items) == 1: self.report_error("Only one with element is supported now") if not isinstance(node.items[0].context_expr, ast.Call): self.report_error("The context expression of with should be a Call") + func_call = node.items[0].context_expr - if ( - not isinstance(func_call.func, ast.Attribute) - or not isinstance(func_call.func.value, ast.Name) - or func_call.func.value.id != "tir" - ): - self.report_error("The context expression of with should be tir.name()") - - func_name = func_call.func.attr - # collect arguments + func_node = func_call.func + func = self.visit(func_node) + + if not Registry.is_with_scope(func): + self.report_error("Function not allowed in with scope") + + self.target = [] + if node.items[0].optional_vars is not None: + # preprocess optional var names + if isinstance(node.items[0].optional_vars, ast.Name): + self.target = [node.items[0].optional_vars.id] + elif isinstance(node.items[0].optional_vars, (ast.List, ast.Tuple)): + for var in node.items[0].optional_vars.elts: + if not isinstance(var, ast.Name): + self.report_error("Invalid optional var definition") + self.target = [var.id for var in node.items[0].optional_vars.elts] + else: + self.report_error("Invalid optional var definition") + # parse other arguments args = [self.visit(arg) for arg in func_call.args] kw_args = [self.visit(keyword) for keyword in func_call.keywords] kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - if func_name not in Registry.with_scope: - self.report_error("Function " + func_name + " used in With stmt is not supported now") - # All the functions supported in With stmt are registered in scope_handler.WithScope old_lineno, old_col_offset = self.current_lineno, self.current_col_offset self.current_lineno, self.current_col_offset = ( self.base_lineno + func_call.lineno - 1, func_call.col_offset, ) - res = Registry.with_scope.get(func_name)(self, node, args, kw_args) + res = func(self, node, args, kw_args) self.current_lineno, self.current_col_offset = old_lineno, old_col_offset return res @@ -498,49 +482,41 @@ def visit_Call(self, node): AST abstract grammar: Call(expr func, expr* args, keyword* keywords) keyword = (identifier? arg, expr value) + All the functions used outside With and For are registered in special_stmt or intrin """ + func = self.visit(node.func) # collect arguments args = [self.visit(arg) for arg in node.args] kw_args = [self.visit(keyword) for keyword in node.keywords] kw_args = {kw_arg[0]: kw_arg[1] for kw_arg in kw_args} - maybe_intrin = False - if isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name): - if node.func.value.id == "tir": - func_name = node.func.attr - maybe_intrin = True + if callable(func): + if Registry.is_registered(func): + return func(self, node, args, kw_args) else: - self.report_error("Unsupported Attribute typed function call") - else: - self.report_error("Unsupported function call") + return func(*args, **kw_args) + elif isinstance(func, tvm.tir.op.Op): + return tvm.tir.Call(kw_args["dtype"], func, args) - if func_name in Registry.special_stmt: - return Registry.special_stmt.get(func_name)(self, node, args, kw_args) - if func_name in Registry.intrin: - return Registry.intrin.get(func_name)(self, node, args, kw_args) - if func_name in Registry.with_scope: - return Registry.with_scope.get(func_name)(self, node, args, kw_args) - if maybe_intrin: - return tvm.tir.Call(kw_args["dtype"], tvm.ir.op.Op.get("tir." + func_name), args) - - self.report_error("Function " + func_name + " is not supported now") + self.report_error("Unsupported function call") def visit_Expr(self, node): """Expr visitor AST abstract grammar: Expr(expr value) - - Now only 2 types of Expr stmt is allowed: - 1. Concise mode of with scope handlers - tir.attr()/tir.assert()/tir.allocate()/tir.realize() - 2. special stmts appear as a call - tir.set_func_attr() + Now only 3 types of `Expr` stmt is allowed: + 1. reducer.step()/tir.store() + 2. tir.attr()/tir.assert()/tir.allocate()/tir.realize() + 3. tir.set_func_attr() """ if not isinstance(node.value, ast.Call): self.report_error("Unsupported Expr stmt") - return self.visit(node.value) + res = self.visit(node.value) + if res is None or isinstance(res, tvm.tir.Stmt): + return res + self.report_error("Invalid Expr stmt") def visit_BinOp(self, node): """BinOp visitor @@ -602,16 +578,14 @@ def visit_Subscript(self, node): 2. meta[type_key][index], Meta info access """ - if isinstance(node.value, (ast.Name, ast.Attribute)): - symbol = self.visit(node.value) + symbol = self.visit(node.value) + if symbol is None: + self.report_error(node.value.id + " is not defined") + if isinstance(symbol, (tvm.tir.expr.Var, tvm.tir.Buffer)): if isinstance(node.slice, ast.Index): - # BufferLoad & BufferStore - if isinstance(node.slice.value, ast.Tuple): - # Buffer/Var[index, index, ...] - indexes = [self.visit(element) for element in node.slice.value.elts] - else: - # Buffer/Var[index] - indexes = [self.visit(node.slice.value)] + # BufferLoad & BufferStore, Buffer/Var[index, index, ...] + indexes = self.visit(node.slice.value) + indexes = list(indexes) if isinstance(indexes, tuple) else [indexes] if isinstance(node.ctx, ast.Load): if isinstance(symbol, tir.expr.Var): return tvm.tir.Load("float32", symbol, indexes, True) @@ -643,42 +617,11 @@ def visit_Subscript(self, node): extent = ana.simplify(extent) doms.append(tvm.ir.Range.from_min_extent(lower, extent)) return symbol, doms - - elif ( - isinstance(node.value, ast.Subscript) - and isinstance(node.value.value, ast.Name) - and node.value.value.id == "meta" - ): - # meta[type_key][index] - if not ( - isinstance(node.slice, ast.Index) and isinstance(node.slice.value, ast.Num) - ) or not ( - isinstance(node.value.slice, ast.Index) - and isinstance(node.value.slice.value, ast.Name) - ): - self.report_error("The meta access format ought to be meta[type_key][index]") - type_key = node.value.slice.value.id - index = node.slice.value.n - node_list = self.meta[type_key] - if node_list is None: - self.report_error("type_key " + type_key + " in meta not found") - if len(node_list) <= index: - self.report_error("index " + index + " out of range " + len(node_list)) - return node_list[index] else: - self.report_error("Only buffer variable and meta can be subscriptable") - - def visit_Name(self, node): - """Name visitor - AST abstract grammar: - Name(identifier id, expr_context ctx) - """ - - name = node.id - symbol = self.scope_emitter.lookup_symbol(name) - if symbol is None: - self.report_error("Unknown symbol %s" % name) - return symbol + res = symbol[self.visit(slice)] + if res is None: + self.report_error("Only buffer variable and meta can be subscriptable") + return res def visit_Attribute(self, node): """Attribute visitor @@ -686,15 +629,28 @@ def visit_Attribute(self, node): Attribute(expr value, identifier attr, expr_context ctx) """ - if not isinstance(node.value, ast.Name): - self.report_error("The value of Attribute ought to a Name") - name = node.value.id - symbol = self.scope_emitter.lookup_symbol(name) - if symbol is None or not isinstance(symbol, tvm.tir.Buffer): + if isinstance(node.value, ast.Name): + if node.value.id == "tir": + func_name = "tir." + node.attr + res = Registry.look_up_function(func_name) + if res is not None: + return res + try: + return tvm.ir.op.Op.get(func_name) + except AttributeError: + self.report_error("Unregistered function tir." + node.attr) + elif node.value.id == "ty": + if not hasattr(ty, node.attr): + self.report_error("invalid type annotation ty." + node.attr) + return getattr(ty, node.attr) + + symbol = self.visit(node.value) + if symbol is None: self.report_error("Unsupported Attribute expression") if not hasattr(symbol, node.attr): self.report_error("Type " + type(symbol) + " has not attr " + node.attr) - return getattr(symbol, node.attr) + res = getattr(symbol, node.attr) + return res def visit_Dict(self, node): """Dict visitor @@ -731,20 +687,32 @@ def visit_keyword(self, node): return node.arg, self.visit(node.value) - def visit_NameConstant(self, node): - return tvm.runtime.convert(node.value) + def visit_Name(self, node): + """Name visitor + AST abstract grammar: + Name(identifier id, expr_context ctx) + """ + + name = node.id + if name == "meta": + return self.meta + symbol = Registry.look_up_function(name) + if symbol is not None: + return symbol + symbol = self.scope_emitter.lookup_symbol(name) + if symbol is not None: + return symbol + self.report_error("Unknown identifier %s" % name) + # note that after Python3.8, ast.NameConstant, ast.Num, ast.Str are no longer used def visit_Constant(self, node): - return tvm.runtime.convert(node.value) + return node.value + + def visit_NameConstant(self, node): + return node.value def visit_Num(self, node): - if isinstance(node.n, numbers.Integral): - dtype = "int32" - elif isinstance(node.n, float): - dtype = "float32" - else: - self.report_error("The data type should be one of (int, float)") - return tvm.tir.const(node.n, dtype) + return node.n def visit_Str(self, node): return node.s @@ -787,4 +755,4 @@ def from_source(src, func_lineno=0): raise HybridParserError(inject_e) -tvm._ffi._init_api("tvm.hybrid.parser") +tvm._ffi._init_api("hybrid", __name__) diff --git a/python/tvm/hybrid/registry.py b/python/tvm/hybrid/registry.py index 9f5c39161150..a1b2b3cd4e39 100644 --- a/python/tvm/hybrid/registry.py +++ b/python/tvm/hybrid/registry.py @@ -17,18 +17,62 @@ """Hybrid Script Parser Function Registry """ # pylint: disable=inconsistent-return-statements import inspect +from enum import Enum from typed_ast import ast3 as ast +import tvm + + +class Category(Enum): + INTRIN = 0 + WITH_SCOPE = 1 + FOR_SCOPE = 2 + SPECIAL_STMT = 3 + class Registry(object): """Registration map All these maps are static """ - intrin = dict() - with_scope = dict() - for_scope = dict() - special_stmt = dict() + functions = dict() + + @staticmethod + def look_up_function(func_name): + """look up a registered function by name""" + if func_name in Registry.functions: + return Registry.functions[func_name][0] + return None + + @staticmethod + def is_intrin(func): + """check whether a function belongs to intrin""" + return (func, Category.INTRIN) in Registry.functions.values() + + @staticmethod + def is_with_scope(func): + """check whether a function belongs to with scope handlers""" + return (func, Category.WITH_SCOPE) in Registry.functions.values() + + @staticmethod + def is_for_scope(func): + """check whether a function belongs to for scope handlers""" + return (func, Category.FOR_SCOPE) in Registry.functions.values() + + @staticmethod + def is_special_stmt(func): + """check whether a function belongs to special stmts""" + return (func, Category.SPECIAL_STMT) in Registry.functions.values() + + @staticmethod + def is_registered(func): + """check whether a function is registered""" + return ( + Registry.is_intrin(func) + or Registry.is_with_scope(func) + or Registry.is_for_scope(func) + or Registry.is_special_stmt(func) + ) class CallArgumentReader(object): @@ -40,99 +84,171 @@ def __init__(self, func_name, args, kwargs, parser): self.kwargs = kwargs self.parser = parser - def get_func_compulsory_arg(self, pos, name): - """Get corresponding function argument from argument list which is compulsory""" - + def get_pos_only_arg(self, pos, name): + """Get corresponding position only function argument from argument list""" if len(self.args) >= pos: arg = self.args[pos - 1] - elif name not in self.kwargs.keys(): + elif name not in self.kwargs: self.parser.report_error(self.func_name + " misses argument " + name) else: arg = self.kwargs[name] return arg - def get_func_optional_arg(self, pos, name, default): - """Get corresponding function argument from argument list which is optional. + def get_kwarg(self, pos, name, default): + """Get corresponding keyword function argument from argument list If user doesn't provide the argument, set it to default value """ - if len(self.args) >= pos: arg = self.args[pos - 1] - elif name in self.kwargs.keys(): + elif name in self.kwargs: arg = self.kwargs[name] else: return default return arg + def get_varargs(self, pos): + """Get corresponding variable argument from argument list""" + if len(self.args) >= pos and len(self.kwargs) == 0: + return self.args[pos - 1 :] + return [] -def func_wrapper(func_name, func_to_register, arg_list, need_parser_and_node, need_body, concise): + def auto_insert_body(self, pos, body): + """Automatically provide body as function call argument""" + if len(self.args) >= pos: + self.args.insert(pos - 1, body) + else: + self.kwargs["body"] = body + + +def func_wrapper(func_name, func_to_register, arg_list, category, concise=False, with_var=False): """Helper function to wrap a function to be registered """ def wrap_func(parser, node, args, kwargs): - reader = CallArgumentReader(func_name, args, kwargs, parser) - internal_args = list() - - if need_body and not isinstance(node, ast.For): - # automatically parse body for with scope handlers - if isinstance(node, ast.With): - # the with scope handler is used inside with context - parser.scope_emitter.new_scope() - parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) - body = parser.get_body() - parser.scope_emitter.pop_scope() + if category == Category.FOR_SCOPE: + # automatically parse loop vars and body for for_scope handlers + loop_var_names = list() + if isinstance(node.target, ast.Name): + loop_var_names.append(node.target.id) + elif isinstance(node.target, ast.Tuple): + for elt in node.target.elts: + if not isinstance(elt, ast.Name): + parser.report_error("Invalid loop var") + loop_var_names.append(elt.id) else: - # the with scope handler is used in concise scoping - if not concise: - parser.report_error("Concise scoping is not allowed here") - body = parser.get_body() - - if need_parser_and_node: - internal_args.append(parser) - internal_args.append(node) - - for i, arg_info in enumerate(arg_list): - if len(arg_info) == 1: - (arg_name,) = arg_info - if need_body and arg_name == "body": - internal_args.append(body) + parser.report_error("Invalid loop var") + loop_vars = [tvm.te.var(name, dtype="int32") for name in loop_var_names] + + parser.scope_emitter.new_scope() + parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) + for loop_var in loop_vars: + parser.scope_emitter.update_symbol(loop_var.name, loop_var) + body = parser.get_body() + parser.scope_emitter.pop_scope() + elif category == Category.WITH_SCOPE: + if not with_var: + if isinstance(node, ast.With) and node.items[0].optional_vars is not None: + parser.report_error("Function " + func_name + " expects no optional vars") + # automatically parse body for with_scope handlers without optional vars + if isinstance(node, ast.With): + parser.scope_emitter.new_scope() + parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) + body = parser.get_body() + parser.scope_emitter.pop_scope() else: - internal_args.append(reader.get_func_compulsory_arg(i + 1, arg_name)) + body = parser.get_body() else: - arg_name, default = arg_info - internal_args.append(reader.get_func_optional_arg(i + 1, arg_name, default=default)) + if isinstance(node, ast.With) and node.items[0].optional_vars is None: + parser.report_error("Function " + func_name + " expects optional vars") + body = None + + if not isinstance(node, ast.With) and not concise: + parser.report_error("Concise scoping is not allowed here") + + reader = CallArgumentReader(func_name, args, kwargs, parser) + pos_only, kwargs, varargs = arg_list + + internal_args = list() + if category == Category.WITH_SCOPE: + if not with_var: + internal_args.extend([parser, node, body]) + else: + internal_args.extend([parser, node]) + elif category == Category.FOR_SCOPE: + internal_args.extend([parser, node, body, loop_vars]) + elif category == Category.SPECIAL_STMT: + internal_args.extend([parser, node]) + + for i, arg_name in enumerate(pos_only): + internal_args.append(reader.get_pos_only_arg(i + 1, arg_name)) + + for i, arg_info in enumerate(kwargs): + arg_name, default = arg_info + internal_args.append(reader.get_kwarg(i + 1 + len(pos_only), arg_name, default=default)) + + if varargs is not None: + internal_args.extend(reader.get_varargs(len(pos_only) + len(kwargs) + 1)) return func_to_register(*internal_args) return wrap_func -def get_arg_list(origin_func, need_parser_and_node): +def get_arg_list(origin_func, category, with_var=False): """Helper function to get the argument list of Function - Parameters ---------- origin_func: function The function to get the argument list - - need_parser_and_node: bool - Whether the function need parser and node in its arguments + category: Category + The category of registered function + with_var: bool, optional + Whether the with scope handler neeeds optional vars """ - full_arg_spec = inspect.getfullargspec(origin_func) args, defaults = full_arg_spec.args, full_arg_spec.defaults if defaults is None: defaults = tuple() - if need_parser_and_node: + + if category == Category.WITH_SCOPE: + if not with_var: + if len(args) < 3 or args[0] != "parser" or args[1] != "node" or args[2] != "body": + raise RuntimeError( + "TVM Hybrid Script register error : the first three arguments of " + "this with scope handler must be parser, node, body" + ) + args = args[3:] + else: + if len(args) < 2 or args[0] != "parser" or args[1] != "node": + raise RuntimeError( + "TVM Hybrid Script register error : the first two arguments of " + "this with scope handler must be parser, node" + ) + args = args[2:] + elif category == Category.FOR_SCOPE: + if ( + len(args) < 4 + or args[0] != "parser" + or args[1] != "node" + or args[2] != "body" + or args[3] != "loop_vars" + ): + raise RuntimeError( + "TVM Hybrid Script register error : the first three arguments of for scope handler" + "must be parser, node, body, loop_vars" + ) + args = args[4:] + elif category == Category.SPECIAL_STMT: + if len(args) < 2 or args[0] != "parser" or args[1] != "node": + raise RuntimeError( + "TVM Hybrid Script register error : the first three arguments of special stmt" + "must be parser, node" + ) args = args[2:] - if full_arg_spec.varargs is not None: - raise RuntimeError( - "TVM Hybrid Script register error : variable argument is not supported now" - ) if full_arg_spec.varkw is not None: raise RuntimeError( "TVM Hybrid Script register error : variable keyword argument is not supported now" @@ -142,95 +258,111 @@ def get_arg_list(origin_func, need_parser_and_node): "TVM Hybrid Script register error : keyword only argument is not supported now" ) - arg_list = list() + pos_only = list() for arg in args[: len(args) - len(defaults)]: - arg_list.append((arg,)) + pos_only.append(arg) + kwargs = list() for default, arg in zip(defaults, args[len(args) - len(defaults) :]): - arg_list.append((arg, default)) + kwargs.append((arg, default)) - return arg_list + return pos_only, kwargs, full_arg_spec.varargs -def register_intrin(origin_func): +def register_intrin(name=None): """Decorator to register function under category intrin - + Parameters + ---------- + name: str, optional + registered name for the function Example ------ .. code-block:: python - @register_intrin def broadcast(value, lanes): lanes = lanes.value if not isinstance(lanes, int) else lanes return tvm.tir.Broadcast(value, lanes) """ - func_name = origin_func.__qualname__ - Registry.intrin[func_name] = func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, False), - need_parser_and_node=False, - need_body=False, - concise=False, - ) - return origin_func - - -def register_with_scope(concise=False): - """Decorator to register function under with scope handler + def decorate(origin_func): + func_name = "tir." + origin_func.__qualname__ if name is None else name + Registry.functions[func_name] = ( + func_wrapper( + func_name, origin_func, get_arg_list(origin_func, Category.INTRIN), Category.INTRIN + ), + Category.INTRIN, + ) + return origin_func + + return decorate + + +def register_with_scope(concise=False, with_var=False, name=None): + """Decorator to register function under with scope handler Parameters ---------- - concise: bool - whether this scope handler is allowed in concise scoping - + concise: bool, optional + whether this with scope handler is allowed in concise scoping + with_var: bool, optional + whether this with scope handler neeeds optional vars + name: str, optional + registered name for the function Example ------ .. code-block:: python - @register_scope_handler(concise=True) def attr(parser, node, attr_node, attr_key, value, body): - return tvm.tir.AttrStmt(attr_node, attr_key, tvm.runtime.convert(value), body) """ def decorate(origin_func): """Register function under category with_scope""" - func_name = origin_func.__qualname__ - Registry.with_scope[func_name] = func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, True), - need_parser_and_node=True, - need_body=True, - concise=concise, + func_name = "tir." + origin_func.__qualname__ if name is None else name + Registry.functions[func_name] = ( + func_wrapper( + func_name, + origin_func, + get_arg_list(origin_func, Category.WITH_SCOPE, with_var), + Category.WITH_SCOPE, + concise=concise, + with_var=with_var, + ), + Category.WITH_SCOPE, ) return origin_func return decorate -def register_for_scope(): - """Decorator to register function under for scope handler""" +def register_for_scope(name=None): + """Decorator to register function under for scope handler + Parameters + ---------- + name: str, optional + registered name for the function + """ def decorate(origin_func): - """Register function under category for_scope""" - func_name = origin_func.__qualname__ - Registry.for_scope[func_name] = func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, True), - need_parser_and_node=True, - need_body=True, - concise=False, + func_name = "tir." + origin_func.__qualname__ if name is None else name + Registry.functions[func_name] = ( + func_wrapper( + func_name, + origin_func, + get_arg_list(origin_func, Category.FOR_SCOPE), + Category.FOR_SCOPE, + ), + Category.FOR_SCOPE, ) return origin_func return decorate -def register_special_stmt(origin_func): +def register_special_stmt(name=None): """Decorator to register function under category special_stmt - + Parameters + ---------- + name: str, optional + registered name for the function Example ------- @register_special_stmt @@ -238,19 +370,22 @@ def buffer_decl(parser, node, shape, dtype="float32", data=None, strides=[], ele scope="global", align=-1, offset_factor=0, buffer_type="default"): align = align.value if not isinstance(align, int) else align offset_factor = offset_factor.value if not isinstance(offset_factor, int) else offset_factor - buffer = tvm.tir.decl_buffer(shape, dtype, parser._assign_target, data, strides, + buffer = tvm.tir.decl_buffer(shape, dtype, parser.assign_target, data, strides, elem_offset, scope, align, offset_factor, buffer_type) return buffer - """ - func_name = origin_func.__qualname__ - Registry.special_stmt[func_name] = func_wrapper( - func_name, - origin_func, - get_arg_list(origin_func, True), - need_parser_and_node=True, - need_body=False, - concise=False, - ) - return origin_func + def decorate(origin_func): + func_name = "tir." + origin_func.__qualname__ if name is None else name + Registry.functions[func_name] = ( + func_wrapper( + func_name, + origin_func, + get_arg_list(origin_func, Category.SPECIAL_STMT), + Category.SPECIAL_STMT, + ), + Category.SPECIAL_STMT, + ) + return origin_func + + return decorate diff --git a/python/tvm/hybrid/scope_handler.py b/python/tvm/hybrid/scope_handler.py index 3b1b7a2c5987..126a3dc2432e 100644 --- a/python/tvm/hybrid/scope_handler.py +++ b/python/tvm/hybrid/scope_handler.py @@ -15,75 +15,181 @@ # specific language governing permissions and limitations # under the License. """Hybrid Script Parser Scope Handler Functions - This module provides the functions registered into parser under with_scope or for_scope category. Scope handler nodes are StmtNodes with body, which are used to handle such scenarios. - +1. For scope handler +When registering a for scope handler, the first 4 arguments must be parser, node, body, loop_vars +and these arguments will provided by Hybrid Script parser automatically .. code-block:: python - - for x in tir.name(): - with tir.name(): - tir.name() # with scope handlers + concise scoping - + for loop_vars in tir.xxx(): +2. With scope handler +There are 4 subtypes of with scope handlers, classified by + 1) with or without as + 2) allow concise scoping or not +1) with as & concise +the first 2 arguments must be parser, node +Need to parse the body manually +Example : tir.alloc_with_scope +.. code-block:: python + target = tir.xxx() + with tir.xxx() as target: +2) with as & not concise +the first 2 arguments must be parser, node +Need to parse the body manually +Example : None atm +.. code-block:: python + with tir.xxx() as target: +3) without as & concise +the first 3 arguments must be parser, node, body +Hybrid Script parser will parse the body automatically +Example : tir.allocate()/tir.realize()/tir.attr() +.. code-block:: python + tir.xxx() + with tir.xxx(): +4) without as & not concise +the first 3 arguments must be parser, node, body +Hybrid Script parser will parse the body automatically +Example : tir.assert()/tir.let() +.. code-block:: python + with tir.xxx(): """ # pylint: disable=redefined-builtin, unused-argument, invalid-name + +from typed_ast import ast3 as ast import tvm.tir from .registry import register_with_scope, register_for_scope # With scope handler -@register_with_scope(concise=False) -def Assert(parser, node, condition, message, body): - """ With scope handler function assert(condition, message, body) """ - - return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), body) - +@register_with_scope(concise=True, with_var=True) +def allocate(parser, node, extents, dtype, scope, condition=True): + """ With scope handler function tir.alloc_with_scope(var, extents, dtype, scope, condition) """ + # defining buffer var and parse the body manually + + buffer_var = tvm.te.var(parser.target[0], "handle") + # (TODO) Uncomment this line if we have richer type info for buffer var + # buffer_var = tvm.te.var(parser.target[0], tvm.ir.PointerType(tvm.ir.PrimType(dtype))) + if isinstance(node, ast.With): + parser.scope_emitter.new_scope() + parser.scope_emitter.update_symbol(buffer_var.name, buffer_var) + parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) + body = parser.get_body() + parser.scope_emitter.pop_scope() + else: + parser.scope_emitter.update_symbol(buffer_var.name, buffer_var) + body = parser.get_body() + condition = tvm.runtime.convert(condition) + scope = tvm.runtime.convert(scope) + body = tvm.tir.Allocate(buffer_var, dtype, extents, condition, body) + return tvm.tir.AttrStmt(buffer_var, "storage_scope", scope, body) -@register_with_scope(concise=False) -def let(parser, node, var, value, body): - """ With scope handler function let(var, value, body) """ - return tvm.tir.LetStmt(var, value, body) +@register_with_scope(concise=True) +def launch_thread(parser, node, body, env_var, extent): + extent = tvm.runtime.convert(extent) + return tvm.tir.AttrStmt( + tvm.tir.IterVar( + None, env_var, getattr(tvm.tir.IterVar, "ThreadIndex"), parser.var_env_dict[env_var] + ), + "thread_extent", + extent, + body, + ) @register_with_scope(concise=True) -def realize(parser, node, buffer_bounds, body, condition=True): - """ With scope handler function realize(buffer_bounds, condition, body) """ - +def realize(parser, node, body, buffer_bounds, scope, condition=True): + """ With scope handler function tir.realize(buffer_bounds, scope, condition) """ buffer, bounds = buffer_bounds - return tvm.tir.BufferRealize(buffer, bounds, condition, body) + scope = tvm.runtime.convert(scope) + return tvm.tir.AttrStmt( + buffer, "realize_scope", scope, tvm.tir.BufferRealize(buffer, bounds, condition, body) + ) @register_with_scope(concise=True) -def attr(parser, node, attr_node, attr_key, value, body): - """ With scope handler function attr(attr_node, attr_key, value, body) """ +def attr(parser, node, body, attr_node, attr_key, value): + """ With scope handler function tir.attr(attr_node, attr_key, value) """ + attr_node = tvm.runtime.convert(attr_node) + value = tvm.runtime.convert(value) + return tvm.tir.AttrStmt(attr_node, attr_key, value, body) - return tvm.tir.AttrStmt(attr_node, attr_key, tvm.runtime.convert(value), body) +@register_with_scope(concise=False) +def Assert(parser, node, body, condition, message): + """ With scope handler function tir.Assert(condition, message) """ + return tvm.tir.AssertStmt(condition, tvm.runtime.convert(message), body) -@register_with_scope(concise=True) -def allocate(parser, node, buffer_var, dtype, extents, body, condition=True): - """ With scope handler function allocate(buffer_var, dtype, extents, condition, body) """ - return tvm.tir.Allocate(buffer_var, dtype, extents, tvm.runtime.convert(condition), body) +@register_with_scope(concise=False) +def let(parser, node, body, var, value): + """ With scope handler function tir.let(var, value) """ + return tvm.tir.LetStmt(var, value, body) # For scope handler @register_for_scope() -def range(parser, node, begin, end, for_type="serial"): +def serial(parser, node, body, loop_vars, begin, end): + """ For scope handler function tir.serial(begin, end)""" + if len(loop_vars) != 1: + parser.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(loop_vars[0], begin, extent, 0, 0, body) + + +@register_for_scope() +def parallel(parser, node, body, loop_vars, begin, end): + """ For scope handler function tir.parallel(begin, end)""" + if len(loop_vars) != 1: + parser.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(loop_vars[0], begin, extent, 1, 0, body) + + +@register_for_scope() +def vectorized(parser, node, body, loop_vars, begin, end): + """ For scope handler function tir.vectorized(begin, end)""" + if len(loop_vars) != 1: + parser.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(loop_vars[0], begin, extent, 2, 0, body) + + +@register_for_scope() +def unroll(parser, node, body, loop_vars, begin, end): + """ For scope handler function tir.unroll(begin, end)""" + if len(loop_vars) != 1: + parser.report_error("Expect exact 1 loop var") + ana = tvm.arith.Analyzer() + extent = end if begin == 0 else ana.simplify(end - begin) + return tvm.tir.For(loop_vars[0], begin, extent, 3, 0, body) + + +@register_for_scope(name="range") +def Range(parser, node, body, loop_vars, begin, end, annotation=None): """ For scope handler function range(begin, end, annotation)""" + if len(loop_vars) != 1: + parser.report_error("Expect exact 1 loop var") ana = tvm.arith.Analyzer() extent = end if begin == 0 else ana.simplify(end - begin) - loop_var_name = node.target.id - loop_var = tvm.te.var(loop_var_name, dtype="int32") - - parser.scope_emitter.new_scope() - parser.scope_emitter.update_symbol(loop_var_name, loop_var) - parser.scope_emitter.node_stack[-1].extend(reversed(node.body)) - body = parser.get_body() - parser.scope_emitter.pop_scope() - - for_type_dict = {"serial": 0, "parallel": 1, "vectorized": 2, "unroll": 3} - if for_type not in for_type_dict: - parser.report_error("unknown for type " + for_type) - return tvm.tir.For(loop_var, begin, extent, for_type_dict[for_type], 0, body) + if annotation is None: + annotation = [] + else: + annotation = [ + tvm.tir.Annotation(key, tvm.runtime.convert(val) if isinstance(val, str) else val) + for key, val in annotation.items() + ] + return tvm.tir.Loop(loop_vars[0], begin, extent, annotation, body) + + +@register_for_scope() +def grid(parser, node, body, loop_vars, *extents): + """ For scope handler function tir.grid(*extents) """ + if len(loop_vars) != len(extents): + parser.report_error("Inconsitent number of loop vars and extents") + for loop_var, extent in zip(reversed(loop_vars), reversed(extents)): + body = tvm.tir.Loop(loop_var, 0, extent, [], body) + return body diff --git a/python/tvm/hybrid/special_stmt.py b/python/tvm/hybrid/special_stmt.py index 129354db1443..f080071666f2 100644 --- a/python/tvm/hybrid/special_stmt.py +++ b/python/tvm/hybrid/special_stmt.py @@ -15,27 +15,24 @@ # specific language governing permissions and limitations # under the License. """Hybrid Script Parser Special Stmt Functions - This module provides the functions registered into parser under special_stmt category. special_stmt functions don't correspond to an IRNode in the AST directly. It is usually used for some information that is not suitable to be printed directly. - special_stmt can appear as 2 formats - .. code-block:: python - target = tir.name(): tir.name() - +When registering a special stmt, the first two arguments must be parser, node """ -# pylint: disable=unused-argument +# pylint: disable=unused-argument, no-self-argument, inconsistent-return-statements + import tvm.tir from tvm import te from .registry import register_special_stmt -@register_special_stmt -def buffer_bind( +@register_special_stmt() +def match_buffer( parser, node, param, @@ -49,15 +46,12 @@ def buffer_bind( offset_factor=0, buffer_type="default", ): - """Special function buffer_bind(var, shape, dtype, data, strides, elem_offset, scope, align, - offset_factor, buffer_type) - + """Special function match_buffer(var, shape, dtype, data, strides, elem_offset, scope, align, + offset_factor, buffer_type) Example ------- .. code-block:: python - - A = tir.buffer_bind(a, (128, 128), dtype="float32") - + A = tir.match_buffer(a, (128, 128), dtype="float32") """ if param not in parser.params: @@ -69,7 +63,7 @@ def buffer_bind( buffer = tvm.tir.decl_buffer( shape, dtype, - parser._assign_target, + parser.target[0], data, strides, elem_offset, @@ -82,7 +76,7 @@ def buffer_bind( return buffer -@register_special_stmt +@register_special_stmt() def buffer_decl( parser, node, @@ -98,14 +92,12 @@ def buffer_decl( ): """Special function buffer_decl(shape, dtype, data, strides, elem_offset, scope, align, offset_factor, buffer_type) - Example ------- .. code-block:: python - A = tir.buffer_decl((128, 128), dtype="float32") - """ + if strides is None: strides = [] align = align.value if not isinstance(align, int) else align @@ -113,7 +105,7 @@ def buffer_decl( buffer = tvm.tir.decl_buffer( shape, dtype, - parser._assign_target, + parser.target[0], data, strides, elem_offset, @@ -125,20 +117,26 @@ def buffer_decl( return buffer -@register_special_stmt +@register_special_stmt() def var(parser, node, dtype): """ Special function for defining a Var""" - return te.var(parser._assign_target, dtype) + return te.var(parser.target[0], dtype) -@register_special_stmt +@register_special_stmt() +def env_thread(parser, node, env_name): + """ Bind a var to thread env """ + v = te.var(parser.target[0]) + parser.var_env_dict[v] = env_name + return v + + +@register_special_stmt() def func_attr(parser, node, dict_attr): """Special function for declaring the DictAttr of PrimFunc - Example ------- .. code-block:: python - tir.func_attr({"tir.noalias": True, "global_symbol"}) """ diff --git a/python/tvm/hybrid/ty.py b/python/tvm/hybrid/ty.py index a3319474825a..c309fbe9104e 100644 --- a/python/tvm/hybrid/ty.py +++ b/python/tvm/hybrid/ty.py @@ -37,7 +37,7 @@ def __init__(self, vtype): self.type = vtype def evaluate(self): - return self.type + return tvm.ir.PrimType(self.type) class GenericPtrType(TypeGeneric): @@ -61,7 +61,7 @@ def __getitem__(self, vtypes): return ConcreteType(tvm.ir.TupleType([vtype.evaluate() for vtype in vtypes])) -int32 = ConcreteType(tvm.ir.PrimType("int32")) -handle = ConcreteType(tvm.ir.PrimType("handle")) +int32 = ConcreteType("int32") +handle = ConcreteType("handle") Ptr = GenericPtrType() Tuple = GenericTupleType() diff --git a/src/printer/tir_hybrid_printer.cc b/src/printer/tir_hybrid_printer.cc index 8f6b37a2bbc4..b58e5fc2e9a0 100644 --- a/src/printer/tir_hybrid_printer.cc +++ b/src/printer/tir_hybrid_printer.cc @@ -19,10 +19,9 @@ /*! * \file printer/tir_hybrid_printer.cc - * \brief Printer class to print Te IR to python syntax script + * \brief Printer class to print Tensor IR to python syntax script */ -#include #include #include #include @@ -34,6 +33,7 @@ #include #include +#include #include "doc.h" #include "meta_data.h" @@ -48,7 +48,7 @@ class TIRHybridPrinter : public StmtFunctor, public: explicit TIRHybridPrinter(bool show_meta, runtime::TypedPackedFunc annotate = nullptr) - : show_meta_(show_meta), annotate_(annotate), meta_collector_(&meta_) {} + : show_meta_(show_meta), annotate_(std::move(annotate)), meta_collector_(&meta_) {} /*! \brief Print the node */ TVM_DLL Doc Print(const ObjectRef& node); @@ -68,6 +68,8 @@ class TIRHybridPrinter : public StmtFunctor, std::unordered_set var_not_in_headers; /*! \brief buffer collector (buffer defined in BufferMap and BufferAllocation)*/ std::unordered_set buf_not_in_headers; + /*! \breif Map from Var to thread env name */ + std::unordered_map var_env_map_; /*! \brief Map from Var to Doc */ std::unordered_map memo_var_; /*! \brief Map from Buffer to Doc */ @@ -356,7 +358,11 @@ Doc TIRHybridPrinter::VisitExpr_(const StringImmNode* op) { return Doc::StrLiter Doc TIRHybridPrinter::VisitExpr_(const CastNode* op) { Doc doc; - doc << "tir.cast(" << PrintDType(op->dtype) << ", " << Print(op->value) << ")"; + if (cast(op->dtype, op->value)->IsInstance()) { + doc << Print(op->value) << ".astype(" << PrintDType(op->dtype) << ")"; + } else { + doc << "tir.cast(" << Print(op->value) << ", " << PrintDType(op->dtype) << ")"; + } return doc; } @@ -509,6 +515,70 @@ Doc TIRHybridPrinter::VisitStmt_(const LetStmtNode* op) { Doc TIRHybridPrinter::VisitStmt_(const AttrStmtNode* op) { Doc doc; + // merge attr with allocate when possible + if (op->node->IsInstance() && op->attr_key == "storage_scope" && + op->body->IsInstance()) { + const auto* alloc = Downcast(op->body).get(); + if (alloc->buffer_var.same_as(op->node)) { + var_not_in_headers.insert(alloc->buffer_var.get()); + if (current_num_ != num_child_ - 1) { + doc << "with tir.allocate(" << Print(alloc->extents) << ", " << PrintDType(alloc->dtype) + << ", " << Print(op->value); + if (!is_one(alloc->condition)) { + doc << ", " << Print(alloc->condition); + } + doc << ") as " << Print(op->node) << ":"; + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(alloc->body)); + } else { + doc << Print(op->node) << " = tir.allocate(" << Print(alloc->extents) << ", " + << PrintDType(alloc->dtype) << ", " << Print(op->value); + if (!is_one(alloc->condition)) { + doc << ", " << Print(alloc->condition); + } + doc << ")" << Doc::NewLine() << PrintBody(alloc->body); + } + return doc; + } + } + // merge attr with realize when possible + if (op->node->IsInstance() && op->attr_key == "realize_scope" && + op->body->IsInstance()) { + const auto* realize = Downcast(op->body).get(); + if (realize->buffer.same_as(op->node)) { + if (current_num_ != num_child_ - 1) { + doc << "with tir.realize(" << Print(realize->buffer) << Print(realize->bounds) << ", " + << Print(op->value); + if (!is_one(realize->condition)) { + doc << ", " << Print(realize->condition); + } + doc << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(realize->body)); + } else { + doc << "tir.realize(" << Print(realize->buffer) << Print(realize->bounds) << ", " + << Print(op->value); + if (!is_one(realize->condition)) { + doc << ", " << Print(realize->condition); + } + doc << ")" << Doc::NewLine() << PrintBody(realize->body); + } + return doc; + } + } + // concise thread env + if (op->node->IsInstance() && op->attr_key == "thread_extent") { + const auto* iter_var = Downcast(op->node).get(); + CHECK(!iter_var->dom.defined()); + var_not_in_headers.insert(iter_var->var.get()); + var_env_map_[iter_var->var] = iter_var->thread_tag; + if (current_num_ != num_child_ - 1) { + doc << "with tir.launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) << "):"; + doc << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + } else { + doc << "tir.launch_thread(" << Print(iter_var->var) << ", " << Print(op->value) << ")"; + doc << Doc::NewLine() << PrintBody(op->body); + } + return doc; + } + // default if (current_num_ != num_child_ - 1) { doc << "with tir.attr(" << Print(op->node) << ", " << Doc::StrLiteral(op->attr_key) << ", " << Print(op->value) << "):"; @@ -545,35 +615,13 @@ Doc TIRHybridPrinter::VisitStmt_(const StoreNode* op) { } Doc TIRHybridPrinter::VisitStmt_(const BufferRealizeNode* op) { - Doc doc; - if (current_num_ != num_child_ - 1) { - doc << "with tir.realize(" << Print(op->buffer) << Print(op->bounds); - if (!is_one(op->condition)) { - doc << ", " << Print(op->condition); - } - doc << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); - } else { - doc << "tir.realize(" << Print(op->buffer) << Print(op->bounds); - if (!is_one(op->condition)) { - doc << ", " << Print(op->condition); - } - doc << ")" << Doc::NewLine() << PrintBody(op->body); - } - return doc; + LOG(FATAL) << "Hybrid Printer Internal Error: All the BufferRealize should be folded with Attr"; + return Doc(); } Doc TIRHybridPrinter::VisitStmt_(const AllocateNode* op) { - Doc doc; - if (current_num_ != num_child_ - 1) { - doc << "with tir.allocate(" << Print(op->buffer_var) << ", " << PrintDType(op->dtype) << ", " - << Print(op->extents) << "):"; - doc << Doc::Indent(4, PrintBody(op->body)); - } else { - doc << "tir.allocate(" << Print(op->buffer_var) << ", " << PrintDType(op->dtype) << ", " - << Print(op->extents) << ")"; - doc << Doc::NewLine() << PrintBody(op->body); - } - return doc; + LOG(FATAL) << "Hybrid Printer Internal Error: All the Allocate should be folded with Attr"; + return Doc(); } Doc TIRHybridPrinter::VisitStmt_(const IfThenElseNode* op) { @@ -618,12 +666,10 @@ inline const char* ForType2String(ForType t) { Doc TIRHybridPrinter::VisitStmt_(const ForNode* op) { Doc doc; var_not_in_headers.insert(op->loop_var.get()); - doc << "for " << Print(op->loop_var) << " in tir.range(" << Print(op->min) << ", " - << Print(op->min + op->extent); - if (op->for_type != ForType::Serial) { - doc << ", " << Doc::StrLiteral(ForType2String(op->for_type)); - } - doc << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); + doc << "for " << Print(op->loop_var) + << " in tir." + std::string(ForType2String(op->for_type)) + "(" << Print(op->min) << ", " + << Print(op->min + op->extent) + << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); return doc; } @@ -734,7 +780,7 @@ Doc TIRHybridPrinter::PrintPrimFunc(const PrimFunc& primFunc) { // print buffer_bind for (const auto& it : op->buffer_map) { buf_not_in_headers.insert(it.second.get()); - body << Print(it.second) << " = tir.buffer_bind("; + body << Print(it.second) << " = tir.match_buffer("; body << Print(it.first) << ", " << memo_buf_decl_[it.second]; body << ")" << Doc::NewLine(); } @@ -785,8 +831,14 @@ Doc TIRHybridPrinter::PrintPrimFunc(const PrimFunc& primFunc) { vars.push_back(it.first.get()); } } - if (!vars.empty()) { + if (!var_env_map_.empty()) { header_var << Doc::NewLine() << "# var definition"; + for (const auto& it : var_env_map_) { + header_var << Doc::NewLine() << Print(it.first) << " = tir.env_thread(" + << Doc::StrLiteral(it.second) << ")"; + } + } + if (!vars.empty()) { std::sort(vars.begin(), vars.end(), [&](const VarNode* a, const VarNode* b) { return memo_var_[GetRef(a)].str() < memo_var_[GetRef(b)].str(); }); @@ -834,7 +886,7 @@ Doc TIRHybridPrinter::PrintBuffer(const BufferNode* op) { return meta_.InMeta(buffer) ? meta_.GetMetaNode(buffer) : AllocBuf(buffer); } -TVM_REGISTER_GLOBAL("tir.hybrid.AsHybrid") +TVM_REGISTER_GLOBAL("hybrid.AsHybrid") .set_body_typed([](const ObjectRef& functions, bool show_meta) { CHECK(functions.as() != nullptr || functions.as() != nullptr); diff --git a/tests/python/unittest/test_hybrid_error_report.py b/tests/python/unittest/test_hybrid_error_report.py index 39b8bfc0251b..9b3c5bb663eb 100644 --- a/tests/python/unittest/test_hybrid_error_report.py +++ b/tests/python/unittest/test_hybrid_error_report.py @@ -26,30 +26,30 @@ @tvm.hybrid.script class Module1: def buffer_bind_missing_args(a: ty.handle) -> None: - A = tir.buffer_bind((16, 16), "float32") + A = tir.match_buffer((16, 16), "float32") @tvm.hybrid.script class Module2: def range_missing_args(a: ty.handle) -> None: - A = tir.buffer_bind(a, (16, 16), "float32") + A = tir.match_buffer(a, (16, 16), "float32") tir.attr(A, "realize_scope", "") tir.realize(A[0:16, 0:16]) - for i in tir.range(16): - for j in tir.range(0, 16): + for i in tir.serial(16): + for j in tir.serial(0, 16): A[i, j] = 0.0 @tvm.hybrid.script class Module3: def undefined_buffer(a: ty.handle) -> None: - A = tir.buffer_bind(a, (16, 16), "float32") + A = tir.match_buffer(a, (16, 16), "float32") tir.attr(A, "realize_scope", "") tir.realize(C[0:16, 0:16]) - for i in tir.range(16): - for j in tir.range(0, 16): + for i in tir.serial(16): + for j in tir.serial(0, 16): A[i, j] = 0.0 @@ -63,12 +63,12 @@ def unsupported_stmt(a: ty.int32) -> None: @tvm.hybrid.script class Module5: def unsupported_function_call(a: ty.handle) -> None: - A = tir.buffer_bind(a, (16, 16), "float32") + A = tir.match_buffer(a, (16, 16), "float32") tir.attr(A, "realize_scope", "") tir.realize(A[0:16, 0:16]) for i in tir.const_range(16): - for j in tir.range(0, 16): + for j in tir.serial(0, 16): A[i, j] = 0.0 @@ -85,6 +85,31 @@ def invalid_concise_scoping() -> None: tir.evaluate(0.0) +@tvm.hybrid.script +class Module8: + def invalid_expr_stmt() -> None: + tir.max(1, 2) + + +@tvm.hybrid.script +class Module9: + def invalid_for_function(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + + for i in tir.evaluate(0.0): + for j in tir.serial(0, 16): + A[i, j] = 0.0 + + +@tvm.hybrid.script +class Module10: + def invalid_block_function(a: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16), "float32") + + with tir.evaluate(0.0): + pass + + def wrap_error(module, lineno): with pytest.raises(HybridParserError) as error: mod = module() @@ -103,3 +128,6 @@ def wrap_error(module, lineno): wrap_error(Module5, 70) wrap_error(Module6, 77) wrap_error(Module7, 84) + wrap_error(Module8, 91) + wrap_error(Module9, 99) + wrap_error(Module10, 109) diff --git a/tests/python/unittest/test_hybrid_roundtrip.py b/tests/python/unittest/test_hybrid_roundtrip.py index 90d76a2b9875..ea67a4e6f549 100644 --- a/tests/python/unittest/test_hybrid_roundtrip.py +++ b/tests/python/unittest/test_hybrid_roundtrip.py @@ -28,36 +28,34 @@ def mmult(A: ty.handle, B: ty.handle, C: ty.handle) -> None: # buffer definition C_global = tir.buffer_decl([1024, 1024], elem_offset=0, align=128, offset_factor=1) packedB = tir.buffer_decl([32, 1024, 32], elem_offset=0, align=128, offset_factor=1) - A_1 = tir.buffer_bind(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = tir.buffer_bind(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = tir.buffer_bind(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + A_1 = tir.match_buffer(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + B_1 = tir.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + C_1 = tir.match_buffer(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) # body - tir.attr(packedB, "realize_scope", "") - tir.realize(packedB[0:32, 0:1024, 0:32]) - for x in tir.range(0, 32, "parallel"): - for y in tir.range(0, 1024): - for z in tir.range(0, 32, "vectorized"): + tir.realize(packedB[0:32, 0:1024, 0:32], "") + for x in tir.parallel(0, 32): + for y in tir.serial(0, 1024): + for z in tir.vectorized(0, 32): packedB[x, y, z] = B_1[y, ((x * 32) + z)] - tir.attr(C_1, "realize_scope", "") - tir.realize(C_1[0:1024, 0:1024]) - for x_outer in tir.range(0, 32, "parallel"): - for y_outer in tir.range(0, 32): - tir.attr(C_global, "realize_scope", "global") + tir.realize(C_1[0:1024, 0:1024], "") + for x_outer in tir.parallel(0, 32): + for y_outer in tir.serial(0, 32): tir.realize( C_global[ (x_outer * 32) : ((x_outer * 32) + 32), (y_outer * 32) : ((y_outer * 32) + 32), - ] + ], + "global", ) - for x_c_init in tir.range(0, 32): - for y_c_init in tir.range(0, 32, "vectorized"): + for x_c_init in tir.serial(0, 32): + for y_c_init in tir.vectorized(0, 32): C_global[ (x_c_init + (x_outer * 32)), (y_c_init + (y_outer * 32)) ] = tir.float32(0) - for k_outer in tir.range(0, 256): - for x_c in tir.range(0, 32): - for k_inner in tir.range(0, 4, "unroll"): - for y_c in tir.range(0, 32, "vectorized"): + for k_outer in tir.serial(0, 256): + for x_c in tir.serial(0, 32): + for k_inner in tir.unroll(0, 4): + for y_c in tir.vectorized(0, 32): C_global[(x_c + (x_outer * 32)), (y_c + (y_outer * 32))] = C_global[ (x_c + (x_outer * 32)), (y_c + (y_outer * 32)) ] + ( @@ -68,8 +66,8 @@ def mmult(A: ty.handle, B: ty.handle, C: ty.handle) -> None: tir.floormod((y_c + (y_outer * 32)), 32), ] ) - for x_inner in tir.range(0, 32): - for y_inner in tir.range(0, 32): + for x_inner in tir.serial(0, 32): + for y_inner in tir.serial(0, 32): C_1[(x_inner + (x_outer * 32)), (y_inner + (y_outer * 32))] = C_global[ (x_inner + (x_outer * 32)), (y_inner + (y_outer * 32)) ] @@ -86,19 +84,13 @@ class Module2: def mmult(A: ty.handle, B: ty.handle, C: ty.handle) -> None: # function attr dict tir.func_attr({"global_symbol": "mmult", "tir.noalias": True}) - # var definition - C_global = tir.var("handle") - packedB = tir.var("handle") - A_1 = tir.buffer_bind(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = tir.buffer_bind(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = tir.buffer_bind(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + A_1 = tir.match_buffer(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + B_1 = tir.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + C_1 = tir.match_buffer(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) # body - tir.attr(packedB, "storage_scope", "global") - tir.allocate(packedB, "float32x32", [32768]) - tir.attr(C_global, "storage_scope", "global") - tir.allocate(C_global, "float32", [1024]) - for x in tir.range(0, 32, "parallel"): - for y in tir.range(0, 1024): + packedB = tir.allocate([32768], "float32x32", "global") + for x in tir.parallel(0, 32): + for y in tir.serial(0, 1024): tir.store( packedB, tir.ramp(((x * 32768) + (y * 32)), 1, 32), @@ -110,17 +102,18 @@ def mmult(A: ty.handle, B: ty.handle, C: ty.handle) -> None: ), tir.broadcast(True, 32), ) - for x_outer in tir.range(0, 32): - for y_outer in tir.range(0, 32): - for x_c_init in tir.range(0, 32): + for x_outer in tir.parallel(0, 32): + C_global = tir.allocate([1024], "float32", "global") + for y_outer in tir.serial(0, 32): + for x_c_init in tir.serial(0, 32): tir.store( C_global, tir.ramp((x_c_init * 32), 1, 32), tir.broadcast(tir.float32(0), 32), tir.broadcast(True, 32), ) - for k_outer in tir.range(0, 256): - for x_c in tir.range(0, 32): + for k_outer in tir.serial(0, 256): + for x_c in tir.serial(0, 32): tir.store( C_global, tir.ramp((x_c * 32), 1, 32), @@ -252,8 +245,8 @@ def mmult(A: ty.handle, B: ty.handle, C: ty.handle) -> None: ), tir.broadcast(True, 32), ) - for x_inner in tir.range(0, 32): - for y_inner in tir.range(0, 32): + for x_inner in tir.serial(0, 32): + for y_inner in tir.serial(0, 32): C_1.data[ ((((x_outer * 32768) + (x_inner * 1024)) + (y_outer * 32)) + y_inner) ] = tir.load("float32", C_global, ((x_inner * 32) + y_inner)) @@ -329,14 +322,14 @@ def mmult( tir.tvm_struct_get(arg0, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg0.dtype is expected to be float32" assert 1024 == tir.cast( - "int32", tir.load("int64", arg0_shape, 0) + tir.load("int64", arg0_shape, 0), "int32" ), "Argument arg0.shape[0] has an unsatisfied constraint" assert 1024 == tir.cast( - "int32", tir.load("int64", arg0_shape, 1) + tir.load("int64", arg0_shape, 1), "int32" ), "Argument arg0.shape[1] has an unsatisfied constraint" if not (tir.isnullptr(arg0_strides, dtype="bool")): - assert (1 == tir.cast("int32", tir.load("int64", arg0_strides, 1))) and ( - 1024 == tir.cast("int32", tir.load("int64", arg0_strides, 0)) + assert (1 == tir.cast(tir.load("int64", arg0_strides, 1), "int32")) and ( + 1024 == tir.cast(tir.load("int64", arg0_strides, 0), "int32") ), "arg0.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -358,14 +351,14 @@ def mmult( tir.tvm_struct_get(arg1, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg1.dtype is expected to be float32" assert 1024 == tir.cast( - "int32", tir.load("int64", arg1_shape, 0) + tir.load("int64", arg1_shape, 0), "int32" ), "Argument arg1.shape[0] has an unsatisfied constraint" assert 1024 == tir.cast( - "int32", tir.load("int64", arg1_shape, 1) + tir.load("int64", arg1_shape, 1), "int32" ), "Argument arg1.shape[1] has an unsatisfied constraint" if not (tir.isnullptr(arg1_strides, dtype="bool")): - assert (1 == tir.cast("int32", tir.load("int64", arg1_strides, 1))) and ( - 1024 == tir.cast("int32", tir.load("int64", arg1_strides, 0)) + assert (1 == tir.cast(tir.load("int64", arg1_strides, 1), "int32")) and ( + 1024 == tir.cast(tir.load("int64", arg1_strides, 0), "int32") ), "arg1.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -390,14 +383,14 @@ def mmult( tir.tvm_struct_get(arg2, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg2.dtype is expected to be float32" assert 1024 == tir.cast( - "int32", tir.load("int64", arg2_shape, 0) + tir.load("int64", arg2_shape, 0), "int32" ), "Argument arg2.shape[0] has an unsatisfied constraint" assert 1024 == tir.cast( - "int32", tir.load("int64", arg2_shape, 1) + tir.load("int64", arg2_shape, 1), "int32" ), "Argument arg2.shape[1] has an unsatisfied constraint" if not (tir.isnullptr(arg2_strides, dtype="bool")): - assert (1 == tir.cast("int32", tir.load("int64", arg2_strides, 1))) and ( - 1024 == tir.cast("int32", tir.load("int64", arg2_strides, 0)) + assert (1 == tir.cast(tir.load("int64", arg2_strides, 1), "int32")) and ( + 1024 == tir.cast(tir.load("int64", arg2_strides, 0), "int32") ), "arg2.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -418,8 +411,8 @@ def mmult( ): if tir.isnullptr(packedB, dtype="bool"): tir.evaluate(tir.tvm_throw_last_error(dtype="int32")) - for x in tir.range(0, 32, "parallel"): - for y in tir.range(0, 1024): + for x in tir.parallel(0, 32): + for y in tir.serial(0, 1024): tir.store( packedB, tir.ramp(((x * 32768) + (y * 32)), 1, 32), @@ -431,7 +424,7 @@ def mmult( ), tir.broadcast(True, 32), ) - for x_outer in tir.range(0, 32, "parallel"): + for x_outer in tir.parallel(0, 32): tir.attr(C_global, "storage_scope", "global") tir.attr(C_global, "storage_alignment", 128) with tir.let( @@ -442,16 +435,16 @@ def mmult( ): if tir.isnullptr(C_global, dtype="bool"): tir.evaluate(tir.tvm_throw_last_error(dtype="int32")) - for y_outer in tir.range(0, 32): - for x_c_init in tir.range(0, 32): + for y_outer in tir.serial(0, 32): + for x_c_init in tir.serial(0, 32): tir.store( C_global, tir.ramp((x_c_init * 32), 1, 32), tir.broadcast(tir.float32(0), 32), tir.broadcast(True, 32), ) - for k_outer in tir.range(0, 256): - for x_c in tir.range(0, 32): + for k_outer in tir.serial(0, 256): + for x_c in tir.serial(0, 32): tir.store( C_global, tir.ramp((x_c * 32), 1, 32), @@ -599,8 +592,8 @@ def mmult( ), tir.broadcast(True, 32), ) - for x_inner in tir.range(0, 32): - for y_inner in tir.range(0, 32): + for x_inner in tir.serial(0, 32): + for y_inner in tir.serial(0, 32): C[ ( (((x_outer * 32768) + (x_inner * 1024)) + (y_outer * 32)) @@ -624,12 +617,12 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - # function attr dict tir.func_attr({"global_symbol": "default_function", "tir.noalias": True}) # var definition - blockIdx_x = tir.var("int32") - blockIdx_y = tir.var("int32") - blockIdx_z = tir.var("int32") - threadIdx_x = tir.var("int32") - threadIdx_y = tir.var("int32") - threadIdx_z = tir.var("int32") + bx = tir.env_thread("blockIdx.x") + by = tir.env_thread("blockIdx.y") + bz = tir.env_thread("blockIdx.z") + tx = tir.env_thread("threadIdx.x") + ty = tir.env_thread("threadIdx.y") + tz = tir.env_thread("threadIdx.z") # buffer definition Apad_shared = tir.buffer_decl( [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 @@ -665,47 +658,46 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - ) buffer_4 = tir.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) buffer_5 = tir.buffer_decl([16, 16], align=32, offset_factor=256) - A_1 = tir.buffer_bind( + A_1 = tir.match_buffer( A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 ) - W_1 = tir.buffer_bind( + W_1 = tir.match_buffer( W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 ) - Conv_1 = tir.buffer_bind( + Conv_1 = tir.match_buffer( Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 ) # body - tir.attr(Conv_1, "realize_scope", "") - tir.realize(Conv_1[0:16, 0:14, 0:14, 0:32, 0:16, 0:16]) - tir.attr(tir.iter_var(blockIdx_z, None, "ThreadIndex", "blockIdx.z"), "thread_extent", 196) - tir.attr(tir.iter_var(blockIdx_x, None, "ThreadIndex", "blockIdx.x"), "thread_extent", 2) - tir.attr(tir.iter_var(blockIdx_y, None, "ThreadIndex", "blockIdx.y"), "thread_extent", 4) - tir.attr(tir.iter_var(threadIdx_y, None, "ThreadIndex", "threadIdx.y"), "thread_extent", 4) - tir.attr(tir.iter_var(threadIdx_z, None, "ThreadIndex", "threadIdx.z"), "thread_extent", 2) - tir.attr(Conv_wmma_accumulator, "realize_scope", "wmma.accumulator") + tir.realize(Conv_1[0:16, 0:14, 0:14, 0:32, 0:16, 0:16], "") + tir.launch_thread(bz, 196) + tir.launch_thread(bx, 2) + tir.launch_thread(by, 4) + tir.launch_thread(ty, 4) + tir.launch_thread(tz, 2) tir.realize( Conv_wmma_accumulator[ - ((blockIdx_x * 8) + (threadIdx_y * 2)) : (((blockIdx_x * 8) + (threadIdx_y * 2)) + 2), - tir.floordiv(blockIdx_z, 14) : (tir.floordiv(blockIdx_z, 14) + 1), - tir.floormod(blockIdx_z, 14) : (tir.floormod(blockIdx_z, 14) + 1), - ((blockIdx_y * 8) + (threadIdx_z * 4)) : (((blockIdx_y * 8) + (threadIdx_z * 4)) + 4), + ((bx * 8) + (ty * 2)) : (((bx * 8) + (ty * 2)) + 2), + tir.floordiv(bz, 14) : (tir.floordiv(bz, 14) + 1), + tir.floormod(bz, 14) : (tir.floormod(bz, 14) + 1), + ((by * 8) + (tz * 4)) : (((by * 8) + (tz * 4)) + 4), 0:16, 0:16, - ] + ], + "wmma.accumulator", ) - for n_c_init in tir.range(0, 2): - for o_c_init in tir.range(0, 4): + for n_c_init in tir.serial(0, 2): + for o_c_init in tir.serial(0, 4): tir.attr( [BC, Conv_wmma_accumulator], "buffer_bind_scope", tir.tvm_tuple( - (n_c_init + ((blockIdx_x * 8) + (threadIdx_y * 2))), + (n_c_init + ((bx * 8) + (ty * 2))), 1, - tir.floordiv(blockIdx_z, 14), + tir.floordiv(bz, 14), 1, - tir.floormod(blockIdx_z, 14), + tir.floormod(bz, 14), 1, - (o_c_init + ((blockIdx_y * 8) + (threadIdx_z * 4))), + (o_c_init + ((by * 8) + (tz * 4))), 1, 0, 16, @@ -725,119 +717,105 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - dtype="handle", ) ) - for ic_outer in tir.range(0, 8): - for kh in tir.range(0, 3): - tir.attr(Apad_shared, "realize_scope", "shared") + for ic_outer in tir.serial(0, 8): + for kh in tir.serial(0, 3): tir.realize( Apad_shared[ - (blockIdx_x * 8) : ((blockIdx_x * 8) + 8), - (tir.floordiv(blockIdx_z, 14) + kh) : ((tir.floordiv(blockIdx_z, 14) + kh) + 1), - tir.floormod(blockIdx_z, 14) : (tir.floormod(blockIdx_z, 14) + 3), + (bx * 8) : ((bx * 8) + 8), + (tir.floordiv(bz, 14) + kh) : ((tir.floordiv(bz, 14) + kh) + 1), + tir.floormod(bz, 14) : (tir.floormod(bz, 14) + 3), (ic_outer * 2) : ((ic_outer * 2) + 2), 0:16, 0:16, - ] + ], + "shared", ) - for ax2 in tir.range(0, 3): - for ax3 in tir.range(0, 2): - for ax4_ax5_fused_outer in tir.range(0, 8): - tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ) + for ax2 in tir.serial(0, 3): + for ax3 in tir.serial(0, 2): + for ax4_ax5_fused_outer in tir.serial(0, 8): + tir.launch_thread(tx, 32) Apad_shared[ - ((threadIdx_z + (threadIdx_y * 2)) + (blockIdx_x * 8)), - (tir.floordiv(blockIdx_z, 14) + kh), - (ax2 + tir.floormod(blockIdx_z, 14)), + ((tz + (ty * 2)) + (bx * 8)), + (tir.floordiv(bz, 14) + kh), + (ax2 + tir.floormod(bz, 14)), (ax3 + (ic_outer * 2)), - tir.floordiv((threadIdx_x + (ax4_ax5_fused_outer * 32)), 16), - tir.floormod((threadIdx_x + (ax4_ax5_fused_outer * 32)), 16), + tir.floordiv((tx + (ax4_ax5_fused_outer * 32)), 16), + tir.floormod((tx + (ax4_ax5_fused_outer * 32)), 16), ] = tir.if_then_else( ( ( ( - ((tir.floordiv(blockIdx_z, 14) + kh) >= 1) - and (((tir.floordiv(blockIdx_z, 14) + kh) - 1) < 14) + ((tir.floordiv(bz, 14) + kh) >= 1) + and (((tir.floordiv(bz, 14) + kh) - 1) < 14) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) >= 1) + and ((ax2 + tir.floormod(bz, 14)) >= 1) ) - and (((ax2 + tir.floormod(blockIdx_z, 14)) - 1) < 14) + and (((ax2 + tir.floormod(bz, 14)) - 1) < 14) ), A_1[ - ((threadIdx_z + (threadIdx_y * 2)) + (blockIdx_x * 8)), - ((tir.floordiv(blockIdx_z, 14) + kh) - 1), - ((ax2 + tir.floormod(blockIdx_z, 14)) - 1), + ((tz + (ty * 2)) + (bx * 8)), + ((tir.floordiv(bz, 14) + kh) - 1), + ((ax2 + tir.floormod(bz, 14)) - 1), (ax3 + (ic_outer * 2)), - tir.floordiv((threadIdx_x + (ax4_ax5_fused_outer * 32)), 16), - tir.floormod((threadIdx_x + (ax4_ax5_fused_outer * 32)), 16), + tir.floordiv((tx + (ax4_ax5_fused_outer * 32)), 16), + tir.floormod((tx + (ax4_ax5_fused_outer * 32)), 16), ], tir.float16(0), dtype="float16", ) - tir.attr(W_shared, "realize_scope", "shared") tir.realize( W_shared[ kh : (kh + 1), 0:3, (ic_outer * 2) : ((ic_outer * 2) + 2), - (blockIdx_y * 8) : ((blockIdx_y * 8) + 8), + (by * 8) : ((by * 8) + 8), 0:16, 0:16, - ] + ], + "shared", ) - for ax1 in tir.range(0, 3): - for ax2_1 in tir.range(0, 2): - tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ) - for ax4_ax5_fused_inner in tir.range(0, 8, "vectorized"): + for ax1 in tir.serial(0, 3): + for ax2_1 in tir.serial(0, 2): + tir.launch_thread(tx, 32) + for ax4_ax5_fused_inner in tir.vectorized(0, 8): W_shared[ kh, ax1, (ax2_1 + (ic_outer * 2)), - ((threadIdx_z + (threadIdx_y * 2)) + (blockIdx_y * 8)), - tir.floordiv((ax4_ax5_fused_inner + (threadIdx_x * 8)), 16), - tir.floormod((ax4_ax5_fused_inner + (threadIdx_x * 8)), 16), + ((tz + (ty * 2)) + (by * 8)), + tir.floordiv((ax4_ax5_fused_inner + (tx * 8)), 16), + tir.floormod((ax4_ax5_fused_inner + (tx * 8)), 16), ] = W_1[ kh, ax1, (ax2_1 + (ic_outer * 2)), - ((threadIdx_z + (threadIdx_y * 2)) + (blockIdx_y * 8)), - tir.floordiv((ax4_ax5_fused_inner + (threadIdx_x * 8)), 16), - tir.floormod((ax4_ax5_fused_inner + (threadIdx_x * 8)), 16), + ((tz + (ty * 2)) + (by * 8)), + tir.floordiv((ax4_ax5_fused_inner + (tx * 8)), 16), + tir.floormod((ax4_ax5_fused_inner + (tx * 8)), 16), ] - for ic_inner in tir.range(0, 2): - for kw in tir.range(0, 3): - tir.attr(Apad_shared_wmma_matrix_a, "realize_scope", "wmma.matrix_a") + for ic_inner in tir.serial(0, 2): + for kw in tir.serial(0, 3): tir.realize( Apad_shared_wmma_matrix_a[ - ((blockIdx_x * 8) + (threadIdx_y * 2)) : ( - ((blockIdx_x * 8) + (threadIdx_y * 2)) + 2 - ), - (tir.floordiv(blockIdx_z, 14) + kh) : ( - (tir.floordiv(blockIdx_z, 14) + kh) + 1 - ), - (kw + tir.floormod(blockIdx_z, 14)) : ( - (kw + tir.floormod(blockIdx_z, 14)) + 1 - ), + ((bx * 8) + (ty * 2)) : (((bx * 8) + (ty * 2)) + 2), + (tir.floordiv(bz, 14) + kh) : ((tir.floordiv(bz, 14) + kh) + 1), + (kw + tir.floormod(bz, 14)) : ((kw + tir.floormod(bz, 14)) + 1), ((ic_outer * 2) + ic_inner) : (((ic_outer * 2) + ic_inner) + 1), 0:16, 0:16, - ] + ], + "wmma.matrix_a", ) - for ax0 in tir.range(0, 2): + for ax0 in tir.serial(0, 2): tir.attr( [buffer, Apad_shared], "buffer_bind_scope", tir.tvm_tuple( - (ax0 + ((blockIdx_x * 8) + (threadIdx_y * 2))), + (ax0 + ((bx * 8) + (ty * 2))), 1, - (tir.floordiv(blockIdx_z, 14) + kh), + (tir.floordiv(bz, 14) + kh), 1, - (kw + tir.floormod(blockIdx_z, 14)), + (kw + tir.floormod(bz, 14)), 1, ((ic_outer * 2) + ic_inner), 1, @@ -852,11 +830,11 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - [buffer_1, Apad_shared_wmma_matrix_a], "buffer_bind_scope", tir.tvm_tuple( - (ax0 + ((blockIdx_x * 8) + (threadIdx_y * 2))), + (ax0 + ((bx * 8) + (ty * 2))), 1, - (tir.floordiv(blockIdx_z, 14) + kh), + (tir.floordiv(bz, 14) + kh), 1, - (kw + tir.floormod(blockIdx_z, 14)), + (kw + tir.floormod(bz, 14)), 1, ((ic_outer * 2) + ic_inner), 1, @@ -887,20 +865,18 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - dtype="handle", ) ) - tir.attr(W_shared_wmma_matrix_b, "realize_scope", "wmma.matrix_b") tir.realize( W_shared_wmma_matrix_b[ kh : (kh + 1), kw : (kw + 1), ((ic_outer * 2) + ic_inner) : (((ic_outer * 2) + ic_inner) + 1), - ((blockIdx_y * 8) + (threadIdx_z * 4)) : ( - ((blockIdx_y * 8) + (threadIdx_z * 4)) + 4 - ), + ((by * 8) + (tz * 4)) : (((by * 8) + (tz * 4)) + 4), 0:16, 0:16, - ] + ], + "wmma.matrix_b", ) - for ax3_1 in tir.range(0, 4): + for ax3_1 in tir.serial(0, 4): tir.attr( [buffer_2, W_shared], "buffer_bind_scope", @@ -911,7 +887,7 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - 1, ((ic_outer * 2) + ic_inner), 1, - (ax3_1 + ((blockIdx_y * 8) + (threadIdx_z * 4))), + (ax3_1 + ((by * 8) + (tz * 4))), 1, 0, 16, @@ -930,7 +906,7 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - 1, ((ic_outer * 2) + ic_inner), 1, - (ax3_1 + ((blockIdx_y * 8) + (threadIdx_z * 4))), + (ax3_1 + ((by * 8) + (tz * 4))), 1, 0, 16, @@ -959,17 +935,17 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - dtype="handle", ) ) - for n_c in tir.range(0, 2): - for o_c in tir.range(0, 4): + for n_c in tir.serial(0, 2): + for o_c in tir.serial(0, 4): tir.attr( [BA, Apad_shared_wmma_matrix_a], "buffer_bind_scope", tir.tvm_tuple( - (n_c + ((blockIdx_x * 8) + (threadIdx_y * 2))), + (n_c + ((bx * 8) + (ty * 2))), 1, - (tir.floordiv(blockIdx_z, 14) + kh), + (tir.floordiv(bz, 14) + kh), 1, - (tir.floormod(blockIdx_z, 14) + kw), + (tir.floormod(bz, 14) + kw), 1, ((ic_outer * 2) + ic_inner), 1, @@ -990,7 +966,7 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - 1, ((ic_outer * 2) + ic_inner), 1, - (o_c + ((blockIdx_y * 8) + (threadIdx_z * 4))), + (o_c + ((by * 8) + (tz * 4))), 1, 0, 16, @@ -1003,13 +979,13 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - [BC, Conv_wmma_accumulator], "buffer_bind_scope", tir.tvm_tuple( - (n_c + ((blockIdx_x * 8) + (threadIdx_y * 2))), + (n_c + ((bx * 8) + (ty * 2))), 1, - tir.floordiv(blockIdx_z, 14), + tir.floordiv(bz, 14), 1, - tir.floormod(blockIdx_z, 14), + tir.floormod(bz, 14), 1, - (o_c + ((blockIdx_y * 8) + (threadIdx_z * 4))), + (o_c + ((by * 8) + (tz * 4))), 1, 0, 16, @@ -1031,19 +1007,19 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - dtype="handle", ) ) - for n_inner in tir.range(0, 2): - for o_inner in tir.range(0, 4): + for n_inner in tir.serial(0, 2): + for o_inner in tir.serial(0, 4): tir.attr( [buffer_4, Conv_wmma_accumulator], "buffer_bind_scope", tir.tvm_tuple( - ((((blockIdx_x * 4) + threadIdx_y) * 2) + n_inner), + ((((bx * 4) + ty) * 2) + n_inner), 1, - tir.floordiv(blockIdx_z, 14), + tir.floordiv(bz, 14), 1, - tir.floormod(blockIdx_z, 14), + tir.floormod(bz, 14), 1, - ((((blockIdx_y * 2) + threadIdx_z) * 4) + o_inner), + ((((by * 2) + tz) * 4) + o_inner), 1, 0, 16, @@ -1056,13 +1032,13 @@ def opt_conv_tensorcore_normalize(A: ty.handle, W: ty.handle, Conv: ty.handle) - [buffer_5, Conv_1], "buffer_bind_scope", tir.tvm_tuple( - ((((blockIdx_x * 4) + threadIdx_y) * 2) + n_inner), + ((((bx * 4) + ty) * 2) + n_inner), 1, - tir.floordiv(blockIdx_z, 14), + tir.floordiv(bz, 14), 1, - tir.floormod(blockIdx_z, 14), + tir.floormod(bz, 14), 1, - ((((blockIdx_y * 2) + threadIdx_z) * 4) + o_inner), + ((((by * 2) + tz) * 4) + o_inner), 1, 0, 16, @@ -1103,43 +1079,32 @@ def test_opt_conv_tensorcore_normalize(): def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> None: # function attr dict tir.func_attr({"global_symbol": "default_function", "tir.noalias": True}) - # var definition - Apad_shared = tir.var("handle") - Apad_shared_wmma_matrix_a = tir.var("handle") - Conv_wmma_accumulator = tir.var("handle") - W_shared = tir.var("handle") - W_shared_wmma_matrix_b = tir.var("handle") - blockIdx_x = tir.var("int32") - blockIdx_y = tir.var("int32") - blockIdx_z = tir.var("int32") - threadIdx_x = tir.var("int32") - threadIdx_y = tir.var("int32") - threadIdx_z = tir.var("int32") - A_1 = tir.buffer_bind( + # body + A_1 = tir.match_buffer( A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 ) - W_1 = tir.buffer_bind( + W_1 = tir.match_buffer( W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 ) - Conv_1 = tir.buffer_bind( + Conv_1 = tir.match_buffer( Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 ) - # body - tir.attr(tir.iter_var(blockIdx_z, None, "ThreadIndex", "blockIdx.z"), "thread_extent", 196) - tir.attr(Conv_wmma_accumulator, "storage_scope", "wmma.accumulator") - tir.allocate(Conv_wmma_accumulator, "float32", [2048]) - tir.attr(Apad_shared, "storage_scope", "shared") - tir.allocate(Apad_shared, "float16", [12288]) - tir.attr(W_shared, "storage_scope", "shared") - tir.allocate(W_shared, "float16", [12288]) - tir.attr(Apad_shared_wmma_matrix_a, "storage_scope", "wmma.matrix_a") - tir.allocate(Apad_shared_wmma_matrix_a, "float16", [512]) - tir.attr(W_shared_wmma_matrix_b, "storage_scope", "wmma.matrix_b") - tir.allocate(W_shared_wmma_matrix_b, "float16", [1024]) - tir.attr(tir.iter_var(blockIdx_x, None, "ThreadIndex", "blockIdx.x"), "thread_extent", 2) - tir.attr(tir.iter_var(blockIdx_y, None, "ThreadIndex", "blockIdx.y"), "thread_extent", 4) - tir.attr(tir.iter_var(threadIdx_y, None, "ThreadIndex", "threadIdx.y"), "thread_extent", 4) - tir.attr(tir.iter_var(threadIdx_z, None, "ThreadIndex", "threadIdx.z"), "thread_extent", 2) + bx = tir.env_thread("blockIdx.x") + by = tir.env_thread("blockIdx.y") + bz = tir.env_thread("blockIdx.z") + tx = tir.env_thread("threadIdx.x") + ty = tir.env_thread("threadIdx.y") + tz = tir.env_thread("threadIdx.z") + tir.launch_thread(bz, 196) + Conv_wmma_accumulator = tir.allocate([2048], "float32", "wmma.accumulator") + Apad_shared = tir.allocate([12288], "float16", "shared") + W_shared = tir.allocate([12288], "float16", "shared") + Apad_shared_wmma_matrix_a = tir.allocate([512], "float16", "wmma.matrix_a") + W_shared_wmma_matrix_b = tir.allocate([1024], "float16", "wmma.matrix_b") + tir.launch_thread(bx, 2) + tir.launch_thread(by, 4) + tir.launch_thread(ty, 4) + tir.launch_thread(tz, 2) tir.evaluate( tir.tvm_fill_fragment(Conv_wmma_accumulator, 16, 16, 16, 0, tir.float32(0), dtype="handle") ) @@ -1164,29 +1129,22 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.evaluate( tir.tvm_fill_fragment(Conv_wmma_accumulator, 16, 16, 16, 7, tir.float32(0), dtype="handle") ) - for ic_outer in tir.range(0, 8): - for kh in tir.range(0, 3): - for ax2 in tir.range(0, 3): - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + for ic_outer in tir.serial(0, 8): + for kh in tir.serial(0, 3): + for ax2 in tir.serial(0, 3): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) + ((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1198,21 +1156,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61440 ), @@ -1220,29 +1175,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 32 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 32) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1254,21 +1199,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61408 ), @@ -1276,29 +1218,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 64 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 64) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1310,21 +1242,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61376 ), @@ -1332,29 +1261,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 96 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 96) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1366,21 +1285,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61344 ), @@ -1388,29 +1304,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 128 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 128) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1422,21 +1328,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61312 ), @@ -1444,29 +1347,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 160 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 160) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1478,21 +1371,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61280 ), @@ -1500,29 +1390,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 192 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 192) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1534,21 +1414,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61248 ), @@ -1556,29 +1433,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 224 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 224) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1590,21 +1457,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61216 ), @@ -1612,29 +1476,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 256 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 256) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1646,21 +1500,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61184 ), @@ -1668,29 +1519,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 288 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 288) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1702,21 +1543,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61152 ), @@ -1724,29 +1562,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 320 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 320) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1758,21 +1586,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61120 ), @@ -1780,29 +1605,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 352 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 352) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1814,21 +1629,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61088 ), @@ -1836,29 +1648,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 384 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 384) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1870,21 +1672,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61056 ), @@ -1892,29 +1691,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 416 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 416) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1926,21 +1715,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 61024 ), @@ -1948,29 +1734,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ): + with tir.launch_thread(tx, 32): Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 448 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 448) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -1982,21 +1758,18 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) + ((bx * 6422528) + (ty * 1605632)) + + (tz * 802816) ) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 60992 ), @@ -2004,29 +1777,19 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), - "thread_extent", - 32, - ) + tir.launch_thread(tx, 32) Apad_shared[ - ( - ( - (((threadIdx_y * 3072) + (threadIdx_z * 1536)) + (ax2 * 512)) - + threadIdx_x - ) - + 480 - ) + (((((ty * 3072) + (tz * 1536)) + (ax2 * 512)) + tx) + 480) ] = tir.if_then_else( ( ( ( - (1 <= (tir.floordiv(blockIdx_z, 14) + kh)) - and ((tir.floordiv(blockIdx_z, 14) + kh) < 15) + (1 <= (tir.floordiv(bz, 14) + kh)) + and ((tir.floordiv(bz, 14) + kh) < 15) ) - and (1 <= (ax2 + tir.floormod(blockIdx_z, 14))) + and (1 <= (ax2 + tir.floormod(bz, 14))) ) - and ((ax2 + tir.floormod(blockIdx_z, 14)) < 15) + and ((ax2 + tir.floormod(bz, 14)) < 15) ), tir.load( "float16", @@ -2037,22 +1800,16 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ( - (blockIdx_x * 6422528) - + (threadIdx_y * 1605632) - ) - + (threadIdx_z * 802816) - ) + (((bx * 6422528) + (ty * 1605632)) + (tz * 802816)) + (kh * 57344) ) - + (blockIdx_z * 4096) + + (bz * 4096) ) + (ax2 * 4096) ) + (ic_outer * 512) ) - + threadIdx_x + + tx ) - 60960 ), @@ -2060,14 +1817,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.float16(0), dtype="float16", ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - (((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)), 1, 8 - ), + tir.ramp((((ty * 512) + (tz * 256)) + (tx * 8)), 1, 8), tir.load( "float16x8", W_1.data, @@ -2075,12 +1828,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - (((kh * 393216) + (ic_outer * 16384)) + (blockIdx_y * 2048)) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ), 1, 8, @@ -2089,16 +1842,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - ((((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)) + 2048), - 1, - 8, - ), + tir.ramp(((((ty * 512) + (tz * 256)) + (tx * 8)) + 2048), 1, 8), tir.load( "float16x8", W_1.data, @@ -2107,15 +1854,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ((kh * 393216) + (ic_outer * 16384)) - + (blockIdx_y * 2048) - ) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ) + 8192 ), @@ -2126,16 +1870,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - ((((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)) + 4096), - 1, - 8, - ), + tir.ramp(((((ty * 512) + (tz * 256)) + (tx * 8)) + 4096), 1, 8), tir.load( "float16x8", W_1.data, @@ -2144,15 +1882,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ((kh * 393216) + (ic_outer * 16384)) - + (blockIdx_y * 2048) - ) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ) + 131072 ), @@ -2163,16 +1898,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - ((((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)) + 6144), - 1, - 8, - ), + tir.ramp(((((ty * 512) + (tz * 256)) + (tx * 8)) + 6144), 1, 8), tir.load( "float16x8", W_1.data, @@ -2181,15 +1910,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ((kh * 393216) + (ic_outer * 16384)) - + (blockIdx_y * 2048) - ) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ) + 139264 ), @@ -2200,16 +1926,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - ((((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)) + 8192), - 1, - 8, - ), + tir.ramp(((((ty * 512) + (tz * 256)) + (tx * 8)) + 8192), 1, 8), tir.load( "float16x8", W_1.data, @@ -2218,15 +1938,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ((kh * 393216) + (ic_outer * 16384)) - + (blockIdx_y * 2048) - ) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ) + 262144 ), @@ -2237,16 +1954,10 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - with tir.attr( - tir.iter_var(threadIdx_x, None, "ThreadIndex", "threadIdx.x"), "thread_extent", 32 - ): + with tir.launch_thread(tx, 32): tir.store( W_shared, - tir.ramp( - ((((threadIdx_y * 512) + (threadIdx_z * 256)) + (threadIdx_x * 8)) + 10240), - 1, - 8, - ), + tir.ramp(((((ty * 512) + (tz * 256)) + (tx * 8)) + 10240), 1, 8), tir.load( "float16x8", W_1.data, @@ -2255,15 +1966,12 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ( ( ( - ( - ((kh * 393216) + (ic_outer * 16384)) - + (blockIdx_y * 2048) - ) - + (threadIdx_y * 512) + (((kh * 393216) + (ic_outer * 16384)) + (by * 2048)) + + (ty * 512) ) - + (threadIdx_z * 256) + + (tz * 256) ) - + (threadIdx_x * 8) + + (tx * 8) ) + 270336 ), @@ -2274,8 +1982,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No ), tir.broadcast(True, 8), ) - for ic_inner in tir.range(0, 2): - for kw in tir.range(0, 3): + for ic_inner in tir.serial(0, 2): + for kw in tir.serial(0, 3): tir.evaluate( tir.tvm_load_matrix_sync( Apad_shared_wmma_matrix_a, @@ -2286,7 +1994,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), Apad_shared, - (((threadIdx_y * 3072) + (kw * 512)) + (ic_inner * 256)), + (((ty * 3072) + (kw * 512)) + (ic_inner * 256)), 256, 1, dtype="handle", @@ -2306,7 +2014,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), Apad_shared, - ((((threadIdx_y * 3072) + (kw * 512)) + (ic_inner * 256)) + 1536), + ((((ty * 3072) + (kw * 512)) + (ic_inner * 256)) + 1536), 256, 1, dtype="handle", @@ -2326,7 +2034,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), W_shared, - (((kw * 4096) + (ic_inner * 2048)) + (threadIdx_z * 1024)), + (((kw * 4096) + (ic_inner * 2048)) + (tz * 1024)), 256, 1, dtype="handle", @@ -2346,7 +2054,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), W_shared, - ((((kw * 4096) + (ic_inner * 2048)) + (threadIdx_z * 1024)) + 256), + ((((kw * 4096) + (ic_inner * 2048)) + (tz * 1024)) + 256), 256, 1, dtype="handle", @@ -2366,7 +2074,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), W_shared, - ((((kw * 4096) + (ic_inner * 2048)) + (threadIdx_z * 1024)) + 512), + ((((kw * 4096) + (ic_inner * 2048)) + (tz * 1024)) + 512), 256, 1, dtype="handle", @@ -2386,7 +2094,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float16"), W_shared, - ((((kw * 4096) + (ic_inner * 2048)) + (threadIdx_z * 1024)) + 768), + ((((kw * 4096) + (ic_inner * 2048)) + (tz * 1024)) + 768), 256, 1, dtype="handle", @@ -2510,13 +2218,7 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No tir.tvm_access_ptr( tir.type_annotation(dtype="float32"), Conv_1.data, - ( - ( - (((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) + (blockIdx_z * 8192)) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) - ), + (((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + (tz * 1024)), 256, 2, dtype="handle", @@ -2538,14 +2240,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 256 ), @@ -2570,14 +2266,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 512 ), @@ -2602,14 +2292,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 768 ), @@ -2634,14 +2318,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 1605632 ), @@ -2666,14 +2344,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 1605888 ), @@ -2698,14 +2370,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 1606144 ), @@ -2730,14 +2396,8 @@ def opt_conv_tensorcore_lower(A: ty.handle, W: ty.handle, Conv: ty.handle) -> No Conv_1.data, ( ( - ( - ( - ((blockIdx_x * 12845056) + (threadIdx_y * 3211264)) - + (blockIdx_z * 8192) - ) - + (blockIdx_y * 2048) - ) - + (threadIdx_z * 1024) + ((((bx * 12845056) + (ty * 3211264)) + (bz * 8192)) + (by * 2048)) + + (tz * 1024) ) + 1606400 ), @@ -2817,38 +2477,38 @@ def opt_conv_tensorcore_mod_host( tir.tvm_struct_get(arg0, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg0.dtype is expected to be float16" assert 16 == tir.cast( - "int32", tir.load("int64", arg0_shape, 0) + tir.load("int64", arg0_shape, 0), "int32" ), "Argument arg0.shape[0] has an unsatisfied constraint" assert 14 == tir.cast( - "int32", tir.load("int64", arg0_shape, 1) + tir.load("int64", arg0_shape, 1), "int32" ), "Argument arg0.shape[1] has an unsatisfied constraint" assert 14 == tir.cast( - "int32", tir.load("int64", arg0_shape, 2) + tir.load("int64", arg0_shape, 2), "int32" ), "Argument arg0.shape[2] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg0_shape, 3) + tir.load("int64", arg0_shape, 3), "int32" ), "Argument arg0.shape[3] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg0_shape, 4) + tir.load("int64", arg0_shape, 4), "int32" ), "Argument arg0.shape[4] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg0_shape, 5) + tir.load("int64", arg0_shape, 5), "int32" ), "Argument arg0.shape[5] has an unsatisfied constraint" if not (tir.isnullptr(arg0_strides, dtype="bool")): assert ( ( ( ( - (1 == tir.cast("int32", tir.load("int64", arg0_strides, 5))) - and (16 == tir.cast("int32", tir.load("int64", arg0_strides, 4))) + (1 == tir.cast(tir.load("int64", arg0_strides, 5), "int32")) + and (16 == tir.cast(tir.load("int64", arg0_strides, 4), "int32")) ) - and (256 == tir.cast("int32", tir.load("int64", arg0_strides, 3))) + and (256 == tir.cast(tir.load("int64", arg0_strides, 3), "int32")) ) - and (4096 == tir.cast("int32", tir.load("int64", arg0_strides, 2))) + and (4096 == tir.cast(tir.load("int64", arg0_strides, 2), "int32")) ) - and (57344 == tir.cast("int32", tir.load("int64", arg0_strides, 1))) + and (57344 == tir.cast(tir.load("int64", arg0_strides, 1), "int32")) ) and ( - 802816 == tir.cast("int32", tir.load("int64", arg0_strides, 0)) + 802816 == tir.cast(tir.load("int64", arg0_strides, 0), "int32") ), "arg0.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -2866,38 +2526,38 @@ def opt_conv_tensorcore_mod_host( tir.tvm_struct_get(arg1, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg1.dtype is expected to be float16" assert 3 == tir.cast( - "int32", tir.load("int64", arg1_shape, 0) + tir.load("int64", arg1_shape, 0), "int32" ), "Argument arg1.shape[0] has an unsatisfied constraint" assert 3 == tir.cast( - "int32", tir.load("int64", arg1_shape, 1) + tir.load("int64", arg1_shape, 1), "int32" ), "Argument arg1.shape[1] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg1_shape, 2) + tir.load("int64", arg1_shape, 2), "int32" ), "Argument arg1.shape[2] has an unsatisfied constraint" assert 32 == tir.cast( - "int32", tir.load("int64", arg1_shape, 3) + tir.load("int64", arg1_shape, 3), "int32" ), "Argument arg1.shape[3] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg1_shape, 4) + tir.load("int64", arg1_shape, 4), "int32" ), "Argument arg1.shape[4] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg1_shape, 5) + tir.load("int64", arg1_shape, 5), "int32" ), "Argument arg1.shape[5] has an unsatisfied constraint" if not (tir.isnullptr(arg1_strides, dtype="bool")): assert ( ( ( ( - (1 == tir.cast("int32", tir.load("int64", arg1_strides, 5))) - and (16 == tir.cast("int32", tir.load("int64", arg1_strides, 4))) + (1 == tir.cast(tir.load("int64", arg1_strides, 5), "int32")) + and (16 == tir.cast(tir.load("int64", arg1_strides, 4), "int32")) ) - and (256 == tir.cast("int32", tir.load("int64", arg1_strides, 3))) + and (256 == tir.cast(tir.load("int64", arg1_strides, 3), "int32")) ) - and (8192 == tir.cast("int32", tir.load("int64", arg1_strides, 2))) + and (8192 == tir.cast(tir.load("int64", arg1_strides, 2), "int32")) ) - and (131072 == tir.cast("int32", tir.load("int64", arg1_strides, 1))) + and (131072 == tir.cast(tir.load("int64", arg1_strides, 1), "int32")) ) and ( - 393216 == tir.cast("int32", tir.load("int64", arg1_strides, 0)) + 393216 == tir.cast(tir.load("int64", arg1_strides, 0), "int32") ), "arg1.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -2918,38 +2578,38 @@ def opt_conv_tensorcore_mod_host( tir.tvm_struct_get(arg2, 0, 7, dtype="uint16") == tir.uint16(1) ), "arg2.dtype is expected to be float32" assert 16 == tir.cast( - "int32", tir.load("int64", arg2_shape, 0) + tir.load("int64", arg2_shape, 0), "int32" ), "Argument arg2.shape[0] has an unsatisfied constraint" assert 14 == tir.cast( - "int32", tir.load("int64", arg2_shape, 1) + tir.load("int64", arg2_shape, 1), "int32" ), "Argument arg2.shape[1] has an unsatisfied constraint" assert 14 == tir.cast( - "int32", tir.load("int64", arg2_shape, 2) + tir.load("int64", arg2_shape, 2), "int32" ), "Argument arg2.shape[2] has an unsatisfied constraint" assert 32 == tir.cast( - "int32", tir.load("int64", arg2_shape, 3) + tir.load("int64", arg2_shape, 3), "int32" ), "Argument arg2.shape[3] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg2_shape, 4) + tir.load("int64", arg2_shape, 4), "int32" ), "Argument arg2.shape[4] has an unsatisfied constraint" assert 16 == tir.cast( - "int32", tir.load("int64", arg2_shape, 5) + tir.load("int64", arg2_shape, 5), "int32" ), "Argument arg2.shape[5] has an unsatisfied constraint" if not (tir.isnullptr(arg2_strides, dtype="bool")): assert ( ( ( ( - (1 == tir.cast("int32", tir.load("int64", arg2_strides, 5))) - and (16 == tir.cast("int32", tir.load("int64", arg2_strides, 4))) + (1 == tir.cast(tir.load("int64", arg2_strides, 5), "int32")) + and (16 == tir.cast(tir.load("int64", arg2_strides, 4), "int32")) ) - and (256 == tir.cast("int32", tir.load("int64", arg2_strides, 3))) + and (256 == tir.cast(tir.load("int64", arg2_strides, 3), "int32")) ) - and (8192 == tir.cast("int32", tir.load("int64", arg2_strides, 2))) + and (8192 == tir.cast(tir.load("int64", arg2_strides, 2), "int32")) ) - and (114688 == tir.cast("int32", tir.load("int64", arg2_strides, 1))) + and (114688 == tir.cast(tir.load("int64", arg2_strides, 1), "int32")) ) and ( - 1605632 == tir.cast("int32", tir.load("int64", arg2_strides, 0)) + 1605632 == tir.cast(tir.load("int64", arg2_strides, 0), "int32") ), "arg2.strides: expected to be compact array" tir.evaluate(0) assert tir.uint64(0) == tir.tvm_struct_get( @@ -2961,9 +2621,9 @@ def opt_conv_tensorcore_mod_host( assert dev_id == tir.tvm_struct_get( arg2, 0, 9, dtype="int32" ), "Argument arg2.device_id has an unsatisfied constraint" - tir.evaluate(tir.tvm_struct_set(stack_value, 0, 12, tir.cast("int64", 2), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 0, 12, tir.cast(2, "int64"), dtype="int32")) stack_tcode[0] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 1, 12, tir.cast("int64", dev_id), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 1, 12, tir.cast(dev_id, "int64"), dtype="int32")) stack_tcode[1] = 0 tir.evaluate( tir.tvm_call_packed_lowered( @@ -2977,17 +2637,17 @@ def opt_conv_tensorcore_mod_host( stack_tcode[1] = 3 tir.evaluate(tir.tvm_struct_set(stack_value, 2, 12, Conv, dtype="int32")) stack_tcode[2] = 3 - tir.evaluate(tir.tvm_struct_set(stack_value, 3, 12, tir.cast("int64", 196), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 3, 12, tir.cast(196, "int64"), dtype="int32")) stack_tcode[3] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 4, 12, tir.cast("int64", 2), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 4, 12, tir.cast(2, "int64"), dtype="int32")) stack_tcode[4] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 5, 12, tir.cast("int64", 4), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 5, 12, tir.cast(4, "int64"), dtype="int32")) stack_tcode[5] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 6, 12, tir.cast("int64", 4), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 6, 12, tir.cast(4, "int64"), dtype="int32")) stack_tcode[6] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 7, 12, tir.cast("int64", 2), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 7, 12, tir.cast(2, "int64"), dtype="int32")) stack_tcode[7] = 0 - tir.evaluate(tir.tvm_struct_set(stack_value, 8, 12, tir.cast("int64", 32), dtype="int32")) + tir.evaluate(tir.tvm_struct_set(stack_value, 8, 12, tir.cast(32, "int64"), dtype="int32")) stack_tcode[8] = 0 tir.evaluate( tir.tvm_call_packed_lowered(