Auto TensorCore CodeGen (apache#4234)

* Add Auto TensorCore TensorCore Unit Test * Rebase to tvm master branch & Add auto tensor core * Code Refine * Add tensor core switch by pragma * Add pragma in tensor core example code * Get real tile size to replace hard coded 16 * support more than 2 dimensions (e.g. batchmatmul) for buffer bind scope * support batch matmul * Move cuda env check to tensor_core.cc * Coderefine for tensor_core.cc * Refine comments * Some refinements of code and comment * Update TensorCore UT to pass the CPU test * remove redundant code * matmul's storage align for different layout * Add support for differenct position of type cast * Add formal tutorial for auto tensorcore codegen * move tensorcore check up to tutorial code * code and doc refine * comment out tune_and_evaluate in tutorial * fix cpplint error
neo-ai · Nov 13, 2019 · 9ff4496 · 9ff4496
1 parent ff00d07
commit 9ff4496
Show file tree

Hide file tree

Showing 7 changed files with 1,920 additions and 0 deletions.
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
@@ -1248,6 +1248,8 @@ constexpr const char* reduce_scope = "reduce_scope";
 constexpr const char* pragma_scope_prefix = "pragma_";
 /*! \brief Import llvm source or file into the final code gen module */
 constexpr const char* pragma_import_llvm = "pragma_import_llvm";
+/*! \brief Try to modify the AST to support Tensor Core */
+constexpr const char* pragma_tensor_core = "pragma_tensor_core";
 /*!
  * \brief Mark of prefetch scope, value=offset,
  *  run prefetch of Tensor on the current loop scope

diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
@@ -206,6 +206,20 @@ Stmt StorageFlatten(Stmt stmt,
                     Map<Tensor, Buffer> extern_buffer,
                     int cache_line_size,
                     bool create_bound_attribute = false);
+
+/*!
+ * \brief Try to modify the AST to support TensorCore
+ *
+ * \param stmt The stmt to be trasnformed.
+ * \param schedule The original schedule.
+ * \param extern_buffer Map specifies external
+ *    buffer assignment of input and outputs.
+ * \return Transformed stmt.
+ */
+Stmt RewriteForTensorCore(Stmt stmt,
+                          Schedule schedule,
+                          Map<Tensor, Buffer> extern_buffer);
+
 /*!
  * \brief Verify if there is any argument bound to compact buffer.
  *

diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
@@ -387,6 +387,7 @@ def lower(sch,
     binds, arg_list = get_binds(args, compact, binds)
 
     # Phase 1
+    stmt = ir_pass.RewriteForTensorCore(stmt, sch, binds)
     stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
     stmt = ir_pass.CanonicalSimplify(stmt)
     for f in lower_phase1:

diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
@@ -94,6 +94,12 @@ TVM_REGISTER_API("ir_pass.StorageFlatten")
     }
   });
 
+TVM_REGISTER_API("ir_pass.RewriteForTensorCore")
+.set_body_typed<Stmt(const Stmt&, const Schedule&, const Map<Tensor, Buffer>&)>
+  ([](const Stmt& stmt, const Schedule& schedule, const Map<Tensor, Buffer>& extern_buffer) {
+      return RewriteForTensorCore(stmt, schedule, extern_buffer);
+  });
+
 TVM_REGISTER_API("ir_pass.AttrsEqual")
 .set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
     return AttrsEqual()(lhs, rhs);