Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PASS] Add GPU IR verifier #1296

Merged
merged 10 commits into from
Jun 23, 2018
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/tvm/ir_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,22 @@ LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target);
*/
bool VerifyMemory(LoweredFunc func, int device_type);


/*!
 * \brief Verify the correctness of GPU code.
 *  It checks whether the amount of shared memory used or the number of
 *  threads launched in a block exceeds the given hardware limits.
 * \param stmt The statement to be checked.
 * \param max_shared_memory_per_block Total amount of shared memory available
 *        per block (in bytes).
 * \param max_thread_per_block Maximum number of threads per block.
 * \return Whether the statement is valid GPU code under the given limits.
 */
bool VerifyGPUCode(Stmt stmt,
                   int max_shared_memory_per_block,
                   int max_thread_per_block);


} // namespace ir
} // namespace tvm

Expand Down
1 change: 1 addition & 0 deletions src/api/api_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,6 @@ REGISTER_PASS2(LowerIntrin);
// Expose the IR passes to the frontend through the API registry.
// NOTE(review): the REGISTER_PASS<k> suffix presumably encodes the number of
// arguments the pass takes — confirm against the macro definitions above.
REGISTER_PASS1(LowerTVMBuiltin);
REGISTER_PASS1(CombineContextCall);
REGISTER_PASS2(VerifyMemory);
REGISTER_PASS3(VerifyGPUCode);
} // namespace ir
} // namespace tvm
115 changes: 115 additions & 0 deletions src/pass/verify_gpu_code.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*!
* Copyright (c) 2018 by Contributors
* \file verify_gpu_code.cc
* \brief Verify the correctness of a GPU IR.
* It will check the whether the amount of shared memory or
* the number of threads in a block exceeds the limit
*/

#include <tvm/api_registry.h>
#include <tvm/ir.h>
#include <tvm/ir_visitor.h>

#include <string>
#include <unordered_set>

namespace tvm {
namespace ir {

class GPUCodeVerifier : public IRVisitor {
public:
bool verify(tvm::Stmt stmt, int max_shared_memory_per_block, int max_thread_per_block) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CamelCase for fucnctions

max_shared_memory_per_block_ = static_cast<size_t>(max_shared_memory_per_block);
max_thread_per_block_ = static_cast<size_t>(max_thread_per_block);

this->Visit(stmt);

return valid;
}

void Visit_(const ProducerConsumer *op) {
if (nest_level_ == 0) {
// enter a new kernel, reset statistics
reset_();
}

if (op->is_producer) {
nest_level_++;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefer --nest_+level

IRVisitor::Visit_(op);
nest_level_--;
} else {
IRVisitor::Visit_(op);
}

if (nest_level_ == 0) {
// exit a kernel, check the validity
if (thread_per_block_ > max_thread_per_block_) {
valid = false;
}
if (shared_memory_per_block_ > max_shared_memory_per_block_) {
valid = false;
}
}
}

void Visit_(const Allocate *op) {
IRVisitor::Visit_(op);
// visit an allocation of a buffer in shared memory, record its size
if (shared_buffers_.count(op->buffer_var.get()) != 0) {
int64_t size = op->type.bytes();
for (auto dim : op->extents) {
size *= dim.as<IntImm>()->value;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

op-> constant_allocation_size()

}
shared_memory_per_block_ += size;
}
}

void Visit_(const AttrStmt *op) {
if (op->attr_key == attr::storage_scope) {
if (op->value.as<StringImm>()->value == "shared") {
shared_buffers_.insert(op->node.as<tvm::Variable>());
}
} else if (op->attr_key == attr::thread_extent) {
VarExpr var = op->node.as<tvm::IterVarNode>()->var;
const auto *extent = op->value.as<IntImm>();
CHECK(extent);

// record the number of threads in a block
std::string name = var.get()->name_hint;
if (name == "threadIdx.x" || name == "threadIdx.y" || name == "threadIdx.z") {
if (visited_threads_.find(name) == visited_threads_.end()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!count(name)

visited_threads_.insert(name);
thread_per_block_ *= extent->value;
}
}
}
IRVisitor::Visit_(op);
}

private:
int nest_level_{0};

std::unordered_set<const tvm::Variable *> shared_buffers_;
std::unordered_set<std::string> visited_threads_;
size_t shared_memory_per_block_;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

local_memiry_per_block_ is also needed.

size_t thread_per_block_;

size_t max_shared_memory_per_block_;
size_t max_thread_per_block_;

bool valid{true};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

valid_


void reset_() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reset()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this visitor is only used once, reset is not necessary

Copy link
Member Author

@merrymercy merrymercy Jun 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reset is needed because there might be several gpu kernels in one Stmt.

shared_buffers_.clear();
shared_memory_per_block_ = 0;
thread_per_block_ = 1;
visited_threads_.clear();
}
};

// Entry point: run a fresh GPUCodeVerifier over the statement and report
// whether it stays within the given shared-memory and thread limits.
bool VerifyGPUCode(Stmt stmt,
                   int max_shared_memory_per_block,
                   int max_thread_per_block) {
  GPUCodeVerifier verifier;
  const bool is_valid =
      verifier.verify(stmt, max_shared_memory_per_block, max_thread_per_block);
  return is_valid;
}

} // namespace ir
} // namespace tvm
74 changes: 74 additions & 0 deletions tests/python/unittest/test_pass_verify_gpu_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Test gpu code verifier"""
import tvm

global valid
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Always avoid using global variables to carry state; you can use a closure that captures a list instead.


def cuda_verify_pass(max_shared_memory, max_num_thread):
    """Build a lowering pass that runs VerifyGPUCode with the given limits.

    The pass stores the verification result in the module-level ``valid``
    flag (read by the tests below) and returns the statement unchanged.
    NOTE(review): a closure capturing a list would avoid the global.
    """
    def verify_pass(stmt):
        # Record whether stmt fits within the shared-memory / thread limits.
        global valid
        valid = tvm.ir_pass.VerifyGPUCode(stmt, max_shared_memory, max_num_thread)
        return stmt
    return verify_pass

def test_shared_memory():
    """VerifyGPUCode must reject kernels whose shared-memory usage exceeds
    the limit and accept kernels that fit exactly."""
    size = 1024
    tile = 128

    A = tvm.placeholder((size,), name='A', dtype='float32')
    B = tvm.compute((size, ), lambda i: A[i], name='B')

    sch = tvm.create_schedule([B.op])
    a_shared = sch.cache_read(A, "shared", [B])
    outer, inner = sch[B].split(sch[B].op.axis[0], tile)
    sch[a_shared].compute_at(sch[B], outer)
    sch[B].bind(outer, tvm.thread_axis("blockIdx.x"))
    sch[B].bind(inner, tvm.thread_axis("threadIdx.x"))

    # shared memory usage: tile * 4 bytes (one float32 per element)
    # thread usage: tile

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue
        global valid
        # One byte short of the actual usage -> must be rejected.
        with tvm.build_config(add_lower_pass=[(2, cuda_verify_pass(4 * tile - 1, tile))]):
            tvm.build(sch, [A, B], target)
        assert not valid

        # Exactly the actual usage -> must be accepted.
        with tvm.build_config(add_lower_pass=[(2, cuda_verify_pass(4 * tile, tile))]):
            tvm.build(sch, [A, B], target)
        assert valid


def test_num_thread():
    """VerifyGPUCode must reject kernels that launch more threads per block
    than allowed and accept kernels at exactly the limit."""
    size = 1024
    tile = 128

    A = tvm.placeholder((size,), name='A', dtype='float32')
    B = tvm.compute((size, ), lambda i: A[i], name='B')

    sch = tvm.create_schedule([B.op])
    outer, inner = sch[B].split(sch[B].op.axis[0], tile)

    sch[B].bind(outer, tvm.thread_axis('threadIdx.x'))
    sch[B].bind(inner, tvm.thread_axis("threadIdx.y"))

    # shared memory usage: 0
    # thread usage: size (extent of threadIdx.x times extent of threadIdx.y)

    for target in ['opencl', 'cuda']:
        if not tvm.context(target).exist:
            continue
        global valid
        # One thread below the actual launch count -> must be rejected.
        with tvm.build_config(add_lower_pass=[(2, cuda_verify_pass(0, size - 1))]):
            tvm.build(sch, [A, B], target)
        assert not valid

        # Exactly the actual launch count -> must be accepted.
        with tvm.build_config(add_lower_pass=[(2, cuda_verify_pass(0, size))]):
            tvm.build(sch, [A, B], target)
        assert valid


# Allow running this test file directly as a script (outside pytest).
if __name__ == "__main__":
    test_shared_memory()
    test_num_thread()