From 0329772be19a07a03091aa432eb4162a522c387b Mon Sep 17 00:00:00 2001 From: Meghan Date: Tue, 5 Jun 2018 16:37:53 -0700 Subject: [PATCH 01/11] ARM Popcount lowering rule and codegen updates to support reinterpreting and accessing vectors --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 77 ++++++++++++++++++++++++++++++++ src/codegen/llvm/codegen_llvm.cc | 26 ++++++++++- 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/HalideIR b/HalideIR index a3698398faff..e20e5e9abb3a 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 +Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index b87b6ec88808..abf30756011c 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -18,8 +18,85 @@ class CodeGenARM final : public CodeGenCPU { native_vector_bits_ = 16 * 8; CodeGenCPU::InitTarget(tm); } + llvm::Value* CreateIntrinsic(const Call* op) override; + + private: + Expr ARMPopcount(const Call* op); }; +llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { + if (op->is_intrinsic("llvm_intrin")) { + llvm::Intrinsic::ID id = static_cast( + op->args[0].as()->value); + if (id == ::llvm::Intrinsic::ctpop) { + Expr e = ARMPopcount(op); + return CodeGenCPU::CreateIntrinsic(e.as()); + } + } + return CodeGenCPU::CreateIntrinsic(op); +} + +Expr CodeGenARM::ARMPopcount(const Call *call) { + using namespace ir; + const Expr& e = call->args[2]; + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; + + + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + + // Fallback to default llvm lowering rule if input type not a full vector or half vector length + int total_size = call->type.bits() * call->type.lanes(); + if (!call->type.is_vector() || call->type.bits() == 8 || + (total_size != 128 && total_size != 64)) { + Array vcnt_args; + vcnt_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt_args.push_back(e); + return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); + } + + // Interpret input as vector of 8bit values + Expr input8 = reinterpret(uint8_type, e); + // Popcount 8bit->8bit + const Call* c0 = input8.as(); + CHECK(c0 != nullptr); + Array vcnt8_args; + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt8_args.push_back(input8); + Expr vcnt8 = ir::Call::make(uint8_type, "llvm_intrin", vcnt8_args, Call::PureIntrinsic); + + // Accumulation 8->16bit + Array vcnt16_args; + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt16_args.push_back(vcnt8); + Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); + if (call->type.bits() == 16) { + return vcnt16; + } + + // Accumulation 16->32bit + Array vcnt32_args; + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt32_args.push_back(vcnt16); + Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); + 
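+ // Each vpaddlu step pairwise-adds adjacent lanes and doubles the lane width,
+ // so the running counts stay exact; the chain stops once the lane width
+ // matches the requested popcount type (16-bit returned above, 32-bit just
+ // below, otherwise one more accumulation produces 64-bit lanes).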
if (call->type.bits() == 32) { + return vcnt32; + } + + // Accumulation 32->64bit + Array vcnt64_args; + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt64_args.push_back(vcnt32); + return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); +} + TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_arm") .set_body([](const TVMArgs& targs, TVMRetValue* rv) { CodeGenLLVM* cg = new CodeGenARM(); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 934398d9ce09..d0c5b77cbfd5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems); + CHECK_LT(begin + extent, num_elems+1); std::vector indices; for (int i = 0; i < extent; ++i) { indices.push_back(begin + i); @@ -562,6 +562,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { sig_type.push_back(arg_value.back()->getType()); } } + llvm::Type *returnType = LLVMType(op->type); + if (returnType != sig_type[0]) { + sig_type.insert(sig_type.begin(), returnType); + } llvm::Function* f = llvm::Intrinsic::getDeclaration( module_.get(), id, sig_type); return builder_->CreateCall(f, arg_value); @@ -628,6 +632,26 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { value->addIncoming(then_value, then_value_block); value->addIncoming(else_value, else_value_block); return value; + } else if (op->is_intrinsic(Call::reinterpret)) { + llvm::Type * target = LLVMType(op->type); + return builder_->CreateBitCast(MakeValue(op->args[0]), target); + } else if (op->is_intrinsic("vectorlow")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, 0, l/2); + } else if (op->is_intrinsic("vectorhigh")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, l/2, l/2); + } else if (op->is_intrinsic("vectorcombine")) { + llvm::Value *v0 = MakeValue(op->args[0]); + llvm::Value *v1 = MakeValue(op->args[1]); + int num_elems = static_cast(v0->getType()->getVectorNumElements()) * 2; + std::vector indices; + for (int i = 0; i < num_elems; ++i) { + indices.push_back(i); + } + return builder_->CreateShuffleVector(v0, v1, indices); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; From 777f9ea69d24d40433d2867c69e3761f9704198a Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:05:59 -0700 Subject: [PATCH 02/11] Fixes and test case for arm popcount --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 24 ++++++++++++-------- src/codegen/llvm/codegen_llvm.cc | 2 +- src/codegen/llvm/llvm_module.cc | 38 ++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/HalideIR b/HalideIR index e20e5e9abb3a..a3698398faff 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 +Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index abf30756011c..161d6db6e42d 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,13 +39,9 
@@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; - ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; - - Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); - Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); - Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; // Fallback to default llvm lowering rule if input type not a full vector or half vector length int total_size = call->type.bits() * call->type.lanes(); @@ -58,6 +54,16 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); } + // Popcount lowering rule: + // Reinterpret input vector as a vector of 8bit values and preform popcount + // Pairwise add between adjacent elements and double width with vpaddlu + // to return back to original input type + + // Dvisions are always divisible (number of bits = 64 or 128) + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + // Interpret input as vector of 8bit values Expr input8 = reinterpret(uint8_type, e); // Popcount 8bit->8bit @@ -71,7 +77,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 8->16bit Array vcnt16_args; - vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt16_args.push_back(vcnt8); Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); @@ -81,7 +87,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 16->32bit Array vcnt32_args; - vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt32_args.push_back(vcnt16); Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); @@ -91,7 +97,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 32->64bit Array vcnt64_args; - vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt64_args.push_back(vcnt32); return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index d0c5b77cbfd5..bbf52512d3d5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems+1); + CHECK_LE(begin + extent, num_elems); std::vector indices; for (int 
i = 0; i < extent; ++i) { indices.push_back(begin + i); diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index c16af511febc..2bae52b194f5 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -117,11 +117,41 @@ class LLVMModuleNode final : public runtime::ModuleNode { } std::string GetSource(const std::string& format) final { + std::string fmt = runtime::GetFileFormat("", format); std::string type_str; - llvm::raw_string_ostream rso(type_str); - CHECK(mptr_ != nullptr); - mptr_->print(rso, nullptr); - return rso.str(); + llvm::SmallString<256> str; + llvm::raw_svector_ostream rso(str); + + if (fmt == "s" || fmt == "asm") { + #if TVM_LLVM_VERSION <= 60 + std::unique_ptr m = llvm::CloneModule(mptr_); + #else + std::unique_ptr m = llvm::CloneModule(*mptr_); + #endif + llvm::legacy::PassManager pass; + CHECK(tm_); + #if TVM_LLVM_VERSION <= 60 + CHECK(tm_->addPassesToEmitFile( + pass, rso, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #else + CHECK(tm_->addPassesToEmitFile( + pass, rso, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #endif + pass.run(*m); + return rso.str().str(); + } else if (fmt == "" || fmt == "ll") { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + CHECK(mptr_ != nullptr); + mptr_->print(rso, nullptr); + return rso.str(); + } else { + LOG(FATAL) << "Do not know how to get source code with format: " + << format << "\'"; + } + return ""; } void Init(const Array& funcs, std::string target) { From 2e56f092e09cc366dc64dcc7480ab8f75869ae56 Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:07:15 -0700 Subject: [PATCH 03/11] white space fixes --- src/codegen/llvm/codegen_arm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index 161d6db6e42d..18a0eb54e182 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,7 +39,6 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; From b329f462b7994f6b1c9bded239e2572aaea3a6f3 Mon Sep 17 00:00:00 2001 From: Meghan Date: Thu, 7 Jun 2018 09:56:52 -0700 Subject: [PATCH 04/11] Initial qconv2d operators --- topi/python/topi/generic/nn.py | 35 ++++++++++++++++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/util.py | 66 +++++++++++++++++++++++++++++++ topi/python/topi/rasp/__init__.py | 1 + topi/python/topi/x86/__init__.py | 1 + 5 files changed, 104 insertions(+) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 5a16d12206a3..bb81c37ad285 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -132,6 +132,41 @@ def schedule_depthwise_conv2d_nhwc(outs): """ return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_qconv2d_nchw(outs): + """Schedule for qconv2d_nchw + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qconv2d_nchw + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_qconv2d_nhwc(outs): + """Schedule for qconv2d_nhwc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qconv2d_nchw + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + @tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 056d1a76339a..e968bd68c927 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,3 +17,4 @@ from .upsampling import * from .local_response_norm import * from .l2_norm import * +from .qconv2d import * \ No newline at end of file diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index 6264ced76953..90497a77c6f9 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -4,6 +4,9 @@ import tvm from ..util import get_const_int +import numpy as np +from topi.transform import concatenate + def infer_pad(data, data_pad): """Infer the padding from stages in reverse. @@ -102,3 +105,66 @@ def get_pad_tuple(padding, kernel): pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left + + +# Packs quantized data into packed bitplanes +# pack_axis = Axis to compress of original tensor +# bit_axis = Axis to place bitplanes in the resulting tensor +# pack_type = Datatype to pack elements into +def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): + ishape = data.shape + n = len(ishape) + if pack_type == 'uint8': + data_width = 8 + elif pack_type == 'uint16': + data_width = 16 + elif pack_type == 'uint32': + data_width = 32 + elif pack_type == 'uint64': + data_width = 64 + + # Data must be in multiples of the data_width + assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" + + shape_vec = list(ishape) + shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) + shape_vec.insert(bit_axis, 1) + bitserial_oshape = tuple(shape_vec) + masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) + + # pack axis shifts if bit axis comes before + if bit_axis <= pack_axis: + pack_axis += 1 + + def _bitpack(*indices): + packed_data = [tvm.const(0, pack_type)] * bits + for k in range(data_width): + # Translate indices for packed data back to original + idx = [0] * n + j = 0 + for i in range(n+1): + if i == bit_axis: + continue + elif i == pack_axis: + idx[j] = indices[i] * data_width + k + else: + idx[j] = indices[i] + j += 1 + + element = data(*idx) + for b in range(bits): + extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) + packed_data[b] = (packed_data[b] | extracted_bit) + if k < data_width - 1 : + packed_data[b] = packed_data[b] << 1 + + if k == data_width - 1: + return tuple(packed_data) + + output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + + if bits > 1: + return concatenate(output_tuple, axis=bit_axis) + else: + return output_tuple + diff --git a/topi/python/topi/rasp/__init__.py b/topi/python/topi/rasp/__init__.py index 31ecea5aba4e..8000e752c9ec 100644 --- a/topi/python/topi/rasp/__init__.py +++ b/topi/python/topi/rasp/__init__.py @@ -4,3 +4,4 @@ from .conv2d import schedule_conv2d_nchw from .depthwise_conv2d import schedule_depthwise_conv2d_nchw +from .qconv2d import schedule_qconv2d 
diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index d001b5fdca57..3ee6e6ee34a6 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -8,3 +8,4 @@ from .nn import * from .injective import * from .pooling import schedule_pool, schedule_global_pool +from .qconv2d import schedule_qconv2d From 753c4b2834f161bb8b1869bfea824043fc3f497f Mon Sep 17 00:00:00 2001 From: Meghan Date: Thu, 7 Jun 2018 10:11:04 -0700 Subject: [PATCH 05/11] operators --- topi/python/topi/nn/qconv2d.py | 350 +++++++++++++++++ topi/python/topi/rasp/qconv2d.py | 619 +++++++++++++++++++++++++++++++ topi/python/topi/x86/qconv2d.py | 405 ++++++++++++++++++++ 3 files changed, 1374 insertions(+) create mode 100644 topi/python/topi/nn/qconv2d.py create mode 100644 topi/python/topi/rasp/qconv2d.py create mode 100644 topi/python/topi/x86/qconv2d.py diff --git a/topi/python/topi/nn/qconv2d.py b/topi/python/topi/nn/qconv2d.py new file mode 100644 index 000000000000..820a92bc9ff1 --- /dev/null +++ b/topi/python/topi/nn/qconv2d.py @@ -0,0 +1,350 @@ +# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument +"""Conv2D operators""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm +from .pad import pad +from .util import get_pad_tuple, bitpack +from ..util import simplify, get_const_int, get_const_tuple +import numpy as np + + +# workload description of qconv2d +Workload = namedtuple('Workload', + ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +QuantizedSpatialPackNCHW = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) + +QuantizedSpatialPackNHWC= namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) + +# RPI version - broken right now +RaspQuantizedSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) + + +_WORKLOADS = [ + # workloads of resnet18 on imagenet + # input_size, input_size, ic, oc, kh, kw, pad, pad, stride, stride + Workload('uint32', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint32', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), +] + +@tvm.target.generic_func +def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype='uint32', out_dtype='int32', dorefa=True): + """Conv2D operator. 
+ + Parameters + ---------- + input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + [batch, in_height, in_width, in_channel] + + filter : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + activation_bits: int + + weight_bits: int + + out_dtype: str + return type of convolution + + pack_dtype: str + bit packing type + + dorefa: bool + method of preforming popcount + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + # search platform specific declaration first + # default declaration + if layout == 'NCHW': + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, + out_dtype=out_dtype, dorefa=dorefa) + elif layout == 'NHWC': + return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, + out_dtype=out_dtype, dorefa=dorefa) + else: + raise ValueError("not support this layout {} yet".format(layout)) + +def _get_workload(data, kernel, stride, padding, out_dtype, layout): + """ Get the workload structure. """ + assert layout == "NCHW" or layout == "NHWC", \ + "Only support layouts NCHW and NHWC" + if layout == "NCHW": + _, CI, IH, IW = [x.value for x in data.shape] + CO, _, KH, KW = [x.value for x in kernel.shape] + else: # NHWC + IH, IW = data.shape[1].value, data.shape[2].value + KH, KW, CI, CO = [x for x in get_const_tuple(kernel.shape)] + + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + +@tvm.target.generic_func +def _get_schedule(wkl, layout): + # pylint: disable=unreachable + """ Get the platform specific schedule. 
""" + target = tvm.target.current_target() + raise RuntimeError( + "No schedule for current target:{}".format(target)) + # This return has no use, merely to supress pylint warning + return wkl + + +def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): + assert isinstance(stride, int) or len(stride) == 2 + Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) + Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) + batch, in_channel, activation_bits, in_height, in_width = Input_q.shape + num_filter, channel, kernel_h, kernel_w, weight_bits = Filter_q.shape + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (kernel_h, kernel_w)) + pad_before = [0, 0, 0, pad_top, pad_left] + pad_after = [0, 0, 0, pad_down, pad_right] + + PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp") + # compute the output shape + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + b1 = tvm.reduce_axis((0, activation_bits), name='b1') + b2 = tvm.reduce_axis((0, weight_bits), name='b2') + + def _conv(nn, ff, yy, xx): + b1b2 = (b1+b2).astype(out_dtype) + return tvm.sum( + (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & + Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]).astype(out_dtype) + + return tvm.compute((batch, out_channel, out_height, out_width), _conv, + name="QConv2dOutput", tag="qconv2d_nchw") + + +def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): + assert isinstance(stride, int) or len(stride) == 2 + Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) + Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) + batch, in_height, in_width, in_channel_q, _ = Input_q.shape + kernel_h, kernel_w, _, num_filter, _ = Filter_q.shape + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) + # compute the output shape + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + pad_before = [0, pad_top, pad_left, 0, 0] + pad_after = [0, pad_down, pad_right, 0, 0] + PadInput_q = pad(Input_q, pad_before, pad_after, name="PaddedInput") + + rc = tvm.reduce_axis((0, in_channel_q), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + b1 = tvm.reduce_axis((0, activation_bits), name='b1') + b2 = tvm.reduce_axis((0, weight_bits), name='b2') + + def _conv(nn, yy, xx, ff): + return tvm.sum( + (tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + Filter_q[ry, rx, rc, ff, b2])<< b1b2).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]) + + return tvm.compute( (batch, out_height, out_width, out_channel), _conv, + name="QConv2dOutput", 
tag="qconv2d_nhwc") + + +def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): + """ Compute convolution with pack on spatial axes. """ + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + IB, _, CI, H, W = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + TH = H + 2*HPAD + TW = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + + dshape = (IB, 1, CI, H, W) + dpshape = (IB, 1, CI, TH, TW) + dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) + + kshape = (KB, CO, CI, KH, KW) + kvshape = (CO//VC, CI, KH, KW, KB, VC) + + ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) + oshape = (1, CO, OH, OW) + + DOPAD = (HPAD != 0 and WPAD != 0) + if DOPAD: + data_pad = pad(data_q, (0, 0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ + data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ + kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, co, h, w, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if dorefa: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc]) - + tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + else: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv_out') + + return tvm.compute(oshape, lambda n, co, h, w: + conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], + name='conv_vec', tag='spatial_qconv_nchw') + + + +def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) + kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) + _, H, W, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + + dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) + kvshape = (CO, KH, KW, CI, VC, KB) + ovshape = (1, OH, OW, CO, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ + kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if dorefa: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + else: + return tvm.sum( + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], + name='output_unpack', tag='spatial_qconv_nhwc') + +_SCH_TO_DECL_FUNC_QUANT = { + QuantizedSpatialPackNCHW: spatial_pack_nchw, + QuantizedSpatialPackNHWC: spatial_pack_nhwc, +} diff --git a/topi/python/topi/rasp/qconv2d.py b/topi/python/topi/rasp/qconv2d.py new file mode 100644 index 000000000000..b0f7fcb011fe --- /dev/null +++ b/topi/python/topi/rasp/qconv2d.py @@ -0,0 +1,619 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""QConv2D schedule on raspberry pi""" +from __future__ import absolute_import as _abs +import tvm +from tvm import target as _target +from .. import tag +from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule +from ..nn.qconv2d import RaspQuantizedSpatialPack, QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC +from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT +from ..nn.qconv2d import _get_workload +from ..nn.util import infer_pad, infer_stride +from ..util import simplify, get_const_int + +from .. 
import generic + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES = [ + RaspQuantizedSpatialPack(2, 2, 8, 1, 1, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 4, 1, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 1, 16, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 4, 8, False, 8), + RaspQuantizedSpatialPack(1, 7, 8, 3, 8, False, 16), + RaspQuantizedSpatialPack(1, 2, 8, 1, 8, False, 16), + RaspQuantizedSpatialPack(2, 1, 8, 1, 4, False, 16), + RaspQuantizedSpatialPack(1, 7, 8, 1, 1, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 8, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), +] + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), + QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), + QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), + QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), + QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), +] + +_QUANTIZED_SCHEDULES_NHWC = [ + # resnet + QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), + QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), + QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), + QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), + QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), +] + + +@_get_schedule.register("rasp") +def _get_schedule_qconv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@_qconv2d.register("rasp") +def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, dorefa) + +# TODO: is there a better way to share these with x86? 
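A usage sketch of how the declaration and schedules registered here are exercised (illustrative only, not part of the patch; tvm.target.rasp() and the generic-func dispatch are assumptions, dtypes follow the first entry of _WORKLOADS):

    import tvm
    import topi

    # Matches Workload('uint32', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1).
    data = tvm.placeholder((1, 64, 56, 56), dtype='uint32', name='data')
    kernel = tvm.placeholder((64, 64, 3, 3), dtype='uint32', name='kernel')
    with tvm.target.rasp():
        out = topi.nn.qconv2d(data, kernel, stride=1, padding=1,
                              activation_bits=2, weight_bits=1, layout='NCHW',
                              pack_dtype='uint32', out_dtype='int32', dorefa=True)
        s = topi.generic.schedule_qconv2d_nchw([out])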
+ +@generic.schedule_qconv2d_nchw.register(["rasp"]) +@generic.schedule_qconv2d_nhwc.register(["rasp"]) +def schedule_qconv2d(outs): + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + output = op.output(0) + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Need to go up 1 further, from the combine in bitpack + if "QuantizeInput" in kernel.op.name: + kernel = kernel.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + data = data.op.input_tensors[0] + + if 'spatial_qconv_nchw' in op.tag: + _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_qconv_nhwc' in op.tag: + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + + traverse(outs[0].op) + return s + + +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): + IB, _, CI, IH, IW = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + _, _, OH, OW = output.shape + + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, _, _, TH, TW = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + CC = s.cache_write(conv_out, "global") + + n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis + s[conv_out].vectorize(vc) + + s[CC].compute_at(s[conv_out], ow) + n, co, oh, ow, vh, vw, vc = s[CC].op.axis + ci, dh, dw, b1, b2 = s[CC].op.reduce_axis + s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) + s[CC].unroll(b1) + s[CC].unroll(b2) + s[CC].vectorize(vc) + + ##### Schedule A + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , vw = s[data_vec].op.axis + s[data_vec].vectorize(vw) + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule B + co, _, _, _, _, vc = s[kernel_vec].op.axis + s[kernel_vec].vectorize(vc) + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + 
s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule C + n, co, h, w = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, co, oh, ow, vh, vw, vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[last].split(co, bc) + oaxis = oco + paxis = ico + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + return s + _, IH, IW, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + _, OH, OW, _ = output.shape + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, TH, TW, _, _ = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis + + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) + + s[conv_out].unroll(b1) + s[conv_out].unroll(b2) + s[conv_out].vectorize(vc) + + # # Schedule output + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vh, vw, vc) + s[last].vectorize(vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + + if bc == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +####### ARM SPECIFIC ####### +def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + print (out_dtype) + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl) + VH = sch.vh + VW = sch.vw + VC = sch.vc + + data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') + kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + N, H, W, IB, CI = data_q.shape + OCO, KH, KW, KB, VC, _ = kernel_vec.shape + + CO = OCO * VC + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) + ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + ib = tvm.reduce_axis((0, IB), name='ib') + kb = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + return tvm.sum( + (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci] & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16') + << (kb + ib).astype('int16')), axis=[dh, dw, kb, ib, ci]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), + name='output_vec', tag='spatial_qconv_nhwc') + +def intrin_popcount(m, k_i, w_b, x_b): + type = 'uint8' + w = tvm.placeholder((w_b, m, k_i), dtype=type, name='w') + x = tvm.placeholder((x_b, k_i,), dtype=type, name='x') + k = tvm.reduce_axis((0, k_i), name='k') + bw = tvm.reduce_axis((0, w_b), name='bw') + bx = tvm.reduce_axis((0, x_b), name='bx') + z = tvm.compute((m,), lambda i: + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) << (bw+bx).astype('uint16'), + axis=[bw, bx, k]), name='z') + + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=k_i, + strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) + Xb = tvm.decl_buffer(x.shape, x.dtype, + name="X", + offset_factor=k_i, + strides=[tvm.var('ldw'), 1]) + + + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + vpadd_id = tvm.const(647, 'uint32') + vpadalu_id = tvm.const(646, 'uint32') + args_1 = tvm.const(1, 'uint32') + args_2 = tvm.const(2, 'uint32') + + def instr(index): + irb = tvm.ir_builder.create() + if index == 1: + irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) + else: + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = 
tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki ==8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) + return irb.get() + # body, reset, update + return instr(0), instr(1), instr(2) + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + + +# ARM specific schedule that using custom microkernel +def arm_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, last): + # no stride and padding info here + _, H, W, IB, CI = data_q.shape + KH, KW, KB, _, CO = kernel_q.shape + KB = get_const_int(KB) + IB = get_const_int(IB) + + if data_pad is None: + padding = (0,0) + _, in_h, in_w, _ , _ = data_q.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + else: + _, in_h, in_w, _, _ = data_q.shape + _, pad_h, pad_w, _, _ = data_pad.shape + hpad = (pad_h - in_h) // 2 + wpad = (pad_w - in_w) // 2 + padding = get_const_int(hpad), get_const_int(wpad) + + _, in_h, in_w, _, _ = data_pad.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _, _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis + + kfactor = sch.kfactor + if sch.split_ci: + oci, ici = s[conv_out].split(ci, kfactor) + 
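+ # Splitting ci by kfactor keeps the innermost (kb, ib, vc, ici) block the
+ # exact shape intrin_popcount(8, kfactor, KB, IB) expects, so the
+ # tensorize(kb, pc) call below can pattern-match it.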
s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) + else: + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) + + pc = intrin_popcount(8, kfactor, KB, IB) + s[conv_out].tensorize(kb, pc) + + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vc, vh, vw) + s[last].vectorize(vw) + if last != output: + s[last].compute_inline() + + s[conv_out].compute_at(s[last], ow) + if co == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s = s.normalize() + return s + + +# @generic.schedule_qconv2d_nhwc.register(["rasp"]) +def schedule_qconv2d_nhwc(outs): + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'spatial_qconv_nhwc' in op.tag: + # print "spatial" + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[0] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + kernel = kernel.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[1] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/x86/qconv2d.py b/topi/python/topi/x86/qconv2d.py new file mode 100644 index 000000000000..1375c5436734 --- /dev/null +++ b/topi/python/topi/x86/qconv2d.py @@ -0,0 +1,405 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""QConv2D schedule on x86""" +import tvm +from .. import generic, tag +from .. 
import nn +from ..nn.util import infer_pad, infer_stride +from topi.util import simplify, get_const_int +from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule +from ..nn.qconv2d import QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC +from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT +from ..nn.qconv2d import _get_workload + + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), + QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), + QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), + QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), + QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), +] + +_QUANTIZED_SCHEDULES_NHWC = [ + # resnet + QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), + QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), + QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), + QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), + QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), +] + +@_get_schedule.register("cpu") +def _get_schedule_qconv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@_qconv2d.register("cpu") +def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, dorefa) + +@generic.schedule_qconv2d_nchw.register(["cpu"]) +@generic.schedule_qconv2d_nhwc.register(["cpu"]) +def schedule_qconv2d(outs): + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + output = op.output(0) + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + 
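+ # (bitpack names its per-bit compute "QuantizeInput" and, for bits > 1,
+ # combines the bit-planes with a concatenate, so the raw tensor sits one
+ # op further up the graph)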
kernel = kernel.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + if 'spatial_qconv_nchw' in op.tag: + _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_qconv_nhwc' in op.tag: + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + else: + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if 'conv2d_nchw_q' in op.tag: + _schedule_conv2d_nchw_q(s, data, data_q, data_pad, kernel, output) + elif 'conv2d_nhwc_q' in op.tag: + _schedule_conv2d_nhwc_q(s, data, data_q, data_pad, kernel, output) + + + traverse(outs[0].op) + return s + + +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): + IB, _, CI, IH, IW = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + _, _, OH, OW = output.shape + + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, _, _, TH, TW = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + CC = s.cache_write(conv_out, "global") + + n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis + s[conv_out].vectorize(vc) + + s[CC].compute_at(s[conv_out], ow) + n, co, oh, ow, vh, vw, vc = s[CC].op.axis + ci, dh, dw, b1, b2 = s[CC].op.reduce_axis + s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) + s[CC].unroll(b1) + s[CC].unroll(b2) + s[CC].vectorize(vc) + + ##### Schedule A + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , vw = s[data_vec].op.axis + s[data_vec].vectorize(vw) + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule B + co, _, _, _, _, vc = s[kernel_vec].op.axis + s[kernel_vec].vectorize(vc) + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule C + n, co, h, w = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, co, oh, ow, vh, vw, vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[last].split(co, bc) + oaxis = oco + paxis = ico + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + 
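+ # Descriptive note: these pragmas hint TVM's CPU codegen to launch the
+ # thread pool at the outer axis, split iterations among threads in a strided
+ # pattern, and synchronize when the parallel loop finishes.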
s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + # no stride and padding info here + _, IH, IW, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + _, OH, OW, _ = output.shape + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, TH, TW, _, _ = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis + + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) + + s[conv_out].unroll(b1) + s[conv_out].unroll(b2) + s[conv_out].vectorize(vc) + + # # Schedule output + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vh, vw, vc) + s[last].vectorize(vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +# Very simple schedules +def schedule_qconv2d_nchw(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + if 'qconv2d_nchw' in op.tag: + output = op.output(0) + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Schedule for padding + n_pad, c_pad, b_pad, h_pad, w_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, c_pad) + s[data_pad].parallel(pad_fused) + + # Schedule for convolution + nn, ff, yy, xx = s[output].op.axis + rc, ry, rx, b2, b1 = s[output].op.reduce_axis + + # Tiling + yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) + fused = s[output].fuse(nn, ff) + s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) + # Vectorize, 
unroll, parallel + s[output].vectorize(xi) + s[output].unroll(b1) + s[output].unroll(b2) + s[output].parallel(fused) + + traverse(outs[0].op) + return s + +def schedule_qconv2d_nhwc(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + if 'qconv2d_nhwc' in op.tag: + output = op.output(0) + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Schedule for padding + n_pad, h_pad, w_pad, c_pad, b_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, h_pad) + s[data_pad].parallel(pad_fused) + + # Schedule for convolution + nn, yy, xx, ff = s[output].op.axis + ry, rx, rc, b1, b2 = s[output].op.reduce_axis + + # Tiling + xo, fo, xi, fi = s[output].tile(xx, ff, 4, 4) + fused = s[output].fuse(nn, yy) + s[output].reorder(fused, xo, fo, ry, rx, xi, rc, b1, b2, fi) + # Vectorize, unroll, parallel + s[output].vectorize(fi) + s[output].unroll(b1) + s[output].unroll(b2) + s[output].parallel(fused) + traverse(outs[0].op) + return s From 74660b1c71491513673b4018abc492b4a912557c Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 18 Jun 2018 23:28:46 -0700 Subject: [PATCH 06/11] rename qconv->bitserial_conv --- topi/python/topi/generic/nn.py | 20 +- topi/python/topi/nn/__init__.py | 3 +- .../nn/{qconv2d.py => bitserial_conv2d.py} | 132 ++-- topi/python/topi/rasp/__init__.py | 2 +- topi/python/topi/rasp/bitserial_conv2d.py | 360 ++++++++++ topi/python/topi/rasp/qconv2d.py | 619 ------------------ topi/python/topi/x86/__init__.py | 3 +- .../x86/{qconv2d.py => bitserial_conv2d.py} | 124 ++-- .../python/test_topi_bitserial_conv2d.py | 109 +++ .../python/test_topi_bitserial_conv2d_rasp.py | 132 ++++ 10 files changed, 751 insertions(+), 753 deletions(-) rename topi/python/topi/nn/{qconv2d.py => bitserial_conv2d.py} (75%) create mode 100644 topi/python/topi/rasp/bitserial_conv2d.py delete mode 100644 topi/python/topi/rasp/qconv2d.py rename topi/python/topi/x86/{qconv2d.py => bitserial_conv2d.py} (79%) create mode 100644 topi/tests/python/test_topi_bitserial_conv2d.py create mode 100644 topi/tests/python/test_topi_bitserial_conv2d_rasp.py diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index bb81c37ad285..1a85e5818462 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -53,6 +53,22 @@ def schedule_conv2d_nhwc(outs): """ return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_qdense(outs): + """Schedule for qdense + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qdense + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs, False) @tvm.target.generic_func def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, padding, outs): @@ -133,7 +149,7 @@ def schedule_depthwise_conv2d_nhwc(outs): return _default_schedule(outs, False) @tvm.target.generic_func -def schedule_qconv2d_nchw(outs): +def schedule_bitserial_conv2d_nchw(outs): """Schedule for qconv2d_nchw Parameters @@ -151,7 +167,7 @@ def schedule_qconv2d_nchw(outs): @tvm.target.generic_func -def schedule_qconv2d_nhwc(outs): +def schedule_bitserial_conv2d_nhwc(outs): """Schedule for qconv2d_nhwc Parameters diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index e968bd68c927..2c17e0540477 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,4 +17,5 @@ from .upsampling import * from .local_response_norm import * from .l2_norm import * -from .qconv2d import * \ No newline at end of file +from .bitserial_conv2d import * +from .qdense import * \ No newline at end of file diff --git a/topi/python/topi/nn/qconv2d.py b/topi/python/topi/nn/bitserial_conv2d.py similarity index 75% rename from topi/python/topi/nn/qconv2d.py rename to topi/python/topi/nn/bitserial_conv2d.py index 820a92bc9ff1..e51577563498 100644 --- a/topi/python/topi/nn/qconv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -5,25 +5,18 @@ import tvm from .pad import pad from .util import get_pad_tuple, bitpack -from ..util import simplify, get_const_int, get_const_tuple -import numpy as np - +from ..util import simplify, get_const_tuple # workload description of qconv2d Workload = namedtuple('Workload', ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -QuantizedSpatialPackNCHW = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) - -QuantizedSpatialPackNHWC= namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) - -# RPI version - broken right now -RaspQuantizedSpatialPack = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) +SpatialPackNCHW = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) +SpatialPackNHWC = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) _WORKLOADS = [ # workloads of resnet18 on imagenet @@ -39,17 +32,23 @@ Workload('uint32', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), Workload('uint32', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), Workload('uint32', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + + # workload of alexnet on cifar10 + Workload('int32', 'int32', 27, 27, 96, 192, 5, 5, 2, 2, 1, 1), + Workload('int32', 'int32', 13, 13, 192, 384, 3, 3, 1, 1, 1, 1), + Workload('int32', 'int32', 13, 13, 384, 384, 3, 3, 1, 1, 1, 1), + Workload('int32', 'int32', 13, 13, 384, 256, 3, 3, 1, 1, 1, 1), ] @tvm.target.generic_func -def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype='uint32', out_dtype='int32', dorefa=True): +def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype='uint32', out_dtype='int32', dorefa=True): """Conv2D operator. 
Parameters ---------- input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or + 4-D with shape [batch, in_channel, in_height, in_width] or [batch, in_height, in_width, in_channel] filter : tvm.Tensor @@ -73,7 +72,7 @@ def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout pack_dtype: str bit packing type - + dorefa: bool method of preforming popcount @@ -85,13 +84,12 @@ def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout # search platform specific declaration first # default declaration if layout == 'NCHW': - return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, - out_dtype=out_dtype, dorefa=dorefa) + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) elif layout == 'NHWC': - return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, - out_dtype=out_dtype, dorefa=dorefa) - else: - raise ValueError("not support this layout {} yet".format(layout)) + return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) + raise ValueError("not support this layout {} yet".format(layout)) def _get_workload(data, kernel, stride, padding, out_dtype, layout): """ Get the workload structure. """ @@ -109,7 +107,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, layout): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) @tvm.target.generic_func @@ -123,7 +121,8 @@ def _get_schedule(wkl, layout): return wkl -def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): +def bitserial_conv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, + out_dtype='int32', pack_type='uint32'): assert isinstance(stride, int) or len(stride) == 2 Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) @@ -153,16 +152,16 @@ def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, o def _conv(nn, ff, yy, xx): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum( - (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]).astype(out_dtype) - - return tvm.compute((batch, out_channel, out_height, out_width), _conv, - name="QConv2dOutput", tag="qconv2d_nchw") + return tvm.sum((tvm.popcount( + PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & + Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]).astype(out_dtype) + return tvm.compute((batch, out_channel, out_height, out_width), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nchw") -def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): +def bitserial_conv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, + out_dtype='int32', pack_type='uint32'): assert isinstance(stride, int) or len(stride) == 2 Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) 
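The compute rules in this file reduce a multi-bit convolution to AND plus popcount over bit planes: bitserial_conv2d_nchw/nhwc shift each popcount by b1 + b2, and the dorefa branch of the spatial-pack declarations below subtracts popcount(a & ~w) to handle weights interpreted as {-1, +1}. A minimal NumPy sketch of that arithmetic follows; the helper names (bit_plane, ref_bitserial_dot, ref_dorefa_plane) are illustrative and not part of this patch.

    # Illustrative reference only (not part of the patch).
    import numpy as np

    def bit_plane(x, bit):
        # 0/1 array holding bit `bit` of every element
        return (x >> bit) & 1

    def ref_bitserial_dot(a, w, a_bits, w_bits):
        # sum_{i,j} popcount(a_plane_i & w_plane_j) << (i + j)
        acc = 0
        for i in range(a_bits):
            for j in range(w_bits):
                acc += int((bit_plane(a, i) & bit_plane(w, j)).sum()) << (i + j)
        return acc

    def ref_dorefa_plane(a_plane, w_plane):
        # per-plane product when the weight bit encodes {-1, +1} as {0, 1}
        return int((a_plane & w_plane).sum()) - int((a_plane & (1 - w_plane)).sum())

    rng = np.random.RandomState(0)
    a = rng.randint(0, 4, size=64).astype(np.uint32)   # 2-bit activations
    w = rng.randint(0, 2, size=64).astype(np.uint32)   # 1-bit weights
    assert ref_bitserial_dot(a, w, a_bits=2, w_bits=1) == int(np.dot(a, w))

    a0 = bit_plane(a, 0)
    w_signed = 2 * w.astype(np.int64) - 1              # bit 1 -> +1, bit 0 -> -1
    assert ref_dorefa_plane(a0, w) == int(np.dot(a0, w_signed))

The same identity is what bitpack plus the popcount reductions in the declarations here implement, one bit plane per packed word.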
@@ -189,16 +188,17 @@ def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, o b2 = tvm.reduce_axis((0, weight_bits), name='b2') def _conv(nn, yy, xx, ff): - return tvm.sum( - (tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - Filter_q[ry, rx, rc, ff, b2])<< b1b2).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]) - - return tvm.compute( (batch, out_height, out_width, out_channel), _conv, - name="QConv2dOutput", tag="qconv2d_nhwc") + b1b2 = (b1+b2).astype(out_dtype) + return tvm.sum((tvm.popcount( + PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + Filter_q[ry, rx, rc, ff, b2]) << b1b2).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]) + return tvm.compute((batch, out_height, out_width, out_channel), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nhwc") -def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): +def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. """ assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) @@ -251,31 +251,31 @@ def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_ dw = tvm.reduce_axis((0, KW), name='dw') b1 = tvm.reduce_axis((0, IB), name='ib') b2 = tvm.reduce_axis((0, KB), name='kb') - + def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: - return tvm.sum( - (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc]) - - tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) - else: - return tvm.sum( - (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc]) - + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, axis=[ci, dh, dw, b1, b2]) + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + conv = tvm.compute(ovshape, _conv, name='conv_out') return tvm.compute(oshape, lambda n, co, h, w: - conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], - name='conv_vec', tag='spatial_qconv_nchw') - + conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], + name='conv_vec', tag='spatial_bitserial_conv_nchw') - -def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): +def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. 
""" assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) @@ -326,25 +326,25 @@ def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_ def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: - return tvm.sum( + return tvm.sum( (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]) - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2]) - else: - return tvm.sum( - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, - axis=[dh, dw, ci, b1, b2]) + + return tvm.sum(tvm.popcount( + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv') return tvm.compute(oshape, lambda n, h, w, co: - conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], - name='output_unpack', tag='spatial_qconv_nhwc') + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], + name='output_unpack', tag='spatial_bitserial_conv_nhwc') _SCH_TO_DECL_FUNC_QUANT = { - QuantizedSpatialPackNCHW: spatial_pack_nchw, - QuantizedSpatialPackNHWC: spatial_pack_nhwc, + SpatialPackNCHW: spatial_pack_nchw, + SpatialPackNHWC: spatial_pack_nhwc, } diff --git a/topi/python/topi/rasp/__init__.py b/topi/python/topi/rasp/__init__.py index 8000e752c9ec..270a48504468 100644 --- a/topi/python/topi/rasp/__init__.py +++ b/topi/python/topi/rasp/__init__.py @@ -4,4 +4,4 @@ from .conv2d import schedule_conv2d_nchw from .depthwise_conv2d import schedule_depthwise_conv2d_nchw -from .qconv2d import schedule_qconv2d +from .bitserial_conv2d import schedule_bitserial_conv2d_nhwc diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py new file mode 100644 index 000000000000..03aa1e1fc418 --- /dev/null +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -0,0 +1,360 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""Bitserial conv2d schedule on raspberry pi""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm +from .. import tag +from ..nn.pad import pad +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import SpatialPackNCHW, _WORKLOADS, spatial_pack_nchw +from ..nn.util import get_pad_tuple, bitpack +from ..util import get_const_int +from .. 
import generic + +RaspSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) + +_QUANTIZED_SCHEDULES_NHWC = [ + RaspSpatialPack(2, 2, 8, 1, 1, False, 8), + RaspSpatialPack(1, 4, 8, 4, 1, False, 8), + RaspSpatialPack(1, 4, 8, 1, 16, False, 8), + RaspSpatialPack(1, 4, 8, 4, 8, False, 8), + RaspSpatialPack(1, 7, 8, 3, 8, False, 16), + RaspSpatialPack(1, 2, 8, 1, 8, False, 16), + RaspSpatialPack(2, 1, 8, 1, 4, False, 16), + RaspSpatialPack(1, 7, 8, 1, 1, True, 16), + RaspSpatialPack(1, 1, 8, 1, 16, True, 16), + RaspSpatialPack(1, 1, 8, 1, 8, True, 16), + RaspSpatialPack(1, 1, 8, 1, 16, True, 16), +] + +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + SpatialPackNCHW(2, 2, 8, 1, 1), + SpatialPackNCHW(1, 4, 8, 4, 1), + SpatialPackNCHW(1, 4, 8, 1, 16), + SpatialPackNCHW(1, 4, 8, 4, 8), + SpatialPackNCHW(1, 7, 8, 3, 8), + SpatialPackNCHW(1, 2, 8, 1, 8), + SpatialPackNCHW(2, 1, 8, 1, 4), + SpatialPackNCHW(1, 7, 8, 1, 1), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 8), + SpatialPackNCHW(1, 1, 8, 1, 16), +] + +@_get_schedule.register("rasp") +def _get_schedule_bitserial_conv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@bitserial_conv2d.register("rasp") +def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + if layout == "NCHW": + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) + # TODO: Doesn't support dorefa style yet + return _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, + weight_bits, out_dtype) + +def kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): + kernel_q = bitpack(kernel, kernel_bits, pack_axis=2, bit_axis=2, pack_type='uint8') + KH, KW, KB, CI, CO = kernel_q.shape + kvshape = (CO//VC, KH, KW, KB, VC, CI) + return tvm.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ + kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') + +def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') + kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + N, H, W, IB, CI = data_q.shape + OCO, KH, KW, KB, VC, _ = kernel_vec.shape + + CO = OCO * VC + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) + ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + ib = tvm.reduce_axis((0, IB), name='ib') + kb = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + return tvm.sum((tvm.popcount( + kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16')) + << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), + name='output_vec', tag='spatial_bitserial_conv_nhwc') + +def intrin_popcount(m, k_i, w_b, x_b): + dtype = 'uint8' + w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') + x = tvm.placeholder((x_b, k_i,), dtype=dtype, name='x') + k = tvm.reduce_axis((0, k_i), name='k') + bw = tvm.reduce_axis((0, w_b), name='bw') + bx = tvm.reduce_axis((0, x_b), name='bx') + z = tvm.compute((m,), lambda i: + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) + << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') + + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=k_i, + strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) + Xb = tvm.decl_buffer(x.shape, x.dtype, + name="X", + offset_factor=k_i, + strides=[tvm.var('ldw'), 1]) + + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + vpadd_id = tvm.const(647, 'uint32') + vpadalu_id = tvm.const(646, 'uint32') + args_1 = tvm.const(1, 'uint32') + args_2 = tvm.const(2, 'uint32') + + def instr(index): + irb = tvm.ir_builder.create() + if index == 1: + irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) + else: + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + 
cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki ==8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) + return irb.get() + # body, reset, update + return instr(0), instr(1), instr(2) + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + + +# ARM specific schedule that using custom microkernel +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + # no stride and padding info here + _, H, W, IB, CI = data_q.shape + KH, KW, KB, _, CO = kernel_q.shape + KB = get_const_int(KB) + IB = get_const_int(IB) + + if data_pad is None: + padding = (0, 0) + _, in_h, in_w, _, _ = data_q.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + else: + _, in_h, in_w, _, _ = data_q.shape + _, pad_h, pad_w, _, _ = data_pad.shape + hpad = (pad_h - in_h) // 2 + wpad = (pad_w - in_w) // 2 + padding = get_const_int(hpad), get_const_int(wpad) + + _, in_h, in_w, _, _ = data_pad.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _, _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis + + kfactor = sch.kfactor + if sch.split_ci: + oci, ici = s[conv_out].split(ci, 
kfactor) + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) + else: + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) + + pc = intrin_popcount(8, kfactor, KB, IB) + s[conv_out].tensorize(kb, pc) + + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vc, vh, vw) + s[last].vectorize(vw) + if last != output: + s[last].compute_inline() + + s[conv_out].compute_at(s[last], ow) + if co == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s = s.normalize() + return s + + +@generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) +def schedule_bitserial_conv2d_nhwc(outs): + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'spatial_bitserial_conv_nhwc' in op.tag: + # print "spatial" + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[0] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + kernel = kernel.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[1] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/rasp/qconv2d.py b/topi/python/topi/rasp/qconv2d.py deleted file mode 100644 index b0f7fcb011fe..000000000000 --- a/topi/python/topi/rasp/qconv2d.py +++ /dev/null @@ -1,619 +0,0 @@ -# pylint: disable=invalid-name,unused-variable,invalid-name -"""QConv2D schedule on raspberry pi""" -from __future__ import absolute_import as _abs -import tvm -from tvm import target as _target -from .. import tag -from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule -from ..nn.qconv2d import RaspQuantizedSpatialPack, QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC -from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT -from ..nn.qconv2d import _get_workload -from ..nn.util import infer_pad, infer_stride -from ..util import simplify, get_const_int - -from .. 
import generic - -# TODO grab the number from autotuner -_QUANTIZED_SCHEDULES = [ - RaspQuantizedSpatialPack(2, 2, 8, 1, 1, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 4, 1, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 1, 16, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 4, 8, False, 8), - RaspQuantizedSpatialPack(1, 7, 8, 3, 8, False, 16), - RaspQuantizedSpatialPack(1, 2, 8, 1, 8, False, 16), - RaspQuantizedSpatialPack(2, 1, 8, 1, 4, False, 16), - RaspQuantizedSpatialPack(1, 7, 8, 1, 1, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 8, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), -] - -# TODO grab the number from autotuner -_QUANTIZED_SCHEDULES_NCHW = [ - # resnet - QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), - QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), - QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), - QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), - QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), -] - -_QUANTIZED_SCHEDULES_NHWC = [ - # resnet - QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), - QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), - QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), - QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), - QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), -] - - -@_get_schedule.register("rasp") -def _get_schedule_qconv2d(wkl, layout): - if wkl not in _WORKLOADS: - raise ValueError("no schedule for such workload: {}".format(wkl)) - idx = _WORKLOADS.index(wkl) - if layout == "NCHW": - sch = _QUANTIZED_SCHEDULES_NCHW[idx] - elif layout == "NHWC": - sch = _QUANTIZED_SCHEDULES_NHWC[idx] - return sch - - -@_qconv2d.register("rasp") -def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype=None, out_dtype=None, dorefa=False): - if out_dtype is None: - out_dtype = data.dtype - assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" - wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) - sch = _get_schedule(wkl, layout) - return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, dorefa) - -# TODO: is there a better way to share these with x86? 
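For reference, the reduction that intrin_popcount in the new topi/python/topi/rasp/bitserial_conv2d.py above tensorizes (through the NEON vpadd/vpadalu pairwise-add intrinsic ids) is equivalent to the scalar loop below. This is a plain NumPy sketch for checking results; the name ref_popcount_microkernel is illustrative and not part of this patch.

    # Illustrative reference only (not part of the patch).
    import numpy as np

    def ref_popcount_microkernel(w, x):
        # w: (w_b, m, k_i) uint8 weight bits, x: (x_b, k_i) uint8 activation bits
        # z[i] = sum_{bw, bx, k} popcount(w[bw, i, k] & x[bx, k]) << (bw + bx)
        w_b, m, k_i = w.shape
        x_b = x.shape[0]
        z = np.zeros(m, dtype=np.uint32)
        for bw in range(w_b):
            for bx in range(x_b):
                for i in range(m):
                    ands = w[bw, i] & x[bx]
                    cnt = sum(bin(int(v)).count('1') for v in ands)
                    z[i] += cnt << (bw + bx)
        return z.astype(np.uint16)   # matches the uint16x8 accumulator in the intrinsic

    rng = np.random.RandomState(0)
    w = rng.randint(0, 256, size=(2, 8, 16), dtype=np.uint8)   # w_b=2, m=8, k_i=16
    x = rng.randint(0, 256, size=(1, 16), dtype=np.uint8)      # x_b=1
    print(ref_popcount_microkernel(w, x))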
- -@generic.schedule_qconv2d_nchw.register(["rasp"]) -@generic.schedule_qconv2d_nhwc.register(["rasp"]) -def schedule_qconv2d(outs): - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - output = op.output(0) - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[1] - kernel_q = kernel_vec.op.input_tensors[0] - kernel = kernel_q.op.input_tensors[0] - data_vec = conv_out.op.input_tensors[0] - data_q = data_vec.op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Need to go up 1 further, from the combine in bitpack - if "QuantizeInput" in kernel.op.name: - kernel = kernel.op.input_tensors[0] - if "QuantizeInput" in data.op.name: - data = data.op.input_tensors[0] - - if 'spatial_qconv_nchw' in op.tag: - _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - elif 'spatial_qconv_nhwc' in op.tag: - _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - - traverse(outs[0].op) - return s - - -def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): - IB, _, CI, IH, IW = data_q.shape - KB, CO, _, KH, KW = kernel_q.shape - _, _, OH, OW = output.shape - - # Infer padding and stride - if data_pad is None: - padding = (0, 0) - TH, TW = IH, IW - else: - _, _, _, TH, TW = data_pad.shape - hpad = get_const_int((TH - IH) // 2) - wpad = get_const_int((TW - IW) // 2) - padding = (hpad, wpad) - - hstride = get_const_int((TH - KH) // (OH - 1)) - wstride = get_const_int((TW - KW) // (OW - 1)) - stride = (hstride, wstride) - - wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NCHW") - sch = _get_schedule(wkl, "NCHW") - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - CC = s.cache_write(conv_out, "global") - - n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis - s[conv_out].vectorize(vc) - - s[CC].compute_at(s[conv_out], ow) - n, co, oh, ow, vh, vw, vc = s[CC].op.axis - ci, dh, dw, b1, b2 = s[CC].op.reduce_axis - s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) - s[CC].unroll(b1) - s[CC].unroll(b2) - s[CC].vectorize(vc) - - ##### Schedule A - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _ , vw = s[data_vec].op.axis - s[data_vec].vectorize(vw) - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule B - co, _, _, _, _, vc = s[kernel_vec].op.axis - s[kernel_vec].vectorize(vc) - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - 
s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule C - n, co, h, w = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, co, oh, ow, vh, vw, vc) - if last != output: - s[output].compute_inline() - s[conv_out].compute_at(s[last], ow) - - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[last].split(co, bc) - oaxis = oco - paxis = ico - - s[last].parallel(paxis) - s[last].pragma(oaxis, "parallel_launch_point") - s[last].pragma(paxis, "parallel_stride_pattern") - s[last].pragma(oaxis, "parallel_barrier_when_finish") - - return s - -def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, last): - return s - _, IH, IW, CI, IB = data_q.shape - KH, KW, _, CO, KB = kernel_q.shape - _, OH, OW, _ = output.shape - # Infer padding and stride - if data_pad is None: - padding = (0, 0) - TH, TW = IH, IW - else: - _, TH, TW, _, _ = data_pad.shape - hpad = get_const_int((TH - IH) // 2) - wpad = get_const_int((TW - IW) // 2) - padding = (hpad, wpad) - - hstride = get_const_int((TH - KH) // (OH - 1)) - wstride = get_const_int((TW - KW) // (OW - 1)) - stride = (hstride, wstride) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") - sch = _get_schedule(wkl, "NHWC") - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - ##### Schedule data packing - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _ , _ = s[data_vec].op.axis - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule kernel packing - co, _, _, _, _, _ = s[kernel_vec].op.axis - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule Convolution - n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis - dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis - - s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) - - s[conv_out].unroll(b1) - s[conv_out].unroll(b2) - s[conv_out].vectorize(vc) - - # # Schedule output - n, h, w, co = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, oh, ow, co, vh, vw, vc) - s[last].vectorize(vc) - if last != output: - s[output].compute_inline() - s[conv_out].compute_at(s[last], ow) - - - if bc == 1: - oaxis = oh - paxis = oh - else: - oho, iho = s[last].split(oh, bc) - oaxis = oho - paxis = iho - - s[last].parallel(paxis) - s[last].pragma(oaxis, "parallel_launch_point") - s[last].pragma(paxis, "parallel_stride_pattern") - s[last].pragma(oaxis, "parallel_barrier_when_finish") - - return s - -####### ARM SPECIFIC ####### -def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): - """ Compute convolution with pack on spatial axes. 
""" - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - print (out_dtype) - wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") - sch = _get_schedule(wkl) - VH = sch.vh - VW = sch.vw - VC = sch.vc - - data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') - kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) - N, H, W, IB, CI = data_q.shape - OCO, KH, KW, KB, VC, _ = kernel_vec.shape - - CO = OCO * VC - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - - PAD_H = H + 2*HPAD - PAD_W = W + 2*WPAD - OH = (H + 2*HPAD - KH) // HSTR + 1 - OW = (W + 2*WPAD - KW) // WSTR + 1 - dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) - ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) - oshape = (1, OH, OW, CO) - - if (HPAD != 0 and WPAD != 0): - data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - ib = tvm.reduce_axis((0, IB), name='ib') - kb = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, h, w, co, vh, vw, vc): - return tvm.sum( - (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci] & - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16') - << (kb + ib).astype('int16')), axis=[dh, dw, kb, ib, ci]) - - conv = tvm.compute(ovshape, _conv, name='conv') - - return tvm.compute(oshape, lambda n, h, w, co: - conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), - name='output_vec', tag='spatial_qconv_nhwc') - -def intrin_popcount(m, k_i, w_b, x_b): - type = 'uint8' - w = tvm.placeholder((w_b, m, k_i), dtype=type, name='w') - x = tvm.placeholder((x_b, k_i,), dtype=type, name='x') - k = tvm.reduce_axis((0, k_i), name='k') - bw = tvm.reduce_axis((0, w_b), name='bw') - bx = tvm.reduce_axis((0, x_b), name='bx') - z = tvm.compute((m,), lambda i: - tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) << (bw+bx).astype('uint16'), - axis=[bw, bx, k]), name='z') - - Wb = tvm.decl_buffer(w.shape, w.dtype, - name="W", - offset_factor=k_i, - strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) - Xb = tvm.decl_buffer(x.shape, x.dtype, - name="X", - offset_factor=k_i, - strides=[tvm.var('ldw'), 1]) - - - def intrin_func(ins, outs): - ww, xx = ins - zz = outs[0] - vpadd_id = tvm.const(647, 'uint32') - vpadalu_id = tvm.const(646, 'uint32') - args_1 = tvm.const(1, 'uint32') - args_2 = tvm.const(2, 'uint32') - - def instr(index): - irb = tvm.ir_builder.create() - if index == 1: - irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) - else: - cnts8 = [None] * 8 - cnts4 = [None] * 4 - cnts2 = [None] * 2 - for bw in range(w_b): - for bx in range(x_b): - if k_i == 16: - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') - cnts = tvm.popcount(ands) - upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) - lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) - cnts8[i] = upper_half + lower_half - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = 
tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - else: # ki ==8 - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') - cnts8[i] = tvm.popcount(ands) - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - irb.emit(zz.vstore(0, out)) - return irb.get() - # body, reset, update - return instr(0), instr(1), instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) - - -# ARM specific schedule that using custom microkernel -def arm_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, conv_out, output, last): - # no stride and padding info here - _, H, W, IB, CI = data_q.shape - KH, KW, KB, _, CO = kernel_q.shape - KB = get_const_int(KB) - IB = get_const_int(IB) - - if data_pad is None: - padding = (0,0) - _, in_h, in_w, _ , _ = data_q.shape - kern_h, kern_w, _, _ = kernel.shape - _, out_h, out_w, _ = output.shape - hstride = (in_h - kern_h) // (out_h - 1) - wstride = (in_w - kern_w) // (out_w - 1) - stride = get_const_int(hstride), get_const_int(wstride) - else: - _, in_h, in_w, _, _ = data_q.shape - _, pad_h, pad_w, _, _ = data_pad.shape - hpad = (pad_h - in_h) // 2 - wpad = (pad_w - in_w) // 2 - padding = get_const_int(hpad), get_const_int(wpad) - - _, in_h, in_w, _, _ = data_pad.shape - kern_h, kern_w, _, _ = kernel.shape - _, out_h, out_w, _ = output.shape - hstride = (in_h - kern_h) // (out_h - 1) - wstride = (in_w - kern_w) // (out_w - 1) - stride = get_const_int(hstride), get_const_int(wstride) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") - sch = _get_schedule(wkl, "NHWC") - - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - ##### Schedule data packing - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _, _ = s[data_vec].op.axis - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule kernel packing - co, _, _, _, _, _ = s[kernel_vec].op.axis - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule Convolution - n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis - dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis - - kfactor = sch.kfactor - if sch.split_ci: - oci, ici = s[conv_out].split(ci, kfactor) - 
s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) - else: - s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) - - pc = intrin_popcount(8, kfactor, KB, IB) - s[conv_out].tensorize(kb, pc) - - n, h, w, co = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, oh, ow, co, vc, vh, vw) - s[last].vectorize(vw) - if last != output: - s[last].compute_inline() - - s[conv_out].compute_at(s[last], ow) - if co == 1: - oaxis = oh - paxis = oh - else: - oho, iho = s[last].split(oh, bc) - oaxis = oho - paxis = iho - - s[last].parallel(paxis) - s = s.normalize() - return s - - -# @generic.schedule_qconv2d_nhwc.register(["rasp"]) -def schedule_qconv2d_nhwc(outs): - s = tvm.create_schedule([x.op for x in outs]) - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - if 'spatial_qconv_nhwc' in op.tag: - # print "spatial" - output = op.output(0) - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[0] - kernel_q = kernel_vec.op.input_tensors[0] - kernel = kernel_q.op.input_tensors[0] - if "QuantizeInput" in kernel.op.name: - # Need to go up 1 further, from the combine in bitpack - kernel = kernel.op.input_tensors[0] - data_vec = conv_out.op.input_tensors[1] - data_q = data_vec.op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - if "QuantizeInput" in data.op.name: - # Need to go up 1 further, from the combine in bitpack - data = data.op.input_tensors[0] - - _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) - - traverse(outs[0].op) - return s diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 3ee6e6ee34a6..78f18b4ebf7e 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -8,4 +8,5 @@ from .nn import * from .injective import * from .pooling import schedule_pool, schedule_global_pool -from .qconv2d import schedule_qconv2d +from .bitserial_conv2d import schedule_bitserial_conv2d +from .qdense import schedule_qdense diff --git a/topi/python/topi/x86/qconv2d.py b/topi/python/topi/x86/bitserial_conv2d.py similarity index 79% rename from topi/python/topi/x86/qconv2d.py rename to topi/python/topi/x86/bitserial_conv2d.py index 1375c5436734..522e6eb32208 100644 --- a/topi/python/topi/x86/qconv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -1,49 +1,49 @@ # pylint: disable=invalid-name,unused-variable,invalid-name -"""QConv2D schedule on x86""" +"""Bitserial conv2d schedule on x86""" import tvm +from topi.util import get_const_int from .. import generic, tag -from .. 
import nn -from ..nn.util import infer_pad, infer_stride -from topi.util import simplify, get_const_int -from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule -from ..nn.qconv2d import QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC -from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT -from ..nn.qconv2d import _get_workload +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import SpatialPackNCHW, SpatialPackNHWC +from ..nn.bitserial_conv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT - -# TODO grab the number from autotuner _QUANTIZED_SCHEDULES_NCHW = [ # resnet - QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), - QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), - QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), - QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), - QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(2, 2, 8, 1, 1), + SpatialPackNCHW(1, 4, 8, 4, 1), + SpatialPackNCHW(1, 4, 8, 1, 16), + SpatialPackNCHW(1, 4, 8, 4, 8), + SpatialPackNCHW(1, 7, 8, 3, 8), + SpatialPackNCHW(1, 2, 8, 1, 8), + SpatialPackNCHW(2, 1, 8, 1, 4), + SpatialPackNCHW(1, 7, 8, 1, 1), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 8), + SpatialPackNCHW(1, 1, 8, 1, 16), + + SpatialPackNCHW(3, 3, 16, 3, 16), + SpatialPackNCHW(1, 1, 16, 2, 16), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 16), ] _QUANTIZED_SCHEDULES_NHWC = [ # resnet - QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), - QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), - QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), - QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), - QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + SpatialPackNHWC(2, 2, 8, 1, 1), + SpatialPackNHWC(1, 4, 8, 4, 1), + SpatialPackNHWC(1, 4, 8, 1, 16), + SpatialPackNHWC(1, 4, 8, 4, 8), + SpatialPackNHWC(1, 7, 8, 3, 8), + SpatialPackNHWC(1, 2, 8, 1, 8), + SpatialPackNHWC(2, 1, 8, 1, 4), + SpatialPackNHWC(1, 7, 8, 1, 1), + SpatialPackNHWC(1, 1, 8, 1, 16), + SpatialPackNHWC(1, 1, 8, 1, 8), + SpatialPackNHWC(1, 1, 8, 1, 16), ] @_get_schedule.register("cpu") -def _get_schedule_qconv2d(wkl, layout): +def _get_schedule_bitserial_conv2d(wkl, layout): if wkl not in _WORKLOADS: raise ValueError("no schedule for such workload: {}".format(wkl)) idx = _WORKLOADS.index(wkl) @@ -53,10 +53,9 @@ def _get_schedule_qconv2d(wkl, layout): sch = _QUANTIZED_SCHEDULES_NHWC[idx] return sch - -@_qconv2d.register("cpu") -def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype=None, out_dtype=None, dorefa=False): +@bitserial_conv2d.register("cpu") +def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" @@ -64,12 +63,12 @@ def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight wkl = _get_workload(data, kernel, stride, padding, out_dtype, 
layout) sch = _get_schedule(wkl, layout) - return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, dorefa) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, + weight_bits, pack_dtype, out_dtype, dorefa) -@generic.schedule_qconv2d_nchw.register(["cpu"]) -@generic.schedule_qconv2d_nhwc.register(["cpu"]) -def schedule_qconv2d(outs): +@generic.schedule_bitserial_conv2d_nchw.register(["cpu"]) +@generic.schedule_bitserial_conv2d_nhwc.register(["cpu"]) +def schedule_bitserial_conv2d(outs): s = tvm.create_schedule([x.op for x in outs]) def traverse(op): @@ -82,7 +81,7 @@ def traverse(op): if tensor.op.input_tensors: traverse(tensor.op) - elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + elif 'spatial_bitserial_conv_nchw' in op.tag or 'spatial_bitserial_conv_nhwc' in op.tag: conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel_q = kernel_vec.op.input_tensors[0] @@ -102,14 +101,14 @@ def traverse(op): # Need to go up 1 further, from the combine in bitpack data = data.op.input_tensors[0] - if 'spatial_qconv_nchw' in op.tag: + if 'spatial_bitserial_conv_nchw' in op.tag: _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - elif 'spatial_qconv_nhwc' in op.tag: + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_bitserial_conv_nhwc' in op.tag: _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) else: kernel = op.input_tensors[1] data_q = op.input_tensors[0] @@ -120,16 +119,16 @@ def traverse(op): data_q = data data = data_q.op.input_tensors[0] if 'conv2d_nchw_q' in op.tag: - _schedule_conv2d_nchw_q(s, data, data_q, data_pad, kernel, output) + _schedule_conv2d_nchw(s, data, data_q, data_pad, kernel, output) elif 'conv2d_nhwc_q' in op.tag: - _schedule_conv2d_nhwc_q(s, data, data_q, data_pad, kernel, output) - + _schedule_conv2d_nhwc(s, data, data_q, data_pad, kernel, output) traverse(outs[0].op) return s - -def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): IB, _, CI, IH, IW = data_q.shape KB, CO, _, KH, KW = kernel_q.shape _, _, OH, OW = output.shape @@ -147,7 +146,7 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k hstride = get_const_int((TH - KH) // (OH - 1)) wstride = get_const_int((TW - KW) // (OW - 1)) stride = (hstride, wstride) - + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW") sch = _get_schedule(wkl, "NCHW") VH = sch.vh @@ -155,9 +154,8 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k VC = sch.vc ba = sch.ba bc = sch.bc - - CC = s.cache_write(conv_out, "global") + CC = s.cache_write(conv_out, "global") n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis s[conv_out].vectorize(vc) @@ -173,7 +171,7 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k if data_pad is not None: s[data_pad].compute_inline() - _, h, _, _, _, _ , vw = s[data_vec].op.axis + _, h, _, _, _, _, vw = s[data_vec].op.axis s[data_vec].vectorize(vw) if ba == 1: oaxis = h @@ -231,8 +229,8 @@ def 
_schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k return s def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, last): + kernel, kernel_q, kernel_vec, + conv_out, output, last): # no stride and padding info here _, IH, IW, CI, IB = data_q.shape KH, KW, _, CO, KB = kernel_q.shape @@ -263,7 +261,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, if data_pad is not None: s[data_pad].compute_inline() - _, h, _, _, _, _ , _ = s[data_vec].op.axis + _, h, _, _, _, _, _ = s[data_vec].op.axis if ba == 1: oaxis = h paxis = h @@ -357,13 +355,13 @@ def traverse(op): # Tiling yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) fused = s[output].fuse(nn, ff) - s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) + s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) # Vectorize, unroll, parallel s[output].vectorize(xi) s[output].unroll(b1) s[output].unroll(b2) s[output].parallel(fused) - + traverse(outs[0].op) return s diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py new file mode 100644 index 000000000000..b27067d24b6a --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -0,0 +1,109 @@ +import os +import numpy as np +import tvm +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple +from tvm.contrib import rpc, util + + +def generate_quantized_np(shape, bits, out_dtype): + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.create('llvm'): + A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, + out_dtype=out_dtype, layout="NCHW", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] 
= 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nchw_python(a_np.astype(out_dtype), w_, stride, padding) + else: + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], "llvm") + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.create('llvm'): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] = 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], 'llvm') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 2, False) + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) + + +if __name__ == "__main__": + test_bitserial_conv2d(56, 64, 128, 3, 2, 1) + diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py new file mode 100644 index 000000000000..7223b17b9d8d --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -0,0 +1,132 @@ +import os +import numpy as np +import tvm +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple +from tvm.contrib import rpc, util + +def generate_quantized_np(shape, bits, out_dtype): + 
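# Illustrative aside (not part of the patch): why get_ref_data above maps 1-bit weights
# from {0, 1} to {-1, +1} before calling the float reference conv. For single-bit values
# the signed dot product equals popcount(a & w) - popcount(a & ~w), which is exactly the
# subtraction the dorefa branch of the spatial-pack compute performs. Plain NumPy,
# 1-bit case only.
import numpy as np

rng = np.random.RandomState(0)
a = rng.randint(0, 2, size=64)          # 1-bit activations
w = rng.randint(0, 2, size=64)          # 1-bit weights
w_signed = np.where(w == 1, 1, -1)      # the {0,1} -> {-1,+1} mapping used in the test

signed_dot = int(np.dot(a, w_signed))
# (1 - w) plays the role of ~w on single bits
popcount_form = int(np.sum(a & w)) - int(np.sum(a & (1 - w)))
assert signed_dot == popcount_form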
np.random.seed(0) + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + host = '10.77.1.69' + port = 9090 + remote = rpc.connect(host, port) + ctx = remote.cpu(0) + + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NCHW", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] = 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nchw_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], target) + + # upload to rpi + temp = util.tempdir() + path = temp.relpath('qconv_nhwc.o') + func.save(path) + remote.upload(path) + func = remote.load_module('qconv_nhwc.o') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + host = '10.77.1.69' + port = 9090 + remote = rpc.connect(host, port) + ctx = remote.cpu(0) + + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] 
= 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], target) + # Upload to pi + temp = util.tempdir() + path = temp.relpath('qconv_nhwc.o') + func.save(path) + remote.upload(path) + func = remote.load_module('qconv_nhwc.o') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + +def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + +if __name__ == "__main__": + test_bitserial_conv2d(56, 64, 64, 3, 1, 1) + From f8467dde91000a56d29ee9982bdaea1e17a59bd8 Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 18 Jun 2018 23:30:06 -0700 Subject: [PATCH 07/11] remove unused simple compute and schedules --- topi/python/topi/nn/bitserial_conv2d.py | 77 -------------------- topi/python/topi/x86/bitserial_conv2d.py | 89 ------------------------ 2 files changed, 166 deletions(-) diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index e51577563498..d769800ba0a3 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -120,83 +120,6 @@ def _get_schedule(wkl, layout): # This return has no use, merely to supress pylint warning return wkl - -def bitserial_conv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, - out_dtype='int32', pack_type='uint32'): - assert isinstance(stride, int) or len(stride) == 2 - Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) - Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) - batch, in_channel, activation_bits, in_height, in_width = Input_q.shape - num_filter, channel, kernel_h, kernel_w, weight_bits = Filter_q.shape - - pad_top, pad_left, pad_down, pad_right = get_pad_tuple( - padding, (kernel_h, kernel_w)) - pad_before = [0, 0, 0, pad_top, pad_left] - pad_after = [0, 0, 0, pad_down, pad_right] - - PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp") - # compute the output shape - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, 
kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') - - def _conv(nn, ff, yy, xx): - b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( - PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]).astype(out_dtype) - - return tvm.compute((batch, out_channel, out_height, out_width), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nchw") - -def bitserial_conv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, - out_dtype='int32', pack_type='uint32'): - assert isinstance(stride, int) or len(stride) == 2 - Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) - Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) - batch, in_height, in_width, in_channel_q, _ = Input_q.shape - kernel_h, kernel_w, _, num_filter, _ = Filter_q.shape - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) - # compute the output shape - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - pad_before = [0, pad_top, pad_left, 0, 0] - pad_after = [0, pad_down, pad_right, 0, 0] - PadInput_q = pad(Input_q, pad_before, pad_after, name="PaddedInput") - - rc = tvm.reduce_axis((0, in_channel_q), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') - - def _conv(nn, yy, xx, ff): - b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( - PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - Filter_q[ry, rx, rc, ff, b2]) << b1b2).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]) - - return tvm.compute((batch, out_height, out_width, out_channel), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nhwc") - def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. 
""" diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 522e6eb32208..292d96d29e9e 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -109,19 +109,6 @@ def traverse(op): _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) - else: - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - if 'conv2d_nchw_q' in op.tag: - _schedule_conv2d_nchw(s, data, data_q, data_pad, kernel, output) - elif 'conv2d_nhwc_q' in op.tag: - _schedule_conv2d_nhwc(s, data, data_q, data_pad, kernel, output) traverse(outs[0].op) return s @@ -325,79 +312,3 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[last].pragma(oaxis, "parallel_barrier_when_finish") return s - -# Very simple schedules -def schedule_qconv2d_nchw(outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - if 'qconv2d_nchw' in op.tag: - output = op.output(0) - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Schedule for padding - n_pad, c_pad, b_pad, h_pad, w_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, c_pad) - s[data_pad].parallel(pad_fused) - - # Schedule for convolution - nn, ff, yy, xx = s[output].op.axis - rc, ry, rx, b2, b1 = s[output].op.reduce_axis - - # Tiling - yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) - fused = s[output].fuse(nn, ff) - s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) - # Vectorize, unroll, parallel - s[output].vectorize(xi) - s[output].unroll(b1) - s[output].unroll(b2) - s[output].parallel(fused) - - traverse(outs[0].op) - return s - -def schedule_qconv2d_nhwc(outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - if 'qconv2d_nhwc' in op.tag: - output = op.output(0) - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Schedule for padding - n_pad, h_pad, w_pad, c_pad, b_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, h_pad) - s[data_pad].parallel(pad_fused) - - # Schedule for convolution - nn, yy, xx, ff = s[output].op.axis - ry, rx, rc, b1, b2 = s[output].op.reduce_axis - - # Tiling - xo, fo, xi, fi = s[output].tile(xx, ff, 4, 4) - fused = s[output].fuse(nn, yy) - s[output].reorder(fused, xo, fo, ry, rx, xi, rc, b1, b2, fi) - # Vectorize, unroll, parallel - s[output].vectorize(fi) - s[output].unroll(b1) - s[output].unroll(b2) - s[output].parallel(fused) - traverse(outs[0].op) - return s From 74517e183406f8f0bf412bbfad38e976d072b4b9 Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 24 Jun 2018 17:25:04 -0700 Subject: [PATCH 08/11] linting --- topi/python/topi/nn/__init__.py | 1 - topi/python/topi/nn/bitserial_conv2d.py | 38 ++++--- topi/python/topi/rasp/bitserial_conv2d.py | 102 ++++++++++-------- topi/python/topi/x86/__init__.py 
| 1 - topi/python/topi/x86/bitserial_conv2d.py | 6 +- .../python/test_topi_bitserial_conv2d.py | 11 +- .../python/test_topi_bitserial_conv2d_rasp.py | 11 +- 7 files changed, 95 insertions(+), 75 deletions(-) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 2c17e0540477..4caaef5a4d86 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,4 +18,3 @@ from .local_response_norm import * from .l2_norm import * from .bitserial_conv2d import * -from .qdense import * \ No newline at end of file diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index d769800ba0a3..c8d5313770f6 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -1,13 +1,13 @@ -# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument -"""Conv2D operators""" +# pylint: disable=invalid-name, unused-variable, too-many-locals, too-many-arguments, unused-argument +"""Bitserial Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple import tvm from .pad import pad from .util import get_pad_tuple, bitpack -from ..util import simplify, get_const_tuple +from ..util import get_const_tuple -# workload description of qconv2d +# workload description of conv2d Workload = namedtuple('Workload', ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) @@ -16,7 +16,7 @@ ['vh', 'vw', 'vc', 'ba', 'bc']) SpatialPackNHWC = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) + ['vh', 'vw', 'vc', 'ba', 'bc']) _WORKLOADS = [ # workloads of resnet18 on imagenet @@ -43,7 +43,7 @@ @tvm.target.generic_func def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', pack_dtype='uint32', out_dtype='int32', dorefa=True): - """Conv2D operator. + """Bitserial Conv2D operator. 
Parameters ---------- @@ -52,7 +52,8 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits [batch, in_height, in_width, in_channel] filter : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -64,8 +65,10 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits layout of data activation_bits: int + number of bits used for activations/input elements weight_bits: int + number of bits used for weight elements out_dtype: str return type of convolution @@ -74,12 +77,13 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits bit packing type dorefa: bool - method of preforming popcount + preform the bitserial dot-product using 2 popcounts (required for DoReFa-Net) Returns ------- output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] + 4-D with shape [batch, out_channel, out_height, out_width] or + [batch, out_height, out_width, out_channel] """ # search platform specific declaration first # default declaration @@ -181,15 +185,15 @@ def _conv(n, co, h, w, vh, vw, vc): return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc]) - - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) + axis=[ci, dh, dw, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv_out') @@ -251,9 +255,9 @@ def _conv(n, h, w, co, vh, vw, vc): if dorefa: return tvm.sum( (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + kernel_vec[co, dh, dw, ci, vc, b2]) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 03aa1e1fc418..44f7d8f5fc60 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -11,8 +11,8 @@ from ..util import get_const_int from .. 
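# Illustrative aside (not part of the patch): a minimal usage sketch of the operator
# documented above, mirroring what the new CPU tests do. The plain 'llvm' target and the
# NHWC shapes below are example values, not part of the patch.
import tvm
import topi

with tvm.target.create('llvm'):
    A = tvm.placeholder((1, 56, 56, 64), dtype='uint32', name='A')   # NHWC activations
    W = tvm.placeholder((3, 3, 64, 64), dtype='uint32', name='W')    # HWIO weights
    B = topi.nn.bitserial_conv2d(A, W, 1, 1, activation_bits=2, weight_bits=1,
                                 layout='NHWC', pack_dtype='uint32',
                                 out_dtype='int32', dorefa=False)
    s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
    func = tvm.build(s, [A, W, B], 'llvm')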
import generic -RaspSpatialPack = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) +RaspSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) _QUANTIZED_SCHEDULES_NHWC = [ RaspSpatialPack(2, 2, 8, 1, 1, False, 8), @@ -62,16 +62,17 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + if dorefa: + assert layout == "NCHW", "Cannot support dorea with NHWC layout yet" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) sch = _get_schedule(wkl, layout) if layout == "NCHW": return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) - # TODO: Doesn't support dorefa style yet return _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype) -def kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): +def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): kernel_q = bitpack(kernel, kernel_bits, pack_axis=2, bit_axis=2, pack_type='uint8') KH, KW, KB, CI, CO = kernel_q.shape kvshape = (CO//VC, KH, KW, KB, VC, CI) @@ -88,7 +89,7 @@ def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bi VC = sch.vc data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') - kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) N, H, W, IB, CI = data_q.shape OCO, KH, KW, KB, VC, _ = kernel_vec.shape @@ -127,7 +128,7 @@ def _conv(n, h, w, co, vh, vw, vc): return tvm.sum((tvm.popcount( kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16')) - << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) + << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) conv = tvm.compute(ovshape, _conv, name='conv') @@ -135,7 +136,7 @@ def _conv(n, h, w, co, vh, vw, vc): conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), name='output_vec', tag='spatial_bitserial_conv_nhwc') -def intrin_popcount(m, k_i, w_b, x_b): +def _intrin_popcount(m, k_i, w_b, x_b): dtype = 'uint8' w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') x = tvm.placeholder((x_b, k_i,), dtype=dtype, name='x') @@ -143,8 +144,9 @@ def intrin_popcount(m, k_i, w_b, x_b): bw = tvm.reduce_axis((0, w_b), name='bw') bx = tvm.reduce_axis((0, x_b), name='bx') z = tvm.compute((m,), lambda i: - tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) - << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & + x[bx, k].astype('uint16')) + << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", @@ -155,7 +157,7 @@ def intrin_popcount(m, k_i, w_b, x_b): offset_factor=k_i, strides=[tvm.var('ldw'), 1]) - def intrin_func(ins, outs): + def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd_id = tvm.const(647, 'uint32') @@ -163,47 +165,54 @@ def intrin_func(ins, outs): args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') - def instr(index): + def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) - else: - cnts8 = [None] * 8 - cnts4 = 
[None] * 4 - cnts2 = [None] * 2 - for bw in range(w_b): - for bx in range(x_b): - if k_i == 16: - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') - cnts = tvm.popcount(ands) - upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) - lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) - cnts8[i] = upper_half + lower_half - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - else: # ki ==8 - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') - cnts8[i] = tvm.popcount(ands) - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - irb.emit(zz.vstore(0, out)) + return irb.get() + + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m//2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m//4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki == 8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m//2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m//4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update - return instr(0), instr(1), instr(2) + return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb}) # ARM specific schedule that using custom microkernel @@ -293,7 +302,7 @@ def 
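# Illustrative aside (not part of the patch): a plain-NumPy reference for what the
# _intrin_popcount microkernel above computes, z[i] = sum over bit planes bw, bx and
# over k of popcount(w[bw, i, k] & x[bx, k]) << (bw + bx). Useful as a ground truth
# when reworking the vpadd/vpadalu reduction tree. Shapes follow the intrinsic:
# w is (w_b, m, k_i) uint8, x is (x_b, k_i) uint8.
import numpy as np

def popcount_microkernel_ref(w, x):
    w_b, m, k_i = w.shape
    x_b, _ = x.shape
    z = np.zeros(m, dtype=np.uint16)
    for bw in range(w_b):
        for bx in range(x_b):
            ands = w[bw] & x[bx]   # (m, k_i) uint8
            cnts = np.array([sum(bin(int(v)).count('1') for v in row) for row in ands],
                            dtype=np.uint16)   # per-row popcount over k_i bytes
            z += (cnts << (bw + bx)).astype(np.uint16)
    return z

rng = np.random.RandomState(0)
w = rng.randint(0, 256, size=(2, 8, 16)).astype(np.uint8)   # 2 weight bit planes, m=8, k_i=16
x = rng.randint(0, 256, size=(2, 16)).astype(np.uint8)      # 2 activation bit planes
print(popcount_microkernel_ref(w, x))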
_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, else: s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) - pc = intrin_popcount(8, kfactor, KB, IB) + pc = _intrin_popcount(8, kfactor, KB, IB) s[conv_out].tensorize(kb, pc) n, h, w, co = s[last].op.axis @@ -320,6 +329,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, @generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) def schedule_bitserial_conv2d_nhwc(outs): + """Raspverry pi schedule for bitserial conv2d""" s = tvm.create_schedule([x.op for x in outs]) def traverse(op): """Traverse operators from computation graph""" diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 78f18b4ebf7e..c146419fcec9 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -9,4 +9,3 @@ from .injective import * from .pooling import schedule_pool, schedule_global_pool from .bitserial_conv2d import schedule_bitserial_conv2d -from .qdense import schedule_qdense diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 292d96d29e9e..1c01b96f9c30 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -55,7 +55,7 @@ def _get_schedule_bitserial_conv2d(wkl, layout): @bitserial_conv2d.register("cpu") def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, - layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" @@ -69,12 +69,14 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits @generic.schedule_bitserial_conv2d_nchw.register(["cpu"]) @generic.schedule_bitserial_conv2d_nhwc.register(["cpu"]) def schedule_bitserial_conv2d(outs): + """CPU schedule for bitserial convolutions NCHW and NHWC""" s = tvm.create_schedule([x.op for x in outs]) def traverse(op): + """Traverse operators from computation graph""" output = op.output(0) # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag: if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index b27067d24b6a..a494f57551ce 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -90,7 +90,12 @@ def get_ref_data(): func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) -def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) @@ -103,7 +108,5 @@ def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) - if __name__ == "__main__": - test_bitserial_conv2d(56, 64, 128, 3, 2, 1) - + test_bitserial_conv2d() \ No 
newline at end of file diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index 7223b17b9d8d..c1ec95c383ef 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -114,7 +114,12 @@ def get_ref_data(): np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) -def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) @@ -124,9 +129,7 @@ def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) - # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) - # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) if __name__ == "__main__": - test_bitserial_conv2d(56, 64, 64, 3, 1, 1) + test_bitserial_conv2d() From 80d0f081a38c6184b04125e2b1871b8e5870d727 Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 25 Jun 2018 00:37:03 -0700 Subject: [PATCH 09/11] more linting --- HalideIR | 2 +- dmlc-core | 2 +- topi/python/topi/nn/bitserial_conv2d.py | 68 ++++++++++++++++++- topi/python/topi/nn/util.py | 66 ------------------ topi/python/topi/rasp/bitserial_conv2d.py | 9 +-- .../python/test_topi_bitserial_conv2d_rasp.py | 11 ++- 6 files changed, 75 insertions(+), 83 deletions(-) diff --git a/HalideIR b/HalideIR index a3698398faff..0b7e25275138 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 +Subproject commit 0b7e25275138768bb05edb9b9db2c86d0fb09c9a diff --git a/dmlc-core b/dmlc-core index 9b3f9753ae81..e864aa6757cd 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 9b3f9753ae81d657743c555e0cacc4e43f0bed2d +Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3 diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index c8d5313770f6..89cb03182ec6 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -2,10 +2,12 @@ """Bitserial Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple +import numpy as np import tvm +from topi.transform import concatenate from .pad import pad -from .util import get_pad_tuple, bitpack -from ..util import get_const_tuple +from .util import get_pad_tuple +from ..util import get_const_tuple, get_const_int # workload description of conv2d Workload = namedtuple('Workload', @@ -271,6 +273,68 @@ def _conv(n, h, w, co, vh, vw, vc): conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], name='output_unpack', tag='spatial_bitserial_conv_nhwc') +def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): + """Packs data into format necessary for bitserial computation + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data""" + ishape = data.shape + n = len(ishape) + if pack_type == 'uint8': + data_width = 8 + elif pack_type == 'uint16': + data_width 
= 16 + elif pack_type == 'uint32': + data_width = 32 + elif pack_type == 'uint64': + data_width = 64 + + # Data must be in multiples of the data_width + assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" + + shape_vec = list(ishape) + shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) + shape_vec.insert(bit_axis, 1) + bitserial_oshape = tuple(shape_vec) + masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) + + # pack axis shifts if bit axis comes before + if bit_axis <= pack_axis: + pack_axis += 1 + + def _bitpack(*indices): + packed_data = [tvm.const(0, pack_type)] * bits + for k in range(data_width): + # Translate indices for packed data back to original + idx = [0] * n + j = 0 + for i in range(n+1): + if i == bit_axis: + continue + elif i == pack_axis: + idx[j] = indices[i] * data_width + k + else: + idx[j] = indices[i] + j += 1 + + element = data(*idx) + for b in range(bits): + extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) + packed_data[b] = (packed_data[b] | extracted_bit) + if k < data_width - 1: + packed_data[b] = packed_data[b] << 1 + + if k == data_width - 1: + return tuple(packed_data) + return tuple(packed_data) + + output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + + if bits > 1: + return concatenate(output_tuple, axis=bit_axis) + return output_tuple + _SCH_TO_DECL_FUNC_QUANT = { SpatialPackNCHW: spatial_pack_nchw, SpatialPackNHWC: spatial_pack_nhwc, diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index 90497a77c6f9..6264ced76953 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -4,9 +4,6 @@ import tvm from ..util import get_const_int -import numpy as np -from topi.transform import concatenate - def infer_pad(data, data_pad): """Infer the padding from stages in reverse. 
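# Illustrative aside (not part of the patch): what the bitpack layout above buys us, on a
# tiny NumPy example. Each bit position of the 2-bit inputs becomes its own plane, and the
# integer dot product is recovered as shifted popcounts over ANDed planes -- the same sum
# over (b1, b2) that the spatial-pack computes perform. Real bitpack additionally packs
# each plane into uint8/16/32/64 words along pack_axis, omitted here for clarity.
import numpy as np

rng = np.random.RandomState(0)
bits = 2
a = rng.randint(0, 1 << bits, size=32)          # 2-bit activations
w = rng.randint(0, 1 << bits, size=32)          # 2-bit weights

a_planes = [(a >> b) & 1 for b in range(bits)]  # bit plane b of the activations
w_planes = [(w >> b) & 1 for b in range(bits)]  # bit plane b of the weights

bitserial = sum(int(np.sum(a_planes[b1] & w_planes[b2])) << (b1 + b2)
                for b1 in range(bits) for b2 in range(bits))
assert bitserial == int(np.dot(a, w))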
@@ -105,66 +102,3 @@ def get_pad_tuple(padding, kernel): pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left - - -# Packs quantized data into packed bitplanes -# pack_axis = Axis to compress of original tensor -# bit_axis = Axis to place bitplanes in the resulting tensor -# pack_type = Datatype to pack elements into -def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): - ishape = data.shape - n = len(ishape) - if pack_type == 'uint8': - data_width = 8 - elif pack_type == 'uint16': - data_width = 16 - elif pack_type == 'uint32': - data_width = 32 - elif pack_type == 'uint64': - data_width = 64 - - # Data must be in multiples of the data_width - assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" - - shape_vec = list(ishape) - shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) - shape_vec.insert(bit_axis, 1) - bitserial_oshape = tuple(shape_vec) - masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) - - # pack axis shifts if bit axis comes before - if bit_axis <= pack_axis: - pack_axis += 1 - - def _bitpack(*indices): - packed_data = [tvm.const(0, pack_type)] * bits - for k in range(data_width): - # Translate indices for packed data back to original - idx = [0] * n - j = 0 - for i in range(n+1): - if i == bit_axis: - continue - elif i == pack_axis: - idx[j] = indices[i] * data_width + k - else: - idx[j] = indices[i] - j += 1 - - element = data(*idx) - for b in range(bits): - extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) - packed_data[b] = (packed_data[b] | extracted_bit) - if k < data_width - 1 : - packed_data[b] = packed_data[b] << 1 - - if k == data_width - 1: - return tuple(packed_data) - - output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') - - if bits > 1: - return concatenate(output_tuple, axis=bit_axis) - else: - return output_tuple - diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 44f7d8f5fc60..8c023ac46eed 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -5,9 +5,9 @@ import tvm from .. import tag from ..nn.pad import pad -from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload, bitpack from ..nn.bitserial_conv2d import SpatialPackNCHW, _WORKLOADS, spatial_pack_nchw -from ..nn.util import get_pad_tuple, bitpack +from ..nn.util import get_pad_tuple from ..util import get_const_int from .. 
import generic @@ -214,7 +214,6 @@ def _instr(index): with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb}) - # ARM specific schedule that using custom microkernel def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, @@ -274,7 +273,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[data_vec].pragma(paxis, "parallel_stride_pattern") s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - ##### Schedule kernel packing co, _, _, _, _, _ = s[kernel_vec].op.axis if bc == 1: @@ -290,7 +288,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[kernel_vec].pragma(paxis, "parallel_stride_pattern") s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - ##### Schedule Convolution n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis @@ -326,7 +323,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s = s.normalize() return s - @generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) def schedule_bitserial_conv2d_nhwc(outs): """Raspverry pi schedule for bitserial conv2d""" @@ -342,7 +338,6 @@ def traverse(op): traverse(tensor.op) if 'spatial_bitserial_conv_nhwc' in op.tag: - # print "spatial" output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[0] diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index c1ec95c383ef..9e04391baf26 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -56,10 +56,10 @@ def get_ref_data(): # upload to rpi temp = util.tempdir() - path = temp.relpath('qconv_nhwc.o') + path = temp.relpath('conv_nhwc.o') func.save(path) remote.upload(path) - func = remote.load_module('qconv_nhwc.o') + func = remote.load_module('conv_nhwc.o') func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -105,15 +105,14 @@ def get_ref_data(): func = tvm.build(s, [A, W, B], target) # Upload to pi temp = util.tempdir() - path = temp.relpath('qconv_nhwc.o') + path = temp.relpath('conv_nhwc.o') func.save(path) remote.upload(path) - func = remote.load_module('qconv_nhwc.o') + func = remote.load_module('conv_nhwc.o') func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - def test_bitserial_conv2d(): in_size = 56 ic, oc = 64, 64 @@ -128,7 +127,7 @@ def test_bitserial_conv2d(): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) - verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) if __name__ == "__main__": test_bitserial_conv2d() From bb0dc520628c7638a0c281eecd2cb9684fae600b Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 15 Jul 2018 16:40:32 -0700 Subject: [PATCH 10/11] fixing types --- HalideIR | 2 +- topi/python/topi/nn/bitserial_conv2d.py | 16 ++++++++-------- topi/python/topi/rasp/bitserial_conv2d.py | 4 ++-- topi/tests/python/test_topi_bitserial_conv2d.py | 3 +-- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/HalideIR b/HalideIR index 0b7e25275138..9204453ae8de 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit 0b7e25275138768bb05edb9b9db2c86d0fb09c9a +Subproject commit 
9204453ae8de77e7dfc32c4d80f58dd788ad75ff diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 89cb03182ec6..ca2efb0820c1 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -185,11 +185,11 @@ def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc]) - + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & + kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) + & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, axis=[ci, dh, dw, b1, b2]) return tvm.sum((tvm.popcount( @@ -256,10 +256,10 @@ def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: return tvm.sum( - (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & + kernel_vec[co, dh, dw, ci, vc, b2].astype(out_dtype)) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & + ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2, axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 8c023ac46eed..1e5e1bee729a 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -190,7 +190,7 @@ def _instr(index): cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) + shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 @@ -204,7 +204,7 @@ def _instr(index): cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) + shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index 3da905f4d21b..6df18483a45f 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -5,7 +5,7 @@ import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from tvm.contrib import rpc, util +from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize def generate_quantized_np(shape, bits, out_dtype): @@ -30,7 +30,6 @@ def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, w_shape = get_const_tuple(W.shape) dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d.verify_conv2d") def get_ref_data(): a_np = 
generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) From 85a931f1276e33f52af25006f43019f9daa77d29 Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 22 Jul 2018 17:26:08 -0700 Subject: [PATCH 11/11] Fix typos, rasp test case, llvm intrin lookup --- python/tvm/intrin.py | 25 +++++++++ src/codegen/llvm/llvm_module.cc | 9 +++ tests/python/unittest/test_codegen_llvm.py | 11 ++++ topi/python/topi/generic/nn.py | 24 ++------ topi/python/topi/rasp/bitserial_conv2d.py | 16 +++--- .../python/test_topi_bitserial_conv2d_rasp.py | 56 +++++++++++++++++++ 6 files changed, 113 insertions(+), 28 deletions(-) create mode 100644 topi/tests/python/test_topi_bitserial_conv2d_rasp.py diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index 422f2d682d2b..30da873b5dcf 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -154,6 +154,31 @@ def call_extern(dtype, func_name, *args): dtype, func_name, convert(args), _Call.Extern, None, 0) +def call_llvm_intrin(dtype, name, *args): + """Build expression by calling an llvm intrinsic function + + Parameters + ---------- + dtype : str + The data type of the result. + + name : str + The name of the llvm intrinsic function. + + args : list + Poistional arguments. + + Returns + ------- + call : Expr + The call expression. + """ + import tvm + llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(name) + assert llvm_id != 0, "%s is not an LLVM intrinsic" % name + return call_pure_intrin(dtype, 'llvm_intrin', tvm.const(llvm_id, 'uint32'), *args) + + def exp(x): """Take exponetial of input x. diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index 2bae52b194f5..99740b0dbdca 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -282,6 +282,15 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::shared_ptr ctx_; }; +unsigned LookupLLVMIntrinsic(const std::string& name) { + return llvm::Function::lookupIntrinsicID(name); +} + +TVM_REGISTER_API("codegen.llvm_lookup_intrinsic_id") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = static_cast(LookupLLVMIntrinsic(args[0])); + }); + TVM_REGISTER_API("codegen.build_llvm") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr n = std::make_shared(); diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py index f05fad10d273..e07f4aa8f40c 100644 --- a/tests/python/unittest/test_codegen_llvm.py +++ b/tests/python/unittest/test_codegen_llvm.py @@ -17,6 +17,16 @@ def test_llvm_intrin(): func = tvm.ir_pass.MakeAPI(body, "prefetch", [A], 0, True) fcode = tvm.build(func, None, "llvm") +def test_llvm_lookup_intrin(): + ib = tvm.ir_builder.create() + m = tvm.var("m") + A = ib.pointer("uint8x8", name="A") + x = tvm.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.const(1, 'uint32'), A) + ib.emit(x) + body = ib.get() + func = tvm.ir_pass.MakeAPI(body, "ctpop", [A], 1, True) + fcode = tvm.build(func, None, "llvm") + def test_llvm_add_pipeline(): nn = 1024 n = tvm.convert(nn) @@ -324,3 +334,4 @@ def test_alignment(): test_llvm_flip_pipeline() test_llvm_madd_pipeline() test_llvm_temp_space() + test_llvm_lookup_intrin() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 9b70d4aac78b..fe76b9715d59 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -53,22 +53,6 @@ def schedule_conv2d_nhwc(outs): """ return _default_schedule(outs, False) 
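# Illustrative aside (not part of the patch): how the new name-based lookup is meant to be
# used, in the same style as the test_llvm_lookup_intrin unit test above. Requires a TVM
# build with LLVM enabled; the intrinsic names are examples.
import tvm

assert tvm.codegen.llvm_lookup_intrinsic_id("llvm.ctpop.i8") != 0           # known intrinsic
assert tvm.codegen.llvm_lookup_intrinsic_id("llvm.not.a.real.intrinsic") == 0

# call_llvm_intrin resolves the name and asserts the id is nonzero, so a misspelled
# intrinsic fails early instead of emitting a bogus llvm_intrin call. The leading
# tvm.const(1, 'uint32') is the argument-count constant, as in the ARM microkernel.
x = tvm.call_llvm_intrin('uint8', "llvm.ctpop.i8",
                         tvm.const(1, 'uint32'), tvm.const(7, 'uint8'))
print(x)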
-@tvm.target.generic_func -def schedule_qdense(outs): - """Schedule for qdense - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of qdense - in the format of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) @tvm.target.generic_func def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, @@ -161,12 +145,12 @@ def schedule_depthwise_conv2d_nhwc(outs): @tvm.target.generic_func def schedule_bitserial_conv2d_nchw(outs): - """Schedule for qconv2d_nchw + """Schedule for bitserial_conv2d_nchw Parameters ---------- outs: Array of Tensor - The computation graph description of qconv2d_nchw + The computation graph description of bitserial_conv2d_nchw in the format of an array of tensors. Returns @@ -179,12 +163,12 @@ def schedule_bitserial_conv2d_nchw(outs): @tvm.target.generic_func def schedule_bitserial_conv2d_nhwc(outs): - """Schedule for qconv2d_nhwc + """Schedule for bitserial_conv2d_nhwc Parameters ---------- outs: Array of Tensor - The computation graph description of qconv2d_nchw + The computation graph description of bitserial_conv2d_nchw in the format of an array of tensors. Returns diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 1e5e1bee729a..7d292db8d298 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -160,8 +160,8 @@ def _intrin_popcount(m, k_i, w_b, x_b): def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] - vpadd_id = tvm.const(647, 'uint32') - vpadalu_id = tvm.const(646, 'uint32') + vpadd = "llvm.arm.neon.vpadd.v8u8" + vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') @@ -184,28 +184,28 @@ def _instr(index): lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m//2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m//2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py new file 
mode 100644 index 000000000000..5789c5496205 --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -0,0 +1,56 @@ +import os +import re +import numpy as np +import tvm +import topi +import topi.testing +from topi.util import get_const_tuple +from tvm.contrib import util + +target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + +def generate_quantized_np(shape, bits, out_dtype): + np.random.seed(0) + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +# Verify that certain special instructions from the tensorize pass exist +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + + func = tvm.build(s, [A, W, B], target) + + assembly = func.get_source('asm') + matches = re.findall("vpadal", assembly) + assert (len(matches) > 0) + matches = re.findall("vcnt", assembly) + assert (len(matches) > 0) + matches = re.findall("vpadd", assembly) + assert (len(matches) > 0) + +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 + + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + +if __name__ == "__main__": + test_bitserial_conv2d() +
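# Illustrative aside (not part of the patch series): when the assembly checks above come up
# empty, it is usually quicker to look one level higher first. tvm.lower shows whether the
# tensorized popcount microkernel actually replaced the inner reduction before instruction
# selection even runs. The target string and shapes are copied from the test above; the
# tvm.lower call itself is standard API, not something this series adds.
import re
import tvm
import topi

target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'

with tvm.target.rasp():
    A = tvm.placeholder((1, 56, 56, 64), dtype='uint32', name='A')
    W = tvm.placeholder((3, 3, 64, 64), dtype='uint32', name='W')
    B = topi.nn.bitserial_conv2d(A, W, 1, 1, 2, 1, out_dtype='int32',
                                 layout="NHWC", dorefa=False)
    s = topi.generic.schedule_bitserial_conv2d_nhwc([B])

# Lowered IR: the llvm_intrin calls emitted by _intrin_popcount should be visible here.
print(tvm.lower(s, [A, W, B], simple_mode=True))

# Same spot-check as the test: count NEON pairwise-add instructions in the assembly.
func = tvm.build(s, [A, W, B], target)
print(len(re.findall("vpadd", func.get_source('asm'))))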