From 0329772be19a07a03091aa432eb4162a522c387b Mon Sep 17 00:00:00 2001 From: Meghan Date: Tue, 5 Jun 2018 16:37:53 -0700 Subject: [PATCH 01/11] ARM Popcount lowering rule and codegen updates to support reinterpreting and accessing vectors --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 77 ++++++++++++++++++++++++++++++++ src/codegen/llvm/codegen_llvm.cc | 26 ++++++++++- 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/HalideIR b/HalideIR index a3698398faff..e20e5e9abb3a 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 +Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index b87b6ec88808..abf30756011c 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -18,8 +18,85 @@ class CodeGenARM final : public CodeGenCPU { native_vector_bits_ = 16 * 8; CodeGenCPU::InitTarget(tm); } + llvm::Value* CreateIntrinsic(const Call* op) override; + + private: + Expr ARMPopcount(const Call* op); }; +llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { + if (op->is_intrinsic("llvm_intrin")) { + llvm::Intrinsic::ID id = static_cast( + op->args[0].as()->value); + if (id == ::llvm::Intrinsic::ctpop) { + Expr e = ARMPopcount(op); + return CodeGenCPU::CreateIntrinsic(e.as()); + } + } + return CodeGenCPU::CreateIntrinsic(op); +} + +Expr CodeGenARM::ARMPopcount(const Call *call) { + using namespace ir; + const Expr& e = call->args[2]; + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; + + + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + + // Fallback to default llvm lowering rule if input type not a full vector or half vector length + int total_size = call->type.bits() * call->type.lanes(); + if (!call->type.is_vector() || call->type.bits() == 8 || + (total_size != 128 && total_size != 64)) { + Array vcnt_args; + vcnt_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt_args.push_back(e); + return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); + } + + // Interpret input as vector of 8bit values + Expr input8 = reinterpret(uint8_type, e); + // Popcount 8bit->8bit + const Call* c0 = input8.as(); + CHECK(c0 != nullptr); + Array vcnt8_args; + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), ctpop_id)); + vcnt8_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt8_args.push_back(input8); + Expr vcnt8 = ir::Call::make(uint8_type, "llvm_intrin", vcnt8_args, Call::PureIntrinsic); + + // Accumulation 8->16bit + Array vcnt16_args; + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt16_args.push_back(vcnt8); + Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); + if (call->type.bits() == 16) { + return vcnt16; + } + + // Accumulation 16->32bit + Array vcnt32_args; + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt32_args.push_back(vcnt16); + Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); + 
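+ // Each vpaddlu step pairwise-adds adjacent lanes and doubles the lane width,
+ // so the running counts stay exact; the chain stops once the lane width
+ // matches the requested popcount type (16-bit returned above, 32-bit just
+ // below, otherwise one more accumulation produces 64-bit lanes).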
if (call->type.bits() == 32) { + return vcnt32; + } + + // Accumulation 32->64bit + Array vcnt64_args; + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); + vcnt64_args.push_back(vcnt32); + return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); +} + TVM_REGISTER_GLOBAL("tvm.codegen.llvm.target_arm") .set_body([](const TVMArgs& targs, TVMRetValue* rv) { CodeGenLLVM* cg = new CodeGenARM(); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 934398d9ce09..d0c5b77cbfd5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems); + CHECK_LT(begin + extent, num_elems+1); std::vector indices; for (int i = 0; i < extent; ++i) { indices.push_back(begin + i); @@ -562,6 +562,10 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { sig_type.push_back(arg_value.back()->getType()); } } + llvm::Type *returnType = LLVMType(op->type); + if (returnType != sig_type[0]) { + sig_type.insert(sig_type.begin(), returnType); + } llvm::Function* f = llvm::Intrinsic::getDeclaration( module_.get(), id, sig_type); return builder_->CreateCall(f, arg_value); @@ -628,6 +632,26 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const Call* op) { value->addIncoming(then_value, then_value_block); value->addIncoming(else_value, else_value_block); return value; + } else if (op->is_intrinsic(Call::reinterpret)) { + llvm::Type * target = LLVMType(op->type); + return builder_->CreateBitCast(MakeValue(op->args[0]), target); + } else if (op->is_intrinsic("vectorlow")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, 0, l/2); + } else if (op->is_intrinsic("vectorhigh")) { + llvm::Value *v = MakeValue(op->args[0]); + int l = v->getType()->getVectorNumElements(); + return CreateVecSlice(v, l/2, l/2); + } else if (op->is_intrinsic("vectorcombine")) { + llvm::Value *v0 = MakeValue(op->args[0]); + llvm::Value *v1 = MakeValue(op->args[1]); + int num_elems = static_cast(v0->getType()->getVectorNumElements()) * 2; + std::vector indices; + for (int i = 0; i < num_elems; ++i) { + indices.push_back(i); + } + return builder_->CreateShuffleVector(v0, v1, indices); } else { LOG(FATAL) << "unknown intrinsic " << op->name; return nullptr; From 777f9ea69d24d40433d2867c69e3761f9704198a Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:05:59 -0700 Subject: [PATCH 02/11] Fixes and test case for arm popcount --- HalideIR | 2 +- src/codegen/llvm/codegen_arm.cc | 24 ++++++++++++-------- src/codegen/llvm/codegen_llvm.cc | 2 +- src/codegen/llvm/llvm_module.cc | 38 ++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/HalideIR b/HalideIR index e20e5e9abb3a..a3698398faff 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit e20e5e9abb3aa43147a90a4ffb3e190f62862970 +Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index abf30756011c..161d6db6e42d 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,13 +39,9 
@@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; - ::llvm::Intrinsic::ID vpaddu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; - - Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); - Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); - Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; + ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; // Fallback to default llvm lowering rule if input type not a full vector or half vector length int total_size = call->type.bits() * call->type.lanes(); @@ -58,6 +54,16 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { return ir::Call::make(call->type, "llvm_intrin", vcnt_args, Call::PureIntrinsic); } + // Popcount lowering rule: + // Reinterpret input vector as a vector of 8bit values and preform popcount + // Pairwise add between adjacent elements and double width with vpaddlu + // to return back to original input type + + // Dvisions are always divisible (number of bits = 64 or 128) + Type uint8_type = Type(e.type().code(), 8, e.type().bits() * e.type().lanes() / 8); + Type uint16_type = Type(uint8_type.code(), 16, uint8_type.bits() * uint8_type.lanes() / 16); + Type uint32_type = Type(uint16_type.code(), 32, uint8_type.bits() * uint8_type.lanes() / 32); + // Interpret input as vector of 8bit values Expr input8 = reinterpret(uint8_type, e); // Popcount 8bit->8bit @@ -71,7 +77,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 8->16bit Array vcnt16_args; - vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt16_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt16_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt16_args.push_back(vcnt8); Expr vcnt16 = ir::Call::make(uint16_type, "llvm_intrin", vcnt16_args, Call::PureIntrinsic); @@ -81,7 +87,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 16->32bit Array vcnt32_args; - vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt32_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt32_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt32_args.push_back(vcnt16); Expr vcnt32 = ir::Call::make(uint32_type, "llvm_intrin", vcnt32_args, Call::PureIntrinsic); @@ -91,7 +97,7 @@ Expr CodeGenARM::ARMPopcount(const Call *call) { // Accumulation 32->64bit Array vcnt64_args; - vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddu_id)); + vcnt64_args.push_back(ir::UIntImm::make(UInt(32), vpaddlu_id)); vcnt64_args.push_back(ir::UIntImm::make(UInt(32), 1)); vcnt64_args.push_back(vcnt32); return ir::Call::make(call->type, "llvm_intrin", vcnt64_args, Call::PureIntrinsic); diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index d0c5b77cbfd5..bbf52512d3d5 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -366,7 +366,7 @@ llvm::Value* CodeGenLLVM::CreateBroadcast(llvm::Value* value, int lanes) { llvm::Value* CodeGenLLVM::CreateVecSlice(llvm::Value* vec, int begin, int extent) { int num_elems = static_cast(vec->getType()->getVectorNumElements()); if (extent == num_elems && begin == 0) return vec; - CHECK_LT(begin + extent, num_elems+1); + CHECK_LE(begin + extent, num_elems); std::vector indices; for (int 
i = 0; i < extent; ++i) { indices.push_back(begin + i); diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index c16af511febc..2bae52b194f5 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -117,11 +117,41 @@ class LLVMModuleNode final : public runtime::ModuleNode { } std::string GetSource(const std::string& format) final { + std::string fmt = runtime::GetFileFormat("", format); std::string type_str; - llvm::raw_string_ostream rso(type_str); - CHECK(mptr_ != nullptr); - mptr_->print(rso, nullptr); - return rso.str(); + llvm::SmallString<256> str; + llvm::raw_svector_ostream rso(str); + + if (fmt == "s" || fmt == "asm") { + #if TVM_LLVM_VERSION <= 60 + std::unique_ptr m = llvm::CloneModule(mptr_); + #else + std::unique_ptr m = llvm::CloneModule(*mptr_); + #endif + llvm::legacy::PassManager pass; + CHECK(tm_); + #if TVM_LLVM_VERSION <= 60 + CHECK(tm_->addPassesToEmitFile( + pass, rso, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #else + CHECK(tm_->addPassesToEmitFile( + pass, rso, nullptr, llvm::TargetMachine::CGFT_AssemblyFile) == 0) + << "Cannot emit target CGFT_AssemblyFile"; + #endif + pass.run(*m); + return rso.str().str(); + } else if (fmt == "" || fmt == "ll") { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + CHECK(mptr_ != nullptr); + mptr_->print(rso, nullptr); + return rso.str(); + } else { + LOG(FATAL) << "Do not know how to get source code with format: " + << format << "\'"; + } + return ""; } void Init(const Array& funcs, std::string target) { From 2e56f092e09cc366dc64dcc7480ab8f75869ae56 Mon Sep 17 00:00:00 2001 From: Meghan Date: Wed, 6 Jun 2018 16:07:15 -0700 Subject: [PATCH 03/11] white space fixes --- src/codegen/llvm/codegen_arm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc index 161d6db6e42d..18a0eb54e182 100644 --- a/src/codegen/llvm/codegen_arm.cc +++ b/src/codegen/llvm/codegen_arm.cc @@ -39,7 +39,6 @@ llvm::Value* CodeGenARM::CreateIntrinsic(const Call* op) { Expr CodeGenARM::ARMPopcount(const Call *call) { using namespace ir; const Expr& e = call->args[2]; - ::llvm::Intrinsic::ID ctpop_id = ::llvm::Intrinsic::ctpop; ::llvm::Intrinsic::ID vpaddlu_id = ::llvm::Intrinsic::arm_neon_vpaddlu; From b329f462b7994f6b1c9bded239e2572aaea3a6f3 Mon Sep 17 00:00:00 2001 From: Meghan Date: Thu, 7 Jun 2018 09:56:52 -0700 Subject: [PATCH 04/11] Initial qconv2d operators --- topi/python/topi/generic/nn.py | 35 ++++++++++++++++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/util.py | 66 +++++++++++++++++++++++++++++++ topi/python/topi/rasp/__init__.py | 1 + topi/python/topi/x86/__init__.py | 1 + 5 files changed, 104 insertions(+) diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 5a16d12206a3..bb81c37ad285 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -132,6 +132,41 @@ def schedule_depthwise_conv2d_nhwc(outs): """ return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_qconv2d_nchw(outs): + """Schedule for qconv2d_nchw + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qconv2d_nchw + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs, False) + + +@tvm.target.generic_func +def schedule_qconv2d_nhwc(outs): + """Schedule for qconv2d_nhwc + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qconv2d_nchw + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + @tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 056d1a76339a..e968bd68c927 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,3 +17,4 @@ from .upsampling import * from .local_response_norm import * from .l2_norm import * +from .qconv2d import * \ No newline at end of file diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index 6264ced76953..90497a77c6f9 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -4,6 +4,9 @@ import tvm from ..util import get_const_int +import numpy as np +from topi.transform import concatenate + def infer_pad(data, data_pad): """Infer the padding from stages in reverse. @@ -102,3 +105,66 @@ def get_pad_tuple(padding, kernel): pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left + + +# Packs quantized data into packed bitplanes +# pack_axis = Axis to compress of original tensor +# bit_axis = Axis to place bitplanes in the resulting tensor +# pack_type = Datatype to pack elements into +def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): + ishape = data.shape + n = len(ishape) + if pack_type == 'uint8': + data_width = 8 + elif pack_type == 'uint16': + data_width = 16 + elif pack_type == 'uint32': + data_width = 32 + elif pack_type == 'uint64': + data_width = 64 + + # Data must be in multiples of the data_width + assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" + + shape_vec = list(ishape) + shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) + shape_vec.insert(bit_axis, 1) + bitserial_oshape = tuple(shape_vec) + masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) + + # pack axis shifts if bit axis comes before + if bit_axis <= pack_axis: + pack_axis += 1 + + def _bitpack(*indices): + packed_data = [tvm.const(0, pack_type)] * bits + for k in range(data_width): + # Translate indices for packed data back to original + idx = [0] * n + j = 0 + for i in range(n+1): + if i == bit_axis: + continue + elif i == pack_axis: + idx[j] = indices[i] * data_width + k + else: + idx[j] = indices[i] + j += 1 + + element = data(*idx) + for b in range(bits): + extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) + packed_data[b] = (packed_data[b] | extracted_bit) + if k < data_width - 1 : + packed_data[b] = packed_data[b] << 1 + + if k == data_width - 1: + return tuple(packed_data) + + output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + + if bits > 1: + return concatenate(output_tuple, axis=bit_axis) + else: + return output_tuple + diff --git a/topi/python/topi/rasp/__init__.py b/topi/python/topi/rasp/__init__.py index 31ecea5aba4e..8000e752c9ec 100644 --- a/topi/python/topi/rasp/__init__.py +++ b/topi/python/topi/rasp/__init__.py @@ -4,3 +4,4 @@ from .conv2d import schedule_conv2d_nchw from .depthwise_conv2d import schedule_depthwise_conv2d_nchw +from .qconv2d import schedule_qconv2d 
diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index d001b5fdca57..3ee6e6ee34a6 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -8,3 +8,4 @@ from .nn import * from .injective import * from .pooling import schedule_pool, schedule_global_pool +from .qconv2d import schedule_qconv2d From 753c4b2834f161bb8b1869bfea824043fc3f497f Mon Sep 17 00:00:00 2001 From: Meghan Date: Thu, 7 Jun 2018 10:11:04 -0700 Subject: [PATCH 05/11] operators --- topi/python/topi/nn/qconv2d.py | 350 +++++++++++++++++ topi/python/topi/rasp/qconv2d.py | 619 +++++++++++++++++++++++++++++++ topi/python/topi/x86/qconv2d.py | 405 ++++++++++++++++++++ 3 files changed, 1374 insertions(+) create mode 100644 topi/python/topi/nn/qconv2d.py create mode 100644 topi/python/topi/rasp/qconv2d.py create mode 100644 topi/python/topi/x86/qconv2d.py diff --git a/topi/python/topi/nn/qconv2d.py b/topi/python/topi/nn/qconv2d.py new file mode 100644 index 000000000000..820a92bc9ff1 --- /dev/null +++ b/topi/python/topi/nn/qconv2d.py @@ -0,0 +1,350 @@ +# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument +"""Conv2D operators""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm +from .pad import pad +from .util import get_pad_tuple, bitpack +from ..util import simplify, get_const_int, get_const_tuple +import numpy as np + + +# workload description of qconv2d +Workload = namedtuple('Workload', + ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', + 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) + +QuantizedSpatialPackNCHW = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) + +QuantizedSpatialPackNHWC= namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) + +# RPI version - broken right now +RaspQuantizedSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) + + +_WORKLOADS = [ + # workloads of resnet18 on imagenet + # input_size, input_size, ic, oc, kh, kw, pad, pad, stride, stride + Workload('uint32', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1), + Workload('uint32', 'int32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1), + Workload('uint32', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), + Workload('uint32', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), + Workload('uint32', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), +] + +@tvm.target.generic_func +def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype='uint32', out_dtype='int32', dorefa=True): + """Conv2D operator. 
+ + Parameters + ---------- + input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] or + [batch, in_height, in_width, in_channel] + + filter : tvm.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of two ints + padding size, or [pad_height, pad_width] + + layout : str + layout of data + + activation_bits: int + + weight_bits: int + + out_dtype: str + return type of convolution + + pack_dtype: str + bit packing type + + dorefa: bool + method of preforming popcount + + Returns + ------- + output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + # search platform specific declaration first + # default declaration + if layout == 'NCHW': + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, + out_dtype=out_dtype, dorefa=dorefa) + elif layout == 'NHWC': + return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, + out_dtype=out_dtype, dorefa=dorefa) + else: + raise ValueError("not support this layout {} yet".format(layout)) + +def _get_workload(data, kernel, stride, padding, out_dtype, layout): + """ Get the workload structure. """ + assert layout == "NCHW" or layout == "NHWC", \ + "Only support layouts NCHW and NHWC" + if layout == "NCHW": + _, CI, IH, IW = [x.value for x in data.shape] + CO, _, KH, KW = [x.value for x in kernel.shape] + else: # NHWC + IH, IW = data.shape[1].value, data.shape[2].value + KH, KW, CI, CO = [x for x in get_const_tuple(kernel.shape)] + + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + +@tvm.target.generic_func +def _get_schedule(wkl, layout): + # pylint: disable=unreachable + """ Get the platform specific schedule. 
""" + target = tvm.target.current_target() + raise RuntimeError( + "No schedule for current target:{}".format(target)) + # This return has no use, merely to supress pylint warning + return wkl + + +def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): + assert isinstance(stride, int) or len(stride) == 2 + Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) + Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) + batch, in_channel, activation_bits, in_height, in_width = Input_q.shape + num_filter, channel, kernel_h, kernel_w, weight_bits = Filter_q.shape + + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (kernel_h, kernel_w)) + pad_before = [0, 0, 0, pad_top, pad_left] + pad_after = [0, 0, 0, pad_down, pad_right] + + PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp") + # compute the output shape + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + + rc = tvm.reduce_axis((0, in_channel), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + b1 = tvm.reduce_axis((0, activation_bits), name='b1') + b2 = tvm.reduce_axis((0, weight_bits), name='b2') + + def _conv(nn, ff, yy, xx): + b1b2 = (b1+b2).astype(out_dtype) + return tvm.sum( + (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & + Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]).astype(out_dtype) + + return tvm.compute((batch, out_channel, out_height, out_width), _conv, + name="QConv2dOutput", tag="qconv2d_nchw") + + +def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): + assert isinstance(stride, int) or len(stride) == 2 + Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) + Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) + batch, in_height, in_width, in_channel_q, _ = Input_q.shape + kernel_h, kernel_w, _, num_filter, _ = Filter_q.shape + + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) + # compute the output shape + out_channel = num_filter + out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) + out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) + pad_before = [0, pad_top, pad_left, 0, 0] + pad_after = [0, pad_down, pad_right, 0, 0] + PadInput_q = pad(Input_q, pad_before, pad_after, name="PaddedInput") + + rc = tvm.reduce_axis((0, in_channel_q), name='rc') + ry = tvm.reduce_axis((0, kernel_h), name='ry') + rx = tvm.reduce_axis((0, kernel_w), name='rx') + b1 = tvm.reduce_axis((0, activation_bits), name='b1') + b2 = tvm.reduce_axis((0, weight_bits), name='b2') + + def _conv(nn, yy, xx, ff): + return tvm.sum( + (tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + Filter_q[ry, rx, rc, ff, b2])<< b1b2).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]) + + return tvm.compute( (batch, out_height, out_width, out_channel), _conv, + name="QConv2dOutput", 
tag="qconv2d_nhwc") + + +def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): + """ Compute convolution with pack on spatial axes. """ + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + IB, _, CI, H, W = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + TH = H + 2*HPAD + TW = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + + dshape = (IB, 1, CI, H, W) + dpshape = (IB, 1, CI, TH, TW) + dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) + + kshape = (KB, CO, CI, KH, KW) + kvshape = (CO//VC, CI, KH, KW, KB, VC) + + ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) + oshape = (1, CO, OH, OW) + + DOPAD = (HPAD != 0 and WPAD != 0) + if DOPAD: + data_pad = pad(data_q, (0, 0, 0, HPAD, WPAD), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ + data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ + kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, co, h, w, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if dorefa: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc]) - + tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + else: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv_out') + + return tvm.compute(oshape, lambda n, co, h, w: + conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], + name='conv_vec', tag='spatial_qconv_nchw') + + + +def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) + kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) + _, H, W, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + + dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) + kvshape = (CO, KH, KW, CI, VC, KB) + ovshape = (1, OH, OW, CO, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ + kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if dorefa: + return tvm.sum( + (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + else: + return tvm.sum( + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], + name='output_unpack', tag='spatial_qconv_nhwc') + +_SCH_TO_DECL_FUNC_QUANT = { + QuantizedSpatialPackNCHW: spatial_pack_nchw, + QuantizedSpatialPackNHWC: spatial_pack_nhwc, +} diff --git a/topi/python/topi/rasp/qconv2d.py b/topi/python/topi/rasp/qconv2d.py new file mode 100644 index 000000000000..b0f7fcb011fe --- /dev/null +++ b/topi/python/topi/rasp/qconv2d.py @@ -0,0 +1,619 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""QConv2D schedule on raspberry pi""" +from __future__ import absolute_import as _abs +import tvm +from tvm import target as _target +from .. import tag +from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule +from ..nn.qconv2d import RaspQuantizedSpatialPack, QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC +from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT +from ..nn.qconv2d import _get_workload +from ..nn.util import infer_pad, infer_stride +from ..util import simplify, get_const_int + +from .. 
import generic + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES = [ + RaspQuantizedSpatialPack(2, 2, 8, 1, 1, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 4, 1, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 1, 16, False, 8), + RaspQuantizedSpatialPack(1, 4, 8, 4, 8, False, 8), + RaspQuantizedSpatialPack(1, 7, 8, 3, 8, False, 16), + RaspQuantizedSpatialPack(1, 2, 8, 1, 8, False, 16), + RaspQuantizedSpatialPack(2, 1, 8, 1, 4, False, 16), + RaspQuantizedSpatialPack(1, 7, 8, 1, 1, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 8, True, 16), + RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), +] + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), + QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), + QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), + QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), + QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), +] + +_QUANTIZED_SCHEDULES_NHWC = [ + # resnet + QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), + QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), + QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), + QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), + QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), +] + + +@_get_schedule.register("rasp") +def _get_schedule_qconv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@_qconv2d.register("rasp") +def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, dorefa) + +# TODO: is there a better way to share these with x86? 
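A usage sketch of how the declaration and schedules registered here are exercised (illustrative only, not part of the patch; tvm.target.rasp() and the generic-func dispatch are assumptions, dtypes follow the first entry of _WORKLOADS):

    import tvm
    import topi

    # Matches Workload('uint32', 'int32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1).
    data = tvm.placeholder((1, 64, 56, 56), dtype='uint32', name='data')
    kernel = tvm.placeholder((64, 64, 3, 3), dtype='uint32', name='kernel')
    with tvm.target.rasp():
        out = topi.nn.qconv2d(data, kernel, stride=1, padding=1,
                              activation_bits=2, weight_bits=1, layout='NCHW',
                              pack_dtype='uint32', out_dtype='int32', dorefa=True)
        s = topi.generic.schedule_qconv2d_nchw([out])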
+ +@generic.schedule_qconv2d_nchw.register(["rasp"]) +@generic.schedule_qconv2d_nhwc.register(["rasp"]) +def schedule_qconv2d(outs): + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + output = op.output(0) + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Need to go up 1 further, from the combine in bitpack + if "QuantizeInput" in kernel.op.name: + kernel = kernel.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + data = data.op.input_tensors[0] + + if 'spatial_qconv_nchw' in op.tag: + _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_qconv_nhwc' in op.tag: + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + + traverse(outs[0].op) + return s + + +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): + IB, _, CI, IH, IW = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + _, _, OH, OW = output.shape + + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, _, _, TH, TW = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + CC = s.cache_write(conv_out, "global") + + n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis + s[conv_out].vectorize(vc) + + s[CC].compute_at(s[conv_out], ow) + n, co, oh, ow, vh, vw, vc = s[CC].op.axis + ci, dh, dw, b1, b2 = s[CC].op.reduce_axis + s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) + s[CC].unroll(b1) + s[CC].unroll(b2) + s[CC].vectorize(vc) + + ##### Schedule A + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , vw = s[data_vec].op.axis + s[data_vec].vectorize(vw) + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule B + co, _, _, _, _, vc = s[kernel_vec].op.axis + s[kernel_vec].vectorize(vc) + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + 
s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule C + n, co, h, w = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, co, oh, ow, vh, vw, vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[last].split(co, bc) + oaxis = oco + paxis = ico + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + return s + _, IH, IW, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + _, OH, OW, _ = output.shape + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, TH, TW, _, _ = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis + + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) + + s[conv_out].unroll(b1) + s[conv_out].unroll(b2) + s[conv_out].vectorize(vc) + + # # Schedule output + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vh, vw, vc) + s[last].vectorize(vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + + if bc == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +####### ARM SPECIFIC ####### +def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + print (out_dtype) + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl) + VH = sch.vh + VW = sch.vw + VC = sch.vc + + data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') + kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + N, H, W, IB, CI = data_q.shape + OCO, KH, KW, KB, VC, _ = kernel_vec.shape + + CO = OCO * VC + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) + ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + ib = tvm.reduce_axis((0, IB), name='ib') + kb = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + return tvm.sum( + (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci] & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16') + << (kb + ib).astype('int16')), axis=[dh, dw, kb, ib, ci]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), + name='output_vec', tag='spatial_qconv_nhwc') + +def intrin_popcount(m, k_i, w_b, x_b): + type = 'uint8' + w = tvm.placeholder((w_b, m, k_i), dtype=type, name='w') + x = tvm.placeholder((x_b, k_i,), dtype=type, name='x') + k = tvm.reduce_axis((0, k_i), name='k') + bw = tvm.reduce_axis((0, w_b), name='bw') + bx = tvm.reduce_axis((0, x_b), name='bx') + z = tvm.compute((m,), lambda i: + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) << (bw+bx).astype('uint16'), + axis=[bw, bx, k]), name='z') + + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=k_i, + strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) + Xb = tvm.decl_buffer(x.shape, x.dtype, + name="X", + offset_factor=k_i, + strides=[tvm.var('ldw'), 1]) + + + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + vpadd_id = tvm.const(647, 'uint32') + vpadalu_id = tvm.const(646, 'uint32') + args_1 = tvm.const(1, 'uint32') + args_2 = tvm.const(2, 'uint32') + + def instr(index): + irb = tvm.ir_builder.create() + if index == 1: + irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) + else: + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = 
tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki ==8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) + return irb.get() + # body, reset, update + return instr(0), instr(1), instr(2) + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + + +# ARM specific schedule that using custom microkernel +def arm_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, last): + # no stride and padding info here + _, H, W, IB, CI = data_q.shape + KH, KW, KB, _, CO = kernel_q.shape + KB = get_const_int(KB) + IB = get_const_int(IB) + + if data_pad is None: + padding = (0,0) + _, in_h, in_w, _ , _ = data_q.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + else: + _, in_h, in_w, _, _ = data_q.shape + _, pad_h, pad_w, _, _ = data_pad.shape + hpad = (pad_h - in_h) // 2 + wpad = (pad_w - in_w) // 2 + padding = get_const_int(hpad), get_const_int(wpad) + + _, in_h, in_w, _, _ = data_pad.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _, _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis + + kfactor = sch.kfactor + if sch.split_ci: + oci, ici = s[conv_out].split(ci, kfactor) + 
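+ # Splitting ci by kfactor keeps the innermost (kb, ib, vc, ici) block the
+ # exact shape intrin_popcount(8, kfactor, KB, IB) expects, so the
+ # tensorize(kb, pc) call below can pattern-match it.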
s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) + else: + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) + + pc = intrin_popcount(8, kfactor, KB, IB) + s[conv_out].tensorize(kb, pc) + + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vc, vh, vw) + s[last].vectorize(vw) + if last != output: + s[last].compute_inline() + + s[conv_out].compute_at(s[last], ow) + if co == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s = s.normalize() + return s + + +# @generic.schedule_qconv2d_nhwc.register(["rasp"]) +def schedule_qconv2d_nhwc(outs): + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'spatial_qconv_nhwc' in op.tag: + # print "spatial" + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[0] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + kernel = kernel.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[1] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/x86/qconv2d.py b/topi/python/topi/x86/qconv2d.py new file mode 100644 index 000000000000..1375c5436734 --- /dev/null +++ b/topi/python/topi/x86/qconv2d.py @@ -0,0 +1,405 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""QConv2D schedule on x86""" +import tvm +from .. import generic, tag +from .. 
import nn +from ..nn.util import infer_pad, infer_stride +from topi.util import simplify, get_const_int +from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule +from ..nn.qconv2d import QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC +from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT +from ..nn.qconv2d import _get_workload + + +# TODO grab the number from autotuner +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), + QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), + QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), + QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), + QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), + QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), + QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), +] + +_QUANTIZED_SCHEDULES_NHWC = [ + # resnet + QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), + QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), + QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), + QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), + QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), + QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), + QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), +] + +@_get_schedule.register("cpu") +def _get_schedule_qconv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@_qconv2d.register("cpu") +def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', + pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, dorefa) + +@generic.schedule_qconv2d_nchw.register(["cpu"]) +@generic.schedule_qconv2d_nhwc.register(["cpu"]) +def schedule_qconv2d(outs): + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + output = op.output(0) + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[1] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[0] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + 
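+ # (bitpack names its per-bit compute "QuantizeInput" and, for bits > 1,
+ # combines the bit-planes with a concatenate, so the raw tensor sits one
+ # op further up the graph)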
kernel = kernel.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + if 'spatial_qconv_nchw' in op.tag: + _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_qconv_nhwc' in op.tag: + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + else: + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if 'conv2d_nchw_q' in op.tag: + _schedule_conv2d_nchw_q(s, data, data_q, data_pad, kernel, output) + elif 'conv2d_nhwc_q' in op.tag: + _schedule_conv2d_nhwc_q(s, data, data_q, data_pad, kernel, output) + + + traverse(outs[0].op) + return s + + +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): + IB, _, CI, IH, IW = data_q.shape + KB, CO, _, KH, KW = kernel_q.shape + _, _, OH, OW = output.shape + + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, _, _, TH, TW = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW") + sch = _get_schedule(wkl, "NCHW") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + CC = s.cache_write(conv_out, "global") + + n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis + s[conv_out].vectorize(vc) + + s[CC].compute_at(s[conv_out], ow) + n, co, oh, ow, vh, vw, vc = s[CC].op.axis + ci, dh, dw, b1, b2 = s[CC].op.reduce_axis + s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) + s[CC].unroll(b1) + s[CC].unroll(b2) + s[CC].vectorize(vc) + + ##### Schedule A + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , vw = s[data_vec].op.axis + s[data_vec].vectorize(vw) + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule B + co, _, _, _, _, vc = s[kernel_vec].op.axis + s[kernel_vec].vectorize(vc) + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule C + n, co, h, w = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, co, oh, ow, vh, vw, vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[last].split(co, bc) + oaxis = oco + paxis = ico + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + 
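+ # Descriptive note: these pragmas hint TVM's CPU codegen to launch the
+ # thread pool at the outer axis, split iterations among threads in a strided
+ # pattern, and synchronize when the parallel loop finishes.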
s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + # no stride and padding info here + _, IH, IW, CI, IB = data_q.shape + KH, KW, _, CO, KB = kernel_q.shape + _, OH, OW, _ = output.shape + # Infer padding and stride + if data_pad is None: + padding = (0, 0) + TH, TW = IH, IW + else: + _, TH, TW, _, _ = data_pad.shape + hpad = get_const_int((TH - IH) // 2) + wpad = get_const_int((TW - IW) // 2) + padding = (hpad, wpad) + + hstride = get_const_int((TH - KH) // (OH - 1)) + wstride = get_const_int((TW - KW) // (OW - 1)) + stride = (hstride, wstride) + + wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _ , _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis + + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) + + s[conv_out].unroll(b1) + s[conv_out].unroll(b2) + s[conv_out].vectorize(vc) + + # # Schedule output + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vh, vw, vc) + s[last].vectorize(vc) + if last != output: + s[output].compute_inline() + s[conv_out].compute_at(s[last], ow) + + if bc == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s[last].pragma(oaxis, "parallel_launch_point") + s[last].pragma(paxis, "parallel_stride_pattern") + s[last].pragma(oaxis, "parallel_barrier_when_finish") + + return s + +# Very simple schedules +def schedule_qconv2d_nchw(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + if 'qconv2d_nchw' in op.tag: + output = op.output(0) + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Schedule for padding + n_pad, c_pad, b_pad, h_pad, w_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, c_pad) + s[data_pad].parallel(pad_fused) + + # Schedule for convolution + nn, ff, yy, xx = s[output].op.axis + rc, ry, rx, b2, b1 = s[output].op.reduce_axis + + # Tiling + yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) + fused = s[output].fuse(nn, ff) + s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) + # Vectorize, 
unroll, parallel + s[output].vectorize(xi) + s[output].unroll(b1) + s[output].unroll(b2) + s[output].parallel(fused) + + traverse(outs[0].op) + return s + +def schedule_qconv2d_nhwc(outs): + """Create schedule for tensors""" + s = tvm.create_schedule([x.op for x in outs]) + + def traverse(op): + if 'qconv2d_nhwc' in op.tag: + output = op.output(0) + kernel = op.input_tensors[1] + data_q = op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + + # Schedule for padding + n_pad, h_pad, w_pad, c_pad, b_pad = data_pad.op.axis + pad_fused = s[data_pad].fuse(n_pad, h_pad) + s[data_pad].parallel(pad_fused) + + # Schedule for convolution + nn, yy, xx, ff = s[output].op.axis + ry, rx, rc, b1, b2 = s[output].op.reduce_axis + + # Tiling + xo, fo, xi, fi = s[output].tile(xx, ff, 4, 4) + fused = s[output].fuse(nn, yy) + s[output].reorder(fused, xo, fo, ry, rx, xi, rc, b1, b2, fi) + # Vectorize, unroll, parallel + s[output].vectorize(fi) + s[output].unroll(b1) + s[output].unroll(b2) + s[output].parallel(fused) + traverse(outs[0].op) + return s From 74660b1c71491513673b4018abc492b4a912557c Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 18 Jun 2018 23:28:46 -0700 Subject: [PATCH 06/11] rename qconv->bitserial_conv --- topi/python/topi/generic/nn.py | 20 +- topi/python/topi/nn/__init__.py | 3 +- .../nn/{qconv2d.py => bitserial_conv2d.py} | 132 ++-- topi/python/topi/rasp/__init__.py | 2 +- topi/python/topi/rasp/bitserial_conv2d.py | 360 ++++++++++ topi/python/topi/rasp/qconv2d.py | 619 ------------------ topi/python/topi/x86/__init__.py | 3 +- .../x86/{qconv2d.py => bitserial_conv2d.py} | 124 ++-- .../python/test_topi_bitserial_conv2d.py | 109 +++ .../python/test_topi_bitserial_conv2d_rasp.py | 132 ++++ 10 files changed, 751 insertions(+), 753 deletions(-) rename topi/python/topi/nn/{qconv2d.py => bitserial_conv2d.py} (75%) create mode 100644 topi/python/topi/rasp/bitserial_conv2d.py delete mode 100644 topi/python/topi/rasp/qconv2d.py rename topi/python/topi/x86/{qconv2d.py => bitserial_conv2d.py} (79%) create mode 100644 topi/tests/python/test_topi_bitserial_conv2d.py create mode 100644 topi/tests/python/test_topi_bitserial_conv2d_rasp.py diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index bb81c37ad285..1a85e5818462 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -53,6 +53,22 @@ def schedule_conv2d_nhwc(outs): """ return _default_schedule(outs, False) +@tvm.target.generic_func +def schedule_qdense(outs): + """Schedule for qdense + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of qdense + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs, False) @tvm.target.generic_func def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, padding, outs): @@ -133,7 +149,7 @@ def schedule_depthwise_conv2d_nhwc(outs): return _default_schedule(outs, False) @tvm.target.generic_func -def schedule_qconv2d_nchw(outs): +def schedule_bitserial_conv2d_nchw(outs): """Schedule for qconv2d_nchw Parameters @@ -151,7 +167,7 @@ def schedule_qconv2d_nchw(outs): @tvm.target.generic_func -def schedule_qconv2d_nhwc(outs): +def schedule_bitserial_conv2d_nhwc(outs): """Schedule for qconv2d_nhwc Parameters diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index e968bd68c927..2c17e0540477 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,4 +17,5 @@ from .upsampling import * from .local_response_norm import * from .l2_norm import * -from .qconv2d import * \ No newline at end of file +from .bitserial_conv2d import * +from .qdense import * \ No newline at end of file diff --git a/topi/python/topi/nn/qconv2d.py b/topi/python/topi/nn/bitserial_conv2d.py similarity index 75% rename from topi/python/topi/nn/qconv2d.py rename to topi/python/topi/nn/bitserial_conv2d.py index 820a92bc9ff1..e51577563498 100644 --- a/topi/python/topi/nn/qconv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -5,25 +5,18 @@ import tvm from .pad import pad from .util import get_pad_tuple, bitpack -from ..util import simplify, get_const_int, get_const_tuple -import numpy as np - +from ..util import simplify, get_const_tuple # workload description of qconv2d Workload = namedtuple('Workload', ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -QuantizedSpatialPackNCHW = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) - -QuantizedSpatialPackNHWC= namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) - -# RPI version - broken right now -RaspQuantizedSpatialPack = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) +SpatialPackNCHW = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) +SpatialPackNHWC = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc']) _WORKLOADS = [ # workloads of resnet18 on imagenet @@ -39,17 +32,23 @@ Workload('uint32', 'int32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2), Workload('uint32', 'int32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2), Workload('uint32', 'int32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1), + + # workload of alexnet on cifar10 + Workload('int32', 'int32', 27, 27, 96, 192, 5, 5, 2, 2, 1, 1), + Workload('int32', 'int32', 13, 13, 192, 384, 3, 3, 1, 1, 1, 1), + Workload('int32', 'int32', 13, 13, 384, 384, 3, 3, 1, 1, 1, 1), + Workload('int32', 'int32', 13, 13, 384, 256, 3, 3, 1, 1, 1, 1), ] @tvm.target.generic_func -def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype='uint32', out_dtype='int32', dorefa=True): +def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype='uint32', out_dtype='int32', dorefa=True): """Conv2D operator. 
Parameters ---------- input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or + 4-D with shape [batch, in_channel, in_height, in_width] or [batch, in_height, in_width, in_channel] filter : tvm.Tensor @@ -73,7 +72,7 @@ def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout pack_dtype: str bit packing type - + dorefa: bool method of preforming popcount @@ -85,13 +84,12 @@ def qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout # search platform specific declaration first # default declaration if layout == 'NCHW': - return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, - out_dtype=out_dtype, dorefa=dorefa) + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) elif layout == 'NHWC': - return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, - out_dtype=out_dtype, dorefa=dorefa) - else: - raise ValueError("not support this layout {} yet".format(layout)) + return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) + raise ValueError("not support this layout {} yet".format(layout)) def _get_workload(data, kernel, stride, padding, out_dtype, layout): """ Get the workload structure. """ @@ -109,7 +107,7 @@ def _get_workload(data, kernel, stride, padding, out_dtype, layout): HSTR, WSTR = stride else: HSTR, WSTR = stride, stride - + return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) @tvm.target.generic_func @@ -123,7 +121,8 @@ def _get_schedule(wkl, layout): return wkl -def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): +def bitserial_conv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, + out_dtype='int32', pack_type='uint32'): assert isinstance(stride, int) or len(stride) == 2 Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) @@ -153,16 +152,16 @@ def qconv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, o def _conv(nn, ff, yy, xx): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum( - (tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]).astype(out_dtype) - - return tvm.compute((batch, out_channel, out_height, out_width), _conv, - name="QConv2dOutput", tag="qconv2d_nchw") + return tvm.sum((tvm.popcount( + PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & + Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]).astype(out_dtype) + return tvm.compute((batch, out_channel, out_height, out_width), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nchw") -def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, out_dtype='int32', pack_type='uint32'): +def bitserial_conv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, + out_dtype='int32', pack_type='uint32'): assert isinstance(stride, int) or len(stride) == 2 Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) 
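The compute rules in this file reduce a multi-bit convolution to AND plus popcount over bit planes: bitserial_conv2d_nchw/nhwc shift each popcount by b1 + b2, and the dorefa branch of the spatial-pack declarations below subtracts popcount(a & ~w) to handle weights interpreted as {-1, +1}. A minimal NumPy sketch of that arithmetic follows; the helper names (bit_plane, ref_bitserial_dot, ref_dorefa_plane) are illustrative and not part of this patch.

    # Illustrative reference only (not part of the patch).
    import numpy as np

    def bit_plane(x, bit):
        # 0/1 array holding bit `bit` of every element
        return (x >> bit) & 1

    def ref_bitserial_dot(a, w, a_bits, w_bits):
        # sum_{i,j} popcount(a_plane_i & w_plane_j) << (i + j)
        acc = 0
        for i in range(a_bits):
            for j in range(w_bits):
                acc += int((bit_plane(a, i) & bit_plane(w, j)).sum()) << (i + j)
        return acc

    def ref_dorefa_plane(a_plane, w_plane):
        # per-plane product when the weight bit encodes {-1, +1} as {0, 1}
        return int((a_plane & w_plane).sum()) - int((a_plane & (1 - w_plane)).sum())

    rng = np.random.RandomState(0)
    a = rng.randint(0, 4, size=64).astype(np.uint32)   # 2-bit activations
    w = rng.randint(0, 2, size=64).astype(np.uint32)   # 1-bit weights
    assert ref_bitserial_dot(a, w, a_bits=2, w_bits=1) == int(np.dot(a, w))

    a0 = bit_plane(a, 0)
    w_signed = 2 * w.astype(np.int64) - 1              # bit 1 -> +1, bit 0 -> -1
    assert ref_dorefa_plane(a0, w) == int(np.dot(a0, w_signed))

The same identity is what bitpack plus the popcount reductions in the declarations here implement, one bit plane per packed word.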
@@ -189,16 +188,17 @@ def qconv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, o b2 = tvm.reduce_axis((0, weight_bits), name='b2') def _conv(nn, yy, xx, ff): - return tvm.sum( - (tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - Filter_q[ry, rx, rc, ff, b2])<< b1b2).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]) - - return tvm.compute( (batch, out_height, out_width, out_channel), _conv, - name="QConv2dOutput", tag="qconv2d_nhwc") + b1b2 = (b1+b2).astype(out_dtype) + return tvm.sum((tvm.popcount( + PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + Filter_q[ry, rx, rc, ff, b2]) << b1b2).astype(out_dtype), + axis=[rc, ry, rx, b2, b1]) + return tvm.compute((batch, out_height, out_width, out_channel), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nhwc") -def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): +def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. """ assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) @@ -251,31 +251,31 @@ def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_ dw = tvm.reduce_axis((0, KW), name='dw') b1 = tvm.reduce_axis((0, IB), name='ib') b2 = tvm.reduce_axis((0, KB), name='kb') - + def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: - return tvm.sum( - (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc]) - - tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) - else: - return tvm.sum( - (tvm.popcount(data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc]) - + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, axis=[ci, dh, dw, b1, b2]) + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + conv = tvm.compute(ovshape, _conv, name='conv_out') return tvm.compute(oshape, lambda n, co, h, w: - conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], - name='conv_vec', tag='spatial_qconv_nchw') - + conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], + name='conv_vec', tag='spatial_bitserial_conv_nchw') - -def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): +def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. 
""" assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) @@ -326,25 +326,25 @@ def spatial_pack_nhwc(data, kernel, stride, padding, in_bits, weight_bits, pack_ def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: - return tvm.sum( + return tvm.sum( (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]) - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2]) - else: - return tvm.sum( - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, - axis=[dh, dw, ci, b1, b2]) + + return tvm.sum(tvm.popcount( + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv') return tvm.compute(oshape, lambda n, h, w, co: - conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], - name='output_unpack', tag='spatial_qconv_nhwc') + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], + name='output_unpack', tag='spatial_bitserial_conv_nhwc') _SCH_TO_DECL_FUNC_QUANT = { - QuantizedSpatialPackNCHW: spatial_pack_nchw, - QuantizedSpatialPackNHWC: spatial_pack_nhwc, + SpatialPackNCHW: spatial_pack_nchw, + SpatialPackNHWC: spatial_pack_nhwc, } diff --git a/topi/python/topi/rasp/__init__.py b/topi/python/topi/rasp/__init__.py index 8000e752c9ec..270a48504468 100644 --- a/topi/python/topi/rasp/__init__.py +++ b/topi/python/topi/rasp/__init__.py @@ -4,4 +4,4 @@ from .conv2d import schedule_conv2d_nchw from .depthwise_conv2d import schedule_depthwise_conv2d_nchw -from .qconv2d import schedule_qconv2d +from .bitserial_conv2d import schedule_bitserial_conv2d_nhwc diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py new file mode 100644 index 000000000000..03aa1e1fc418 --- /dev/null +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -0,0 +1,360 @@ +# pylint: disable=invalid-name,unused-variable,invalid-name +"""Bitserial conv2d schedule on raspberry pi""" +from __future__ import absolute_import as _abs +from collections import namedtuple +import tvm +from .. import tag +from ..nn.pad import pad +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import SpatialPackNCHW, _WORKLOADS, spatial_pack_nchw +from ..nn.util import get_pad_tuple, bitpack +from ..util import get_const_int +from .. 
import generic + +RaspSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) + +_QUANTIZED_SCHEDULES_NHWC = [ + RaspSpatialPack(2, 2, 8, 1, 1, False, 8), + RaspSpatialPack(1, 4, 8, 4, 1, False, 8), + RaspSpatialPack(1, 4, 8, 1, 16, False, 8), + RaspSpatialPack(1, 4, 8, 4, 8, False, 8), + RaspSpatialPack(1, 7, 8, 3, 8, False, 16), + RaspSpatialPack(1, 2, 8, 1, 8, False, 16), + RaspSpatialPack(2, 1, 8, 1, 4, False, 16), + RaspSpatialPack(1, 7, 8, 1, 1, True, 16), + RaspSpatialPack(1, 1, 8, 1, 16, True, 16), + RaspSpatialPack(1, 1, 8, 1, 8, True, 16), + RaspSpatialPack(1, 1, 8, 1, 16, True, 16), +] + +_QUANTIZED_SCHEDULES_NCHW = [ + # resnet + SpatialPackNCHW(2, 2, 8, 1, 1), + SpatialPackNCHW(1, 4, 8, 4, 1), + SpatialPackNCHW(1, 4, 8, 1, 16), + SpatialPackNCHW(1, 4, 8, 4, 8), + SpatialPackNCHW(1, 7, 8, 3, 8), + SpatialPackNCHW(1, 2, 8, 1, 8), + SpatialPackNCHW(2, 1, 8, 1, 4), + SpatialPackNCHW(1, 7, 8, 1, 1), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 8), + SpatialPackNCHW(1, 1, 8, 1, 16), +] + +@_get_schedule.register("rasp") +def _get_schedule_bitserial_conv2d(wkl, layout): + if wkl not in _WORKLOADS: + raise ValueError("no schedule for such workload: {}".format(wkl)) + idx = _WORKLOADS.index(wkl) + if layout == "NCHW": + sch = _QUANTIZED_SCHEDULES_NCHW[idx] + elif layout == "NHWC": + sch = _QUANTIZED_SCHEDULES_NHWC[idx] + return sch + + +@bitserial_conv2d.register("rasp") +def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): + if out_dtype is None: + out_dtype = data.dtype + assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" + assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) + sch = _get_schedule(wkl, layout) + if layout == "NCHW": + return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) + # TODO: Doesn't support dorefa style yet + return _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, + weight_bits, out_dtype) + +def kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): + kernel_q = bitpack(kernel, kernel_bits, pack_axis=2, bit_axis=2, pack_type='uint8') + KH, KW, KB, CI, CO = kernel_q.shape + kvshape = (CO//VC, KH, KW, KB, VC, CI) + return tvm.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ + kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') + +def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + VH = sch.vh + VW = sch.vw + VC = sch.vc + + data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') + kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + N, H, W, IB, CI = data_q.shape + OCO, KH, KW, KB, VC, _ = kernel_vec.shape + + CO = OCO * VC + HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + PAD_H = H + 2*HPAD + PAD_W = W + 2*WPAD + OH = (H + 2*HPAD - KH) // HSTR + 1 + OW = (W + 2*WPAD - KW) // WSTR + 1 + dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) + ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (HPAD != 0 and WPAD != 0): + data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + ib = tvm.reduce_axis((0, IB), name='ib') + kb = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + return tvm.sum((tvm.popcount( + kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16')) + << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + return tvm.compute(oshape, lambda n, h, w, co: + conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), + name='output_vec', tag='spatial_bitserial_conv_nhwc') + +def intrin_popcount(m, k_i, w_b, x_b): + dtype = 'uint8' + w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') + x = tvm.placeholder((x_b, k_i,), dtype=dtype, name='x') + k = tvm.reduce_axis((0, k_i), name='k') + bw = tvm.reduce_axis((0, w_b), name='bw') + bx = tvm.reduce_axis((0, x_b), name='bx') + z = tvm.compute((m,), lambda i: + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) + << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') + + Wb = tvm.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=k_i, + strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) + Xb = tvm.decl_buffer(x.shape, x.dtype, + name="X", + offset_factor=k_i, + strides=[tvm.var('ldw'), 1]) + + def intrin_func(ins, outs): + ww, xx = ins + zz = outs[0] + vpadd_id = tvm.const(647, 'uint32') + vpadalu_id = tvm.const(646, 'uint32') + args_1 = tvm.const(1, 'uint32') + args_2 = tvm.const(2, 'uint32') + + def instr(index): + irb = tvm.ir_builder.create() + if index == 1: + irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) + else: + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + 
cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki ==8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m/2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m/4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) + return irb.get() + # body, reset, update + return instr(0), instr(1), instr(2) + with tvm.build_config(offset_factor=1, partition_const_loop=True): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + + +# ARM specific schedule that using custom microkernel +def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): + # no stride and padding info here + _, H, W, IB, CI = data_q.shape + KH, KW, KB, _, CO = kernel_q.shape + KB = get_const_int(KB) + IB = get_const_int(IB) + + if data_pad is None: + padding = (0, 0) + _, in_h, in_w, _, _ = data_q.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + else: + _, in_h, in_w, _, _ = data_q.shape + _, pad_h, pad_w, _, _ = data_pad.shape + hpad = (pad_h - in_h) // 2 + wpad = (pad_w - in_w) // 2 + padding = get_const_int(hpad), get_const_int(wpad) + + _, in_h, in_w, _, _ = data_pad.shape + kern_h, kern_w, _, _ = kernel.shape + _, out_h, out_w, _ = output.shape + hstride = (in_h - kern_h) // (out_h - 1) + wstride = (in_w - kern_w) // (out_w - 1) + stride = get_const_int(hstride), get_const_int(wstride) + + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") + sch = _get_schedule(wkl, "NHWC") + + VH = sch.vh + VW = sch.vw + VC = sch.vc + ba = sch.ba + bc = sch.bc + + ##### Schedule data packing + if data_pad is not None: + s[data_pad].compute_inline() + + _, h, _, _, _, _, _ = s[data_vec].op.axis + if ba == 1: + oaxis = h + paxis = h + else: + oh, ih = s[data_vec].split(h, ba) + oaxis = oh + paxis = ih + + s[data_vec].parallel(paxis) + s[data_vec].pragma(oaxis, "parallel_launch_point") + s[data_vec].pragma(paxis, "parallel_stride_pattern") + s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule kernel packing + co, _, _, _, _, _ = s[kernel_vec].op.axis + if bc == 1: + oaxis = co + paxis = co + else: + oco, ico = s[kernel_vec].split(co, bc) + oaxis = oco + paxis = ico + + s[kernel_vec].parallel(paxis) + s[kernel_vec].pragma(oaxis, "parallel_launch_point") + s[kernel_vec].pragma(paxis, "parallel_stride_pattern") + s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") + + + ##### Schedule Convolution + n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis + dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis + + kfactor = sch.kfactor + if sch.split_ci: + oci, ici = s[conv_out].split(ci, 
kfactor) + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) + else: + s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) + + pc = intrin_popcount(8, kfactor, KB, IB) + s[conv_out].tensorize(kb, pc) + + n, h, w, co = s[last].op.axis + co, vc = s[last].split(co, VC) + oh, ow, vh, vw = s[last].tile(h, w, VH, VW) + s[last].reorder(n, oh, ow, co, vc, vh, vw) + s[last].vectorize(vw) + if last != output: + s[last].compute_inline() + + s[conv_out].compute_at(s[last], ow) + if co == 1: + oaxis = oh + paxis = oh + else: + oho, iho = s[last].split(oh, bc) + oaxis = oho + paxis = iho + + s[last].parallel(paxis) + s = s.normalize() + return s + + +@generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) +def schedule_bitserial_conv2d_nhwc(outs): + s = tvm.create_schedule([x.op for x in outs]) + def traverse(op): + """Traverse operators from computation graph""" + # inline all one-to-one-mapping operators except the last stage (output) + if tag.is_broadcast(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + traverse(tensor.op) + + if 'spatial_bitserial_conv_nhwc' in op.tag: + # print "spatial" + output = op.output(0) + conv_out = op.input_tensors[0] + kernel_vec = conv_out.op.input_tensors[0] + kernel_q = kernel_vec.op.input_tensors[0] + kernel = kernel_q.op.input_tensors[0] + if "QuantizeInput" in kernel.op.name: + # Need to go up 1 further, from the combine in bitpack + kernel = kernel.op.input_tensors[0] + data_vec = conv_out.op.input_tensors[1] + data_q = data_vec.op.input_tensors[0] + data = data_q.op.input_tensors[0] + data_pad = None + if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + data_pad = data_q + data_q = data + data = data_q.op.input_tensors[0] + if "QuantizeInput" in data.op.name: + # Need to go up 1 further, from the combine in bitpack + data = data.op.input_tensors[0] + + _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) + + traverse(outs[0].op) + return s diff --git a/topi/python/topi/rasp/qconv2d.py b/topi/python/topi/rasp/qconv2d.py deleted file mode 100644 index b0f7fcb011fe..000000000000 --- a/topi/python/topi/rasp/qconv2d.py +++ /dev/null @@ -1,619 +0,0 @@ -# pylint: disable=invalid-name,unused-variable,invalid-name -"""QConv2D schedule on raspberry pi""" -from __future__ import absolute_import as _abs -import tvm -from tvm import target as _target -from .. import tag -from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule -from ..nn.qconv2d import RaspQuantizedSpatialPack, QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC -from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT -from ..nn.qconv2d import _get_workload -from ..nn.util import infer_pad, infer_stride -from ..util import simplify, get_const_int - -from .. 
import generic - -# TODO grab the number from autotuner -_QUANTIZED_SCHEDULES = [ - RaspQuantizedSpatialPack(2, 2, 8, 1, 1, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 4, 1, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 1, 16, False, 8), - RaspQuantizedSpatialPack(1, 4, 8, 4, 8, False, 8), - RaspQuantizedSpatialPack(1, 7, 8, 3, 8, False, 16), - RaspQuantizedSpatialPack(1, 2, 8, 1, 8, False, 16), - RaspQuantizedSpatialPack(2, 1, 8, 1, 4, False, 16), - RaspQuantizedSpatialPack(1, 7, 8, 1, 1, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 8, True, 16), - RaspQuantizedSpatialPack(1, 1, 8, 1, 16, True, 16), -] - -# TODO grab the number from autotuner -_QUANTIZED_SCHEDULES_NCHW = [ - # resnet - QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), - QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), - QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), - QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), - QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), -] - -_QUANTIZED_SCHEDULES_NHWC = [ - # resnet - QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), - QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), - QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), - QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), - QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), -] - - -@_get_schedule.register("rasp") -def _get_schedule_qconv2d(wkl, layout): - if wkl not in _WORKLOADS: - raise ValueError("no schedule for such workload: {}".format(wkl)) - idx = _WORKLOADS.index(wkl) - if layout == "NCHW": - sch = _QUANTIZED_SCHEDULES_NCHW[idx] - elif layout == "NHWC": - sch = _QUANTIZED_SCHEDULES_NHWC[idx] - return sch - - -@_qconv2d.register("rasp") -def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype=None, out_dtype=None, dorefa=False): - if out_dtype is None: - out_dtype = data.dtype - assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" - wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) - sch = _get_schedule(wkl, layout) - return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, dorefa) - -# TODO: is there a better way to share these with x86? 
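For reference, the reduction that intrin_popcount in the new topi/python/topi/rasp/bitserial_conv2d.py above tensorizes (through the NEON vpadd/vpadalu pairwise-add intrinsic ids) is equivalent to the scalar loop below. This is a plain NumPy sketch for checking results; the name ref_popcount_microkernel is illustrative and not part of this patch.

    # Illustrative reference only (not part of the patch).
    import numpy as np

    def ref_popcount_microkernel(w, x):
        # w: (w_b, m, k_i) uint8 weight bits, x: (x_b, k_i) uint8 activation bits
        # z[i] = sum_{bw, bx, k} popcount(w[bw, i, k] & x[bx, k]) << (bw + bx)
        w_b, m, k_i = w.shape
        x_b = x.shape[0]
        z = np.zeros(m, dtype=np.uint32)
        for bw in range(w_b):
            for bx in range(x_b):
                for i in range(m):
                    ands = w[bw, i] & x[bx]
                    cnt = sum(bin(int(v)).count('1') for v in ands)
                    z[i] += cnt << (bw + bx)
        return z.astype(np.uint16)   # matches the uint16x8 accumulator in the intrinsic

    rng = np.random.RandomState(0)
    w = rng.randint(0, 256, size=(2, 8, 16), dtype=np.uint8)   # w_b=2, m=8, k_i=16
    x = rng.randint(0, 256, size=(1, 16), dtype=np.uint8)      # x_b=1
    print(ref_popcount_microkernel(w, x))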
- -@generic.schedule_qconv2d_nchw.register(["rasp"]) -@generic.schedule_qconv2d_nhwc.register(["rasp"]) -def schedule_qconv2d(outs): - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - output = op.output(0) - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[1] - kernel_q = kernel_vec.op.input_tensors[0] - kernel = kernel_q.op.input_tensors[0] - data_vec = conv_out.op.input_tensors[0] - data_q = data_vec.op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Need to go up 1 further, from the combine in bitpack - if "QuantizeInput" in kernel.op.name: - kernel = kernel.op.input_tensors[0] - if "QuantizeInput" in data.op.name: - data = data.op.input_tensors[0] - - if 'spatial_qconv_nchw' in op.tag: - _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - elif 'spatial_qconv_nhwc' in op.tag: - _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - - traverse(outs[0].op) - return s - - -def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): - IB, _, CI, IH, IW = data_q.shape - KB, CO, _, KH, KW = kernel_q.shape - _, _, OH, OW = output.shape - - # Infer padding and stride - if data_pad is None: - padding = (0, 0) - TH, TW = IH, IW - else: - _, _, _, TH, TW = data_pad.shape - hpad = get_const_int((TH - IH) // 2) - wpad = get_const_int((TW - IW) // 2) - padding = (hpad, wpad) - - hstride = get_const_int((TH - KH) // (OH - 1)) - wstride = get_const_int((TW - KW) // (OW - 1)) - stride = (hstride, wstride) - - wkl = _get_workload(data, kernel, stride, padding, last.dtype, "NCHW") - sch = _get_schedule(wkl, "NCHW") - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - CC = s.cache_write(conv_out, "global") - - n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis - s[conv_out].vectorize(vc) - - s[CC].compute_at(s[conv_out], ow) - n, co, oh, ow, vh, vw, vc = s[CC].op.axis - ci, dh, dw, b1, b2 = s[CC].op.reduce_axis - s[CC].reorder(ci, dh, vh, dw, vw, b1, b2, vc) - s[CC].unroll(b1) - s[CC].unroll(b2) - s[CC].vectorize(vc) - - ##### Schedule A - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _ , vw = s[data_vec].op.axis - s[data_vec].vectorize(vw) - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule B - co, _, _, _, _, vc = s[kernel_vec].op.axis - s[kernel_vec].vectorize(vc) - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - 
s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule C - n, co, h, w = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, co, oh, ow, vh, vw, vc) - if last != output: - s[output].compute_inline() - s[conv_out].compute_at(s[last], ow) - - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[last].split(co, bc) - oaxis = oco - paxis = ico - - s[last].parallel(paxis) - s[last].pragma(oaxis, "parallel_launch_point") - s[last].pragma(paxis, "parallel_stride_pattern") - s[last].pragma(oaxis, "parallel_barrier_when_finish") - - return s - -def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, last): - return s - _, IH, IW, CI, IB = data_q.shape - KH, KW, _, CO, KB = kernel_q.shape - _, OH, OW, _ = output.shape - # Infer padding and stride - if data_pad is None: - padding = (0, 0) - TH, TW = IH, IW - else: - _, TH, TW, _, _ = data_pad.shape - hpad = get_const_int((TH - IH) // 2) - wpad = get_const_int((TW - IW) // 2) - padding = (hpad, wpad) - - hstride = get_const_int((TH - KH) // (OH - 1)) - wstride = get_const_int((TW - KW) // (OW - 1)) - stride = (hstride, wstride) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") - sch = _get_schedule(wkl, "NHWC") - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - ##### Schedule data packing - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _ , _ = s[data_vec].op.axis - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule kernel packing - co, _, _, _, _, _ = s[kernel_vec].op.axis - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule Convolution - n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis - dh, dw, ci, b1, b2 = s[conv_out].op.reduce_axis - - s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, ci, vc, b1, b2) - - s[conv_out].unroll(b1) - s[conv_out].unroll(b2) - s[conv_out].vectorize(vc) - - # # Schedule output - n, h, w, co = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, oh, ow, co, vh, vw, vc) - s[last].vectorize(vc) - if last != output: - s[output].compute_inline() - s[conv_out].compute_at(s[last], ow) - - - if bc == 1: - oaxis = oh - paxis = oh - else: - oho, iho = s[last].split(oh, bc) - oaxis = oho - paxis = iho - - s[last].parallel(paxis) - s[last].pragma(oaxis, "parallel_launch_point") - s[last].pragma(paxis, "parallel_stride_pattern") - s[last].pragma(oaxis, "parallel_barrier_when_finish") - - return s - -####### ARM SPECIFIC ####### -def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype): - """ Compute convolution with pack on spatial axes. 
""" - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - print (out_dtype) - wkl = _get_workload(data, kernel, stride, padding, out_dtype, "NHWC") - sch = _get_schedule(wkl) - VH = sch.vh - VW = sch.vw - VC = sch.vc - - data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') - kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) - N, H, W, IB, CI = data_q.shape - OCO, KH, KW, KB, VC, _ = kernel_vec.shape - - CO = OCO * VC - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - - PAD_H = H + 2*HPAD - PAD_W = W + 2*WPAD - OH = (H + 2*HPAD - KH) // HSTR + 1 - OW = (W + 2*WPAD - KW) // WSTR + 1 - dvshape = (N, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, IB, CI) - ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC) - oshape = (1, OH, OW, CO) - - if (HPAD != 0 and WPAD != 0): - data_pad = pad(data_q, (0, HPAD, WPAD, 0, 0), name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - ib = tvm.reduce_axis((0, IB), name='ib') - kb = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, h, w, co, vh, vw, vc): - return tvm.sum( - (tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci] & - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16') - << (kb + ib).astype('int16')), axis=[dh, dw, kb, ib, ci]) - - conv = tvm.compute(ovshape, _conv, name='conv') - - return tvm.compute(oshape, lambda n, h, w, co: - conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), - name='output_vec', tag='spatial_qconv_nhwc') - -def intrin_popcount(m, k_i, w_b, x_b): - type = 'uint8' - w = tvm.placeholder((w_b, m, k_i), dtype=type, name='w') - x = tvm.placeholder((x_b, k_i,), dtype=type, name='x') - k = tvm.reduce_axis((0, k_i), name='k') - bw = tvm.reduce_axis((0, w_b), name='bw') - bx = tvm.reduce_axis((0, x_b), name='bx') - z = tvm.compute((m,), lambda i: - tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) << (bw+bx).astype('uint16'), - axis=[bw, bx, k]), name='z') - - Wb = tvm.decl_buffer(w.shape, w.dtype, - name="W", - offset_factor=k_i, - strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) - Xb = tvm.decl_buffer(x.shape, x.dtype, - name="X", - offset_factor=k_i, - strides=[tvm.var('ldw'), 1]) - - - def intrin_func(ins, outs): - ww, xx = ins - zz = outs[0] - vpadd_id = tvm.const(647, 'uint32') - vpadalu_id = tvm.const(646, 'uint32') - args_1 = tvm.const(1, 'uint32') - args_2 = tvm.const(2, 'uint32') - - def instr(index): - irb = tvm.ir_builder.create() - if index == 1: - irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) - else: - cnts8 = [None] * 8 - cnts4 = [None] * 4 - cnts2 = [None] * 2 - for bw in range(w_b): - for bx in range(x_b): - if k_i == 16: - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') - cnts = tvm.popcount(ands) - upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) - lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) - cnts8[i] = upper_half + lower_half - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = 
tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - else: # ki ==8 - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') - cnts8[i] = tvm.popcount(ands) - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - irb.emit(zz.vstore(0, out)) - return irb.get() - # body, reset, update - return instr(0), instr(1), instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) - - -# ARM specific schedule that using custom microkernel -def arm_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, conv_out, output, last): - # no stride and padding info here - _, H, W, IB, CI = data_q.shape - KH, KW, KB, _, CO = kernel_q.shape - KB = get_const_int(KB) - IB = get_const_int(IB) - - if data_pad is None: - padding = (0,0) - _, in_h, in_w, _ , _ = data_q.shape - kern_h, kern_w, _, _ = kernel.shape - _, out_h, out_w, _ = output.shape - hstride = (in_h - kern_h) // (out_h - 1) - wstride = (in_w - kern_w) // (out_w - 1) - stride = get_const_int(hstride), get_const_int(wstride) - else: - _, in_h, in_w, _, _ = data_q.shape - _, pad_h, pad_w, _, _ = data_pad.shape - hpad = (pad_h - in_h) // 2 - wpad = (pad_w - in_w) // 2 - padding = get_const_int(hpad), get_const_int(wpad) - - _, in_h, in_w, _, _ = data_pad.shape - kern_h, kern_w, _, _ = kernel.shape - _, out_h, out_w, _ = output.shape - hstride = (in_h - kern_h) // (out_h - 1) - wstride = (in_w - kern_w) // (out_w - 1) - stride = get_const_int(hstride), get_const_int(wstride) - - wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NHWC") - sch = _get_schedule(wkl, "NHWC") - - VH = sch.vh - VW = sch.vw - VC = sch.vc - ba = sch.ba - bc = sch.bc - - ##### Schedule data packing - if data_pad is not None: - s[data_pad].compute_inline() - - _, h, _, _, _, _, _ = s[data_vec].op.axis - if ba == 1: - oaxis = h - paxis = h - else: - oh, ih = s[data_vec].split(h, ba) - oaxis = oh - paxis = ih - - s[data_vec].parallel(paxis) - s[data_vec].pragma(oaxis, "parallel_launch_point") - s[data_vec].pragma(paxis, "parallel_stride_pattern") - s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule kernel packing - co, _, _, _, _, _ = s[kernel_vec].op.axis - if bc == 1: - oaxis = co - paxis = co - else: - oco, ico = s[kernel_vec].split(co, bc) - oaxis = oco - paxis = ico - - s[kernel_vec].parallel(paxis) - s[kernel_vec].pragma(oaxis, "parallel_launch_point") - s[kernel_vec].pragma(paxis, "parallel_stride_pattern") - s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - - - ##### Schedule Convolution - n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis - dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis - - kfactor = sch.kfactor - if sch.split_ci: - oci, ici = s[conv_out].split(ci, kfactor) - 
s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, oci, kb, ib, vc, ici) - else: - s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) - - pc = intrin_popcount(8, kfactor, KB, IB) - s[conv_out].tensorize(kb, pc) - - n, h, w, co = s[last].op.axis - co, vc = s[last].split(co, VC) - oh, ow, vh, vw = s[last].tile(h, w, VH, VW) - s[last].reorder(n, oh, ow, co, vc, vh, vw) - s[last].vectorize(vw) - if last != output: - s[last].compute_inline() - - s[conv_out].compute_at(s[last], ow) - if co == 1: - oaxis = oh - paxis = oh - else: - oho, iho = s[last].split(oh, bc) - oaxis = oho - paxis = iho - - s[last].parallel(paxis) - s = s.normalize() - return s - - -# @generic.schedule_qconv2d_nhwc.register(["rasp"]) -def schedule_qconv2d_nhwc(outs): - s = tvm.create_schedule([x.op for x in outs]) - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse(tensor.op) - - if 'spatial_qconv_nhwc' in op.tag: - # print "spatial" - output = op.output(0) - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[0] - kernel_q = kernel_vec.op.input_tensors[0] - kernel = kernel_q.op.input_tensors[0] - if "QuantizeInput" in kernel.op.name: - # Need to go up 1 further, from the combine in bitpack - kernel = kernel.op.input_tensors[0] - data_vec = conv_out.op.input_tensors[1] - data_q = data_vec.op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - if "QuantizeInput" in data.op.name: - # Need to go up 1 further, from the combine in bitpack - data = data.op.input_tensors[0] - - _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) - - traverse(outs[0].op) - return s diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 3ee6e6ee34a6..78f18b4ebf7e 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -8,4 +8,5 @@ from .nn import * from .injective import * from .pooling import schedule_pool, schedule_global_pool -from .qconv2d import schedule_qconv2d +from .bitserial_conv2d import schedule_bitserial_conv2d +from .qdense import schedule_qdense diff --git a/topi/python/topi/x86/qconv2d.py b/topi/python/topi/x86/bitserial_conv2d.py similarity index 79% rename from topi/python/topi/x86/qconv2d.py rename to topi/python/topi/x86/bitserial_conv2d.py index 1375c5436734..522e6eb32208 100644 --- a/topi/python/topi/x86/qconv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -1,49 +1,49 @@ # pylint: disable=invalid-name,unused-variable,invalid-name -"""QConv2D schedule on x86""" +"""Bitserial conv2d schedule on x86""" import tvm +from topi.util import get_const_int from .. import generic, tag -from .. 
import nn -from ..nn.util import infer_pad, infer_stride -from topi.util import simplify, get_const_int -from ..nn.qconv2d import qconv2d as _qconv2d, _get_schedule -from ..nn.qconv2d import QuantizedSpatialPackNCHW, QuantizedSpatialPackNHWC -from ..nn.qconv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT -from ..nn.qconv2d import _get_workload +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import SpatialPackNCHW, SpatialPackNHWC +from ..nn.bitserial_conv2d import _WORKLOADS, _SCH_TO_DECL_FUNC_QUANT - -# TODO grab the number from autotuner _QUANTIZED_SCHEDULES_NCHW = [ # resnet - QuantizedSpatialPackNCHW(2, 2, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 1), - QuantizedSpatialPackNCHW(1, 4, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 4, 8, 4, 8), - QuantizedSpatialPackNCHW(1, 7, 8, 3, 8), - QuantizedSpatialPackNCHW(1, 2, 8, 1, 8), - QuantizedSpatialPackNCHW(2, 1, 8, 1, 4), - QuantizedSpatialPackNCHW(1, 7, 8, 1, 1), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 8), - QuantizedSpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(2, 2, 8, 1, 1), + SpatialPackNCHW(1, 4, 8, 4, 1), + SpatialPackNCHW(1, 4, 8, 1, 16), + SpatialPackNCHW(1, 4, 8, 4, 8), + SpatialPackNCHW(1, 7, 8, 3, 8), + SpatialPackNCHW(1, 2, 8, 1, 8), + SpatialPackNCHW(2, 1, 8, 1, 4), + SpatialPackNCHW(1, 7, 8, 1, 1), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 8), + SpatialPackNCHW(1, 1, 8, 1, 16), + + SpatialPackNCHW(3, 3, 16, 3, 16), + SpatialPackNCHW(1, 1, 16, 2, 16), + SpatialPackNCHW(1, 1, 8, 1, 16), + SpatialPackNCHW(1, 1, 8, 1, 16), ] _QUANTIZED_SCHEDULES_NHWC = [ # resnet - QuantizedSpatialPackNHWC(2, 2, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 1), - QuantizedSpatialPackNHWC(1, 4, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 4, 8, 4, 8), - QuantizedSpatialPackNHWC(1, 7, 8, 3, 8), - QuantizedSpatialPackNHWC(1, 2, 8, 1, 8), - QuantizedSpatialPackNHWC(2, 1, 8, 1, 4), - QuantizedSpatialPackNHWC(1, 7, 8, 1, 1), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 8), - QuantizedSpatialPackNHWC(1, 1, 8, 1, 16), + SpatialPackNHWC(2, 2, 8, 1, 1), + SpatialPackNHWC(1, 4, 8, 4, 1), + SpatialPackNHWC(1, 4, 8, 1, 16), + SpatialPackNHWC(1, 4, 8, 4, 8), + SpatialPackNHWC(1, 7, 8, 3, 8), + SpatialPackNHWC(1, 2, 8, 1, 8), + SpatialPackNHWC(2, 1, 8, 1, 4), + SpatialPackNHWC(1, 7, 8, 1, 1), + SpatialPackNHWC(1, 1, 8, 1, 16), + SpatialPackNHWC(1, 1, 8, 1, 8), + SpatialPackNHWC(1, 1, 8, 1, 16), ] @_get_schedule.register("cpu") -def _get_schedule_qconv2d(wkl, layout): +def _get_schedule_bitserial_conv2d(wkl, layout): if wkl not in _WORKLOADS: raise ValueError("no schedule for such workload: {}".format(wkl)) idx = _WORKLOADS.index(wkl) @@ -53,10 +53,9 @@ def _get_schedule_qconv2d(wkl, layout): sch = _QUANTIZED_SCHEDULES_NHWC[idx] return sch - -@_qconv2d.register("cpu") -def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', - pack_dtype=None, out_dtype=None, dorefa=False): +@bitserial_conv2d.register("cpu") +def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" @@ -64,12 +63,12 @@ def _declaration_qconv2d(data, kernel, stride, padding, activation_bits, weight wkl = _get_workload(data, kernel, stride, padding, out_dtype, 
layout) sch = _get_schedule(wkl, layout) - return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, dorefa) + return _SCH_TO_DECL_FUNC_QUANT[type(sch)](data, kernel, stride, padding, activation_bits, + weight_bits, pack_dtype, out_dtype, dorefa) -@generic.schedule_qconv2d_nchw.register(["cpu"]) -@generic.schedule_qconv2d_nhwc.register(["cpu"]) -def schedule_qconv2d(outs): +@generic.schedule_bitserial_conv2d_nchw.register(["cpu"]) +@generic.schedule_bitserial_conv2d_nhwc.register(["cpu"]) +def schedule_bitserial_conv2d(outs): s = tvm.create_schedule([x.op for x in outs]) def traverse(op): @@ -82,7 +81,7 @@ def traverse(op): if tensor.op.input_tensors: traverse(tensor.op) - elif 'spatial_qconv_nchw' in op.tag or 'spatial_qconv_nhwc' in op.tag : + elif 'spatial_bitserial_conv_nchw' in op.tag or 'spatial_bitserial_conv_nhwc' in op.tag: conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel_q = kernel_vec.op.input_tensors[0] @@ -102,14 +101,14 @@ def traverse(op): # Need to go up 1 further, from the combine in bitpack data = data.op.input_tensors[0] - if 'spatial_qconv_nchw' in op.tag: + if 'spatial_bitserial_conv_nchw' in op.tag: _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) - elif 'spatial_qconv_nhwc' in op.tag: + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) + elif 'spatial_bitserial_conv_nhwc' in op.tag: _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, outs[0]) + kernel, kernel_q, kernel_vec, + conv_out, output, outs[0]) else: kernel = op.input_tensors[1] data_q = op.input_tensors[0] @@ -120,16 +119,16 @@ def traverse(op): data_q = data data = data_q.op.input_tensors[0] if 'conv2d_nchw_q' in op.tag: - _schedule_conv2d_nchw_q(s, data, data_q, data_pad, kernel, output) + _schedule_conv2d_nchw(s, data, data_q, data_pad, kernel, output) elif 'conv2d_nhwc_q' in op.tag: - _schedule_conv2d_nhwc_q(s, data, data_q, data_pad, kernel, output) - + _schedule_conv2d_nhwc(s, data, data_q, data_pad, kernel, output) traverse(outs[0].op) return s - -def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, last): +def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, + kernel, kernel_q, kernel_vec, + conv_out, output, last): IB, _, CI, IH, IW = data_q.shape KB, CO, _, KH, KW = kernel_q.shape _, _, OH, OW = output.shape @@ -147,7 +146,7 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k hstride = get_const_int((TH - KH) // (OH - 1)) wstride = get_const_int((TW - KW) // (OW - 1)) stride = (hstride, wstride) - + wkl = _get_workload(data, kernel, stride, padding, output.dtype, "NCHW") sch = _get_schedule(wkl, "NCHW") VH = sch.vh @@ -155,9 +154,8 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k VC = sch.vc ba = sch.ba bc = sch.bc - - CC = s.cache_write(conv_out, "global") + CC = s.cache_write(conv_out, "global") n, co, oh, ow, vh, vw, vc = s[conv_out].op.axis s[conv_out].vectorize(vc) @@ -173,7 +171,7 @@ def _schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k if data_pad is not None: s[data_pad].compute_inline() - _, h, _, _, _, _ , vw = s[data_vec].op.axis + _, h, _, _, _, _, vw = s[data_vec].op.axis s[data_vec].vectorize(vw) if ba == 1: oaxis = h @@ -231,8 +229,8 @@ def 
_schedule_spatial_conv2d_nchw(s, data, data_q, data_pad, data_vec, kernel, k return s def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, - kernel, kernel_q, kernel_vec, - conv_out, output, last): + kernel, kernel_q, kernel_vec, + conv_out, output, last): # no stride and padding info here _, IH, IW, CI, IB = data_q.shape KH, KW, _, CO, KB = kernel_q.shape @@ -263,7 +261,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, if data_pad is not None: s[data_pad].compute_inline() - _, h, _, _, _, _ , _ = s[data_vec].op.axis + _, h, _, _, _, _, _ = s[data_vec].op.axis if ba == 1: oaxis = h paxis = h @@ -357,13 +355,13 @@ def traverse(op): # Tiling yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) fused = s[output].fuse(nn, ff) - s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) + s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) # Vectorize, unroll, parallel s[output].vectorize(xi) s[output].unroll(b1) s[output].unroll(b2) s[output].parallel(fused) - + traverse(outs[0].op) return s diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py new file mode 100644 index 000000000000..b27067d24b6a --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -0,0 +1,109 @@ +import os +import numpy as np +import tvm +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple +from tvm.contrib import rpc, util + + +def generate_quantized_np(shape, bits, out_dtype): + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.create('llvm'): + A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, + out_dtype=out_dtype, layout="NCHW", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] 
= 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nchw_python(a_np.astype(out_dtype), w_, stride, padding) + else: + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], "llvm") + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.create('llvm'): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] = 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], 'llvm') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 2, False) + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) + + +if __name__ == "__main__": + test_bitserial_conv2d(56, 64, 128, 3, 2, 1) + diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py new file mode 100644 index 000000000000..7223b17b9d8d --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -0,0 +1,132 @@ +import os +import numpy as np +import tvm +import topi +import topi.testing +from tvm.contrib.pickle_memoize import memoize +from topi.util import get_const_tuple +from tvm.contrib import rpc, util + +def generate_quantized_np(shape, bits, out_dtype): + 
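# Illustrative aside (not part of the patch): why get_ref_data above maps 1-bit weights
# from {0, 1} to {-1, +1} before calling the float reference conv. For single-bit values
# the signed dot product equals popcount(a & w) - popcount(a & ~w), which is exactly the
# subtraction the dorefa branch of the spatial-pack compute performs. Plain NumPy,
# 1-bit case only.
import numpy as np

rng = np.random.RandomState(0)
a = rng.randint(0, 2, size=64)          # 1-bit activations
w = rng.randint(0, 2, size=64)          # 1-bit weights
w_signed = np.where(w == 1, 1, -1)      # the {0,1} -> {-1,+1} mapping used in the test

signed_dot = int(np.dot(a, w_signed))
# (1 - w) plays the role of ~w on single bits
popcount_form = int(np.sum(a & w)) - int(np.sum(a & (1 - w)))
assert signed_dot == popcount_form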
np.random.seed(0) + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + host = '10.77.1.69' + port = 9090 + remote = rpc.connect(host, port) + ctx = remote.cpu(0) + + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A') + W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NCHW", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] = 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nchw_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], target) + + # upload to rpi + temp = util.tempdir() + path = temp.relpath('qconv_nhwc.o') + func.save(path) + remote.upload(path) + func = remote.load_module('qconv_nhwc.o') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + host = '10.77.1.69' + port = 9090 + remote = rpc.connect(host, port) + ctx = remote.cpu(0) + + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + a_shape = get_const_tuple(A.shape) + w_shape = get_const_tuple(W.shape) + dtype = A.dtype + + def get_ref_data(): + a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) + w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) + if dorefa: + w_ = np.copy(w_np).astype(out_dtype) + for x in np.nditer(w_, op_flags=['readwrite']): + x[...] 
= 1 if x == 1 else -1 + b_np = topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype) + else: + b_np = topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype) + return a_np, w_np, b_np + a_np, w_np, b_np = get_ref_data() + a = tvm.nd.array(a_np, ctx) + w = tvm.nd.array(w_np, ctx) + b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) + func = tvm.build(s, [A, W, B], target) + # Upload to pi + temp = util.tempdir() + path = temp.relpath('qconv_nhwc.o') + func.save(path) + remote.upload(path) + func = remote.load_module('qconv_nhwc.o') + + func(a, w, b) + np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) + + +def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) + # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) + +if __name__ == "__main__": + test_bitserial_conv2d(56, 64, 64, 3, 1, 1) + From f8467dde91000a56d29ee9982bdaea1e17a59bd8 Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 18 Jun 2018 23:30:06 -0700 Subject: [PATCH 07/11] remove unused simple compute and schedules --- topi/python/topi/nn/bitserial_conv2d.py | 77 -------------------- topi/python/topi/x86/bitserial_conv2d.py | 89 ------------------------ 2 files changed, 166 deletions(-) diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index e51577563498..d769800ba0a3 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -120,83 +120,6 @@ def _get_schedule(wkl, layout): # This return has no use, merely to supress pylint warning return wkl - -def bitserial_conv2d_nchw(Input, Filter, stride, padding, activation_bits, weight_bits, - out_dtype='int32', pack_type='uint32'): - assert isinstance(stride, int) or len(stride) == 2 - Input_q = bitpack(Input, activation_bits, pack_axis=1, bit_axis=2, pack_type=pack_type) - Filter_q = bitpack(Filter, weight_bits, pack_axis=1, bit_axis=4, pack_type=pack_type) - batch, in_channel, activation_bits, in_height, in_width = Input_q.shape - num_filter, channel, kernel_h, kernel_w, weight_bits = Filter_q.shape - - pad_top, pad_left, pad_down, pad_right = get_pad_tuple( - padding, (kernel_h, kernel_w)) - pad_before = [0, 0, 0, pad_top, pad_left] - pad_after = [0, 0, 0, pad_down, pad_right] - - PadInput_q = pad(Input_q, pad_before, pad_after, name="pad_temp") - # compute the output shape - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, 
kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') - - def _conv(nn, ff, yy, xx): - b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( - PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]).astype(out_dtype) - - return tvm.compute((batch, out_channel, out_height, out_width), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nchw") - -def bitserial_conv2d_nhwc(Input, Filter, stride, padding, activation_bits, weight_bits, - out_dtype='int32', pack_type='uint32'): - assert isinstance(stride, int) or len(stride) == 2 - Input_q = bitpack(Input, activation_bits, pack_axis=3, bit_axis=4, pack_type=pack_type) - Filter_q = bitpack(Filter, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_type) - batch, in_height, in_width, in_channel_q, _ = Input_q.shape - kernel_h, kernel_w, _, num_filter, _ = Filter_q.shape - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) - # compute the output shape - out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - pad_before = [0, pad_top, pad_left, 0, 0] - pad_after = [0, pad_down, pad_right, 0, 0] - PadInput_q = pad(Input_q, pad_before, pad_after, name="PaddedInput") - - rc = tvm.reduce_axis((0, in_channel_q), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') - - def _conv(nn, yy, xx, ff): - b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( - PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - Filter_q[ry, rx, rc, ff, b2]) << b1b2).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]) - - return tvm.compute((batch, out_height, out_width, out_channel), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nhwc") - def spatial_pack_nchw(data, kernel, stride, padding, in_bits, weight_bits, pack_dtype, out_dtype, dorefa=False): """ Compute convolution with pack on spatial axes. 
""" diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 522e6eb32208..292d96d29e9e 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -109,19 +109,6 @@ def traverse(op): _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, conv_out, output, outs[0]) - else: - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - if 'conv2d_nchw_q' in op.tag: - _schedule_conv2d_nchw(s, data, data_q, data_pad, kernel, output) - elif 'conv2d_nhwc_q' in op.tag: - _schedule_conv2d_nhwc(s, data, data_q, data_pad, kernel, output) traverse(outs[0].op) return s @@ -325,79 +312,3 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[last].pragma(oaxis, "parallel_barrier_when_finish") return s - -# Very simple schedules -def schedule_qconv2d_nchw(outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - if 'qconv2d_nchw' in op.tag: - output = op.output(0) - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Schedule for padding - n_pad, c_pad, b_pad, h_pad, w_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, c_pad) - s[data_pad].parallel(pad_fused) - - # Schedule for convolution - nn, ff, yy, xx = s[output].op.axis - rc, ry, rx, b2, b1 = s[output].op.reduce_axis - - # Tiling - yo, xo, yi, xi = s[output].tile(yy, xx, 4, 4) - fused = s[output].fuse(nn, ff) - s[output].reorder(fused, rc, yo, xo, ry, rx, yi, b1, b2, xi) - # Vectorize, unroll, parallel - s[output].vectorize(xi) - s[output].unroll(b1) - s[output].unroll(b2) - s[output].parallel(fused) - - traverse(outs[0].op) - return s - -def schedule_qconv2d_nhwc(outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - - def traverse(op): - if 'qconv2d_nhwc' in op.tag: - output = op.output(0) - kernel = op.input_tensors[1] - data_q = op.input_tensors[0] - data = data_q.op.input_tensors[0] - data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: - data_pad = data_q - data_q = data - data = data_q.op.input_tensors[0] - - # Schedule for padding - n_pad, h_pad, w_pad, c_pad, b_pad = data_pad.op.axis - pad_fused = s[data_pad].fuse(n_pad, h_pad) - s[data_pad].parallel(pad_fused) - - # Schedule for convolution - nn, yy, xx, ff = s[output].op.axis - ry, rx, rc, b1, b2 = s[output].op.reduce_axis - - # Tiling - xo, fo, xi, fi = s[output].tile(xx, ff, 4, 4) - fused = s[output].fuse(nn, yy) - s[output].reorder(fused, xo, fo, ry, rx, xi, rc, b1, b2, fi) - # Vectorize, unroll, parallel - s[output].vectorize(fi) - s[output].unroll(b1) - s[output].unroll(b2) - s[output].parallel(fused) - traverse(outs[0].op) - return s From 74517e183406f8f0bf412bbfad38e976d072b4b9 Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 24 Jun 2018 17:25:04 -0700 Subject: [PATCH 08/11] linting --- topi/python/topi/nn/__init__.py | 1 - topi/python/topi/nn/bitserial_conv2d.py | 38 ++++--- topi/python/topi/rasp/bitserial_conv2d.py | 102 ++++++++++-------- topi/python/topi/x86/__init__.py 
| 1 - topi/python/topi/x86/bitserial_conv2d.py | 6 +- .../python/test_topi_bitserial_conv2d.py | 11 +- .../python/test_topi_bitserial_conv2d_rasp.py | 11 +- 7 files changed, 95 insertions(+), 75 deletions(-) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 2c17e0540477..4caaef5a4d86 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,4 +18,3 @@ from .local_response_norm import * from .l2_norm import * from .bitserial_conv2d import * -from .qdense import * \ No newline at end of file diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index d769800ba0a3..c8d5313770f6 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -1,13 +1,13 @@ -# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument -"""Conv2D operators""" +# pylint: disable=invalid-name, unused-variable, too-many-locals, too-many-arguments, unused-argument +"""Bitserial Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple import tvm from .pad import pad from .util import get_pad_tuple, bitpack -from ..util import simplify, get_const_tuple +from ..util import get_const_tuple -# workload description of qconv2d +# workload description of conv2d Workload = namedtuple('Workload', ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) @@ -16,7 +16,7 @@ ['vh', 'vw', 'vc', 'ba', 'bc']) SpatialPackNHWC = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc']) + ['vh', 'vw', 'vc', 'ba', 'bc']) _WORKLOADS = [ # workloads of resnet18 on imagenet @@ -43,7 +43,7 @@ @tvm.target.generic_func def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, layout='NCHW', pack_dtype='uint32', out_dtype='int32', dorefa=True): - """Conv2D operator. + """Bitserial Conv2D operator. 
Parameters ---------- @@ -52,7 +52,8 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits [batch, in_height, in_width, in_channel] filter : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] + 4-D with shape [num_filter, in_channel, filter_height, filter_width] or + [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -64,8 +65,10 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits layout of data activation_bits: int + number of bits used for activations/input elements weight_bits: int + number of bits used for weight elements out_dtype: str return type of convolution @@ -74,12 +77,13 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits bit packing type dorefa: bool - method of preforming popcount + preform the bitserial dot-product using 2 popcounts (required for DoReFa-Net) Returns ------- output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] + 4-D with shape [batch, out_channel, out_height, out_width] or + [batch, out_height, out_width, out_channel] """ # search platform specific declaration first # default declaration @@ -181,15 +185,15 @@ def _conv(n, co, h, w, vh, vw, vc): return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc]) - - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) return tvm.sum((tvm.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) + axis=[ci, dh, dw, b1, b2]) conv = tvm.compute(ovshape, _conv, name='conv_out') @@ -251,9 +255,9 @@ def _conv(n, h, w, co, vh, vw, vc): if dorefa: return tvm.sum( (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + kernel_vec[co, dh, dw, ci, vc, b2]) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 03aa1e1fc418..44f7d8f5fc60 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -11,8 +11,8 @@ from ..util import get_const_int from .. 
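# Illustrative aside (not part of the patch): a minimal usage sketch of the operator
# documented above, mirroring what the new CPU tests do. The plain 'llvm' target and the
# NHWC shapes below are example values, not part of the patch.
import tvm
import topi

with tvm.target.create('llvm'):
    A = tvm.placeholder((1, 56, 56, 64), dtype='uint32', name='A')   # NHWC activations
    W = tvm.placeholder((3, 3, 64, 64), dtype='uint32', name='W')    # HWIO weights
    B = topi.nn.bitserial_conv2d(A, W, 1, 1, activation_bits=2, weight_bits=1,
                                 layout='NHWC', pack_dtype='uint32',
                                 out_dtype='int32', dorefa=False)
    s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
    func = tvm.build(s, [A, W, B], 'llvm')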
import generic -RaspSpatialPack = namedtuple('SpatialPack', - ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) +RaspSpatialPack = namedtuple('SpatialPack', + ['vh', 'vw', 'vc', 'ba', 'bc', 'split_ci', 'kfactor']) _QUANTIZED_SCHEDULES_NHWC = [ RaspSpatialPack(2, 2, 8, 1, 1, False, 8), @@ -62,16 +62,17 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + if dorefa: + assert layout == "NCHW", "Cannot support dorea with NHWC layout yet" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) sch = _get_schedule(wkl, layout) if layout == "NCHW": return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) - # TODO: Doesn't support dorefa style yet return _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, out_dtype) -def kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): +def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC): kernel_q = bitpack(kernel, kernel_bits, pack_axis=2, bit_axis=2, pack_type='uint8') KH, KW, KB, CI, CO = kernel_q.shape kvshape = (CO//VC, KH, KW, KB, VC, CI) @@ -88,7 +89,7 @@ def _spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bi VC = sch.vc data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') - kernel_vec = kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) + kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC) N, H, W, IB, CI = data_q.shape OCO, KH, KW, KB, VC, _ = kernel_vec.shape @@ -127,7 +128,7 @@ def _conv(n, h, w, co, vh, vw, vc): return tvm.sum((tvm.popcount( kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16')) - << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) + << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) conv = tvm.compute(ovshape, _conv, name='conv') @@ -135,7 +136,7 @@ def _conv(n, h, w, co, vh, vw, vc): conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC].astype(out_dtype), name='output_vec', tag='spatial_bitserial_conv_nhwc') -def intrin_popcount(m, k_i, w_b, x_b): +def _intrin_popcount(m, k_i, w_b, x_b): dtype = 'uint8' w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w') x = tvm.placeholder((x_b, k_i,), dtype=dtype, name='x') @@ -143,8 +144,9 @@ def intrin_popcount(m, k_i, w_b, x_b): bw = tvm.reduce_axis((0, w_b), name='bw') bx = tvm.reduce_axis((0, x_b), name='bx') z = tvm.compute((m,), lambda i: - tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & x[bx, k].astype('uint16')) - << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') + tvm.sum(tvm.popcount(w[bw, i, k].astype('uint16') & + x[bx, k].astype('uint16')) + << (bw+bx).astype('uint16'), axis=[bw, bx, k]), name='z') Wb = tvm.decl_buffer(w.shape, w.dtype, name="W", @@ -155,7 +157,7 @@ def intrin_popcount(m, k_i, w_b, x_b): offset_factor=k_i, strides=[tvm.var('ldw'), 1]) - def intrin_func(ins, outs): + def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] vpadd_id = tvm.const(647, 'uint32') @@ -163,47 +165,54 @@ def intrin_func(ins, outs): args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') - def instr(index): + def _instr(index): irb = tvm.ir_builder.create() if index == 1: irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8'))) - else: - cnts8 = [None] * 8 - cnts4 = 
[None] * 4 - cnts2 = [None] * 2 - for bw in range(w_b): - for bx in range(x_b): - if k_i == 16: - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') - cnts = tvm.popcount(ands) - upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) - lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) - cnts8[i] = upper_half + lower_half - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - else: # ki ==8 - for i in range(m): - ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') - cnts8[i] = tvm.popcount(ands) - for i in range(m/2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts8[i*2], cnts8[i*2+1]) - for i in range(m/4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) - irb.emit(zz.vstore(0, out)) + return irb.get() + + cnts8 = [None] * 8 + cnts4 = [None] * 4 + cnts2 = [None] * 2 + for bw in range(w_b): + for bx in range(x_b): + if k_i == 16: + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16') + cnts = tvm.popcount(ands) + upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts) + lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) + cnts8[i] = upper_half + lower_half + for i in range(m//2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m//4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + else: # ki == 8 + for i in range(m): + ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') + cnts8[i] = tvm.popcount(ands) + for i in range(m//2): + cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts8[i*2], cnts8[i*2+1]) + for i in range(m//4): + cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << (bw+bx) + out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + args_2, zz.vload(0, 'uint16x8'), shifted_cnts) + irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update - return instr(0), instr(1), instr(2) + return _instr(0), _instr(1), _instr(2) with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb, x:Xb}) + return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb}) # ARM specific schedule that using custom microkernel @@ -293,7 +302,7 @@ def 
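# Illustrative aside (not part of the patch): a plain-NumPy reference for what the
# _intrin_popcount microkernel above computes, z[i] = sum over bit planes bw, bx and
# over k of popcount(w[bw, i, k] & x[bx, k]) << (bw + bx). Useful as a ground truth
# when reworking the vpadd/vpadalu reduction tree. Shapes follow the intrinsic:
# w is (w_b, m, k_i) uint8, x is (x_b, k_i) uint8.
import numpy as np

def popcount_microkernel_ref(w, x):
    w_b, m, k_i = w.shape
    x_b, _ = x.shape
    z = np.zeros(m, dtype=np.uint16)
    for bw in range(w_b):
        for bx in range(x_b):
            ands = w[bw] & x[bx]   # (m, k_i) uint8
            cnts = np.array([sum(bin(int(v)).count('1') for v in row) for row in ands],
                            dtype=np.uint16)   # per-row popcount over k_i bytes
            z += (cnts << (bw + bx)).astype(np.uint16)
    return z

rng = np.random.RandomState(0)
w = rng.randint(0, 256, size=(2, 8, 16)).astype(np.uint8)   # 2 weight bit planes, m=8, k_i=16
x = rng.randint(0, 256, size=(2, 16)).astype(np.uint8)      # 2 activation bit planes
print(popcount_microkernel_ref(w, x))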
_schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, else: s[conv_out].reorder(n, oh, ow, co, vh, vw, dh, dw, kb, ib, vc, ci) - pc = intrin_popcount(8, kfactor, KB, IB) + pc = _intrin_popcount(8, kfactor, KB, IB) s[conv_out].tensorize(kb, pc) n, h, w, co = s[last].op.axis @@ -320,6 +329,7 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, @generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) def schedule_bitserial_conv2d_nhwc(outs): + """Raspverry pi schedule for bitserial conv2d""" s = tvm.create_schedule([x.op for x in outs]) def traverse(op): """Traverse operators from computation graph""" diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index 78f18b4ebf7e..c146419fcec9 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -9,4 +9,3 @@ from .injective import * from .pooling import schedule_pool, schedule_global_pool from .bitserial_conv2d import schedule_bitserial_conv2d -from .qdense import schedule_qdense diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 292d96d29e9e..1c01b96f9c30 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -55,7 +55,7 @@ def _get_schedule_bitserial_conv2d(wkl, layout): @bitserial_conv2d.register("cpu") def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits, - layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): + layout='NCHW', pack_dtype=None, out_dtype=None, dorefa=False): if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" @@ -69,12 +69,14 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits @generic.schedule_bitserial_conv2d_nchw.register(["cpu"]) @generic.schedule_bitserial_conv2d_nhwc.register(["cpu"]) def schedule_bitserial_conv2d(outs): + """CPU schedule for bitserial convolutions NCHW and NHWC""" s = tvm.create_schedule([x.op for x in outs]) def traverse(op): + """Traverse operators from computation graph""" output = op.output(0) # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag) or 'elemwise' in op.tag or 'uquantize' in op.tag: + if tag.is_broadcast(op.tag) or 'elemwise' in op.tag: if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index b27067d24b6a..a494f57551ce 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -90,7 +90,12 @@ def get_ref_data(): func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) -def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, True) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, True) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) @@ -103,7 +108,5 @@ def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) - if __name__ == "__main__": - test_bitserial_conv2d(56, 64, 128, 3, 2, 1) - + test_bitserial_conv2d() \ No 
newline at end of file diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index 7223b17b9d8d..c1ec95c383ef 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -114,7 +114,12 @@ def get_ref_data(): np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) -def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nchw(1, in_size, ic, oc, k, stride, pad, 2, 1, False) @@ -124,9 +129,7 @@ def test_bitserial_conv2d(in_size, ic, oc, k, stride, pad): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) - # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, True) - # verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, True) if __name__ == "__main__": - test_bitserial_conv2d(56, 64, 64, 3, 1, 1) + test_bitserial_conv2d() From 80d0f081a38c6184b04125e2b1871b8e5870d727 Mon Sep 17 00:00:00 2001 From: Meghan Date: Mon, 25 Jun 2018 00:37:03 -0700 Subject: [PATCH 09/11] more linting --- HalideIR | 2 +- dmlc-core | 2 +- topi/python/topi/nn/bitserial_conv2d.py | 68 ++++++++++++++++++- topi/python/topi/nn/util.py | 66 ------------------ topi/python/topi/rasp/bitserial_conv2d.py | 9 +-- .../python/test_topi_bitserial_conv2d_rasp.py | 11 ++- 6 files changed, 75 insertions(+), 83 deletions(-) diff --git a/HalideIR b/HalideIR index a3698398faff..0b7e25275138 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a3698398faff7fec1c0fa4e4479357651382db75 +Subproject commit 0b7e25275138768bb05edb9b9db2c86d0fb09c9a diff --git a/dmlc-core b/dmlc-core index 9b3f9753ae81..e864aa6757cd 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 9b3f9753ae81d657743c555e0cacc4e43f0bed2d +Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3 diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index c8d5313770f6..89cb03182ec6 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -2,10 +2,12 @@ """Bitserial Conv2D operators""" from __future__ import absolute_import as _abs from collections import namedtuple +import numpy as np import tvm +from topi.transform import concatenate from .pad import pad -from .util import get_pad_tuple, bitpack -from ..util import get_const_tuple +from .util import get_pad_tuple +from ..util import get_const_tuple, get_const_int # workload description of conv2d Workload = namedtuple('Workload', @@ -271,6 +273,68 @@ def _conv(n, h, w, co, vh, vw, vc): conv[n][h//VH][w//VW][co//VC][h%VH][w%VW][co%VC], name='output_unpack', tag='spatial_bitserial_conv_nhwc') +def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): + """Packs data into format necessary for bitserial computation + pack_axis : int + index of the axis to pack in data + bit_axis : int + index of axis to place bit axis in resulting packed data""" + ishape = data.shape + n = len(ishape) + if pack_type == 'uint8': + data_width = 8 + elif pack_type == 'uint16': + data_width 
= 16 + elif pack_type == 'uint32': + data_width = 32 + elif pack_type == 'uint64': + data_width = 64 + + # Data must be in multiples of the data_width + assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" + + shape_vec = list(ishape) + shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) + shape_vec.insert(bit_axis, 1) + bitserial_oshape = tuple(shape_vec) + masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) + + # pack axis shifts if bit axis comes before + if bit_axis <= pack_axis: + pack_axis += 1 + + def _bitpack(*indices): + packed_data = [tvm.const(0, pack_type)] * bits + for k in range(data_width): + # Translate indices for packed data back to original + idx = [0] * n + j = 0 + for i in range(n+1): + if i == bit_axis: + continue + elif i == pack_axis: + idx[j] = indices[i] * data_width + k + else: + idx[j] = indices[i] + j += 1 + + element = data(*idx) + for b in range(bits): + extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) + packed_data[b] = (packed_data[b] | extracted_bit) + if k < data_width - 1: + packed_data[b] = packed_data[b] << 1 + + if k == data_width - 1: + return tuple(packed_data) + return tuple(packed_data) + + output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + + if bits > 1: + return concatenate(output_tuple, axis=bit_axis) + return output_tuple + _SCH_TO_DECL_FUNC_QUANT = { SpatialPackNCHW: spatial_pack_nchw, SpatialPackNHWC: spatial_pack_nhwc, diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index 90497a77c6f9..6264ced76953 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -4,9 +4,6 @@ import tvm from ..util import get_const_int -import numpy as np -from topi.transform import concatenate - def infer_pad(data, data_pad): """Infer the padding from stages in reverse. 
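# Illustrative aside (not part of the patch): what the bitpack layout above buys us, on a
# tiny NumPy example. Each bit position of the 2-bit inputs becomes its own plane, and the
# integer dot product is recovered as shifted popcounts over ANDed planes -- the same sum
# over (b1, b2) that the spatial-pack computes perform. Real bitpack additionally packs
# each plane into uint8/16/32/64 words along pack_axis, omitted here for clarity.
import numpy as np

rng = np.random.RandomState(0)
bits = 2
a = rng.randint(0, 1 << bits, size=32)          # 2-bit activations
w = rng.randint(0, 1 << bits, size=32)          # 2-bit weights

a_planes = [(a >> b) & 1 for b in range(bits)]  # bit plane b of the activations
w_planes = [(w >> b) & 1 for b in range(bits)]  # bit plane b of the weights

bitserial = sum(int(np.sum(a_planes[b1] & w_planes[b2])) << (b1 + b2)
                for b1 in range(bits) for b2 in range(bits))
assert bitserial == int(np.dot(a, w))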
@@ -105,66 +102,3 @@ def get_pad_tuple(padding, kernel): pad_top = (pad_h + 1) // 2 pad_left = (pad_w + 1) // 2 return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left - - -# Packs quantized data into packed bitplanes -# pack_axis = Axis to compress of original tensor -# bit_axis = Axis to place bitplanes in the resulting tensor -# pack_type = Datatype to pack elements into -def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): - ishape = data.shape - n = len(ishape) - if pack_type == 'uint8': - data_width = 8 - elif pack_type == 'uint16': - data_width = 16 - elif pack_type == 'uint32': - data_width = 32 - elif pack_type == 'uint64': - data_width = 64 - - # Data must be in multiples of the data_width - assert get_const_int(ishape[pack_axis]) % data_width == 0, "Not a multiple of word size" - - shape_vec = list(ishape) - shape_vec[pack_axis] = (shape_vec[pack_axis] // data_width) - shape_vec.insert(bit_axis, 1) - bitserial_oshape = tuple(shape_vec) - masks = np.array([0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]) - - # pack axis shifts if bit axis comes before - if bit_axis <= pack_axis: - pack_axis += 1 - - def _bitpack(*indices): - packed_data = [tvm.const(0, pack_type)] * bits - for k in range(data_width): - # Translate indices for packed data back to original - idx = [0] * n - j = 0 - for i in range(n+1): - if i == bit_axis: - continue - elif i == pack_axis: - idx[j] = indices[i] * data_width + k - else: - idx[j] = indices[i] - j += 1 - - element = data(*idx) - for b in range(bits): - extracted_bit = ((element & tvm.const(masks[b])) >> b).astype(pack_type) - packed_data[b] = (packed_data[b] | extracted_bit) - if k < data_width - 1 : - packed_data[b] = packed_data[b] << 1 - - if k == data_width - 1: - return tuple(packed_data) - - output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') - - if bits > 1: - return concatenate(output_tuple, axis=bit_axis) - else: - return output_tuple - diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 44f7d8f5fc60..8c023ac46eed 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -5,9 +5,9 @@ import tvm from .. import tag from ..nn.pad import pad -from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload +from ..nn.bitserial_conv2d import bitserial_conv2d, _get_schedule, _get_workload, bitpack from ..nn.bitserial_conv2d import SpatialPackNCHW, _WORKLOADS, spatial_pack_nchw -from ..nn.util import get_pad_tuple, bitpack +from ..nn.util import get_pad_tuple from ..util import get_const_int from .. 
import generic @@ -214,7 +214,6 @@ def _instr(index): with tvm.build_config(offset_factor=1, partition_const_loop=True): return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb}) - # ARM specific schedule that using custom microkernel def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, kernel, kernel_q, kernel_vec, @@ -274,7 +273,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[data_vec].pragma(paxis, "parallel_stride_pattern") s[data_vec].pragma(oaxis, "parallel_barrier_when_finish") - ##### Schedule kernel packing co, _, _, _, _, _ = s[kernel_vec].op.axis if bc == 1: @@ -290,7 +288,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s[kernel_vec].pragma(paxis, "parallel_stride_pattern") s[kernel_vec].pragma(oaxis, "parallel_barrier_when_finish") - ##### Schedule Convolution n, oh, ow, co, vh, vw, vc = s[conv_out].op.axis dh, dw, kb, ib, ci = s[conv_out].op.reduce_axis @@ -326,7 +323,6 @@ def _schedule_spatial_conv2d_nhwc(s, data, data_q, data_pad, data_vec, s = s.normalize() return s - @generic.schedule_bitserial_conv2d_nhwc.register(["rasp"]) def schedule_bitserial_conv2d_nhwc(outs): """Raspverry pi schedule for bitserial conv2d""" @@ -342,7 +338,6 @@ def traverse(op): traverse(tensor.op) if 'spatial_bitserial_conv_nhwc' in op.tag: - # print "spatial" output = op.output(0) conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[0] diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index c1ec95c383ef..9e04391baf26 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -56,10 +56,10 @@ def get_ref_data(): # upload to rpi temp = util.tempdir() - path = temp.relpath('qconv_nhwc.o') + path = temp.relpath('conv_nhwc.o') func.save(path) remote.upload(path) - func = remote.load_module('qconv_nhwc.o') + func = remote.load_module('conv_nhwc.o') func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -105,15 +105,14 @@ def get_ref_data(): func = tvm.build(s, [A, W, B], target) # Upload to pi temp = util.tempdir() - path = temp.relpath('qconv_nhwc.o') + path = temp.relpath('conv_nhwc.o') func.save(path) remote.upload(path) - func = remote.load_module('qconv_nhwc.o') + func = remote.load_module('conv_nhwc.o') func(a, w, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - def test_bitserial_conv2d(): in_size = 56 ic, oc = 64, 64 @@ -128,7 +127,7 @@ def test_bitserial_conv2d(): verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) - verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False) if __name__ == "__main__": test_bitserial_conv2d() From bb0dc520628c7638a0c281eecd2cb9684fae600b Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 15 Jul 2018 16:40:32 -0700 Subject: [PATCH 10/11] fixing types --- HalideIR | 2 +- topi/python/topi/nn/bitserial_conv2d.py | 16 ++++++++-------- topi/python/topi/rasp/bitserial_conv2d.py | 4 ++-- topi/tests/python/test_topi_bitserial_conv2d.py | 3 +-- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/HalideIR b/HalideIR index 0b7e25275138..9204453ae8de 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit 0b7e25275138768bb05edb9b9db2c86d0fb09c9a +Subproject commit 
9204453ae8de77e7dfc32c4d80f58dd788ad75ff diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 89cb03182ec6..ca2efb0820c1 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -185,11 +185,11 @@ def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc]) - + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & + kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - ~kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) + & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, axis=[ci, dh, dw, b1, b2]) return tvm.sum((tvm.popcount( @@ -256,10 +256,10 @@ def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if dorefa: return tvm.sum( - (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - ~kernel_vec[co, dh, dw, ci, vc, b2])).astype(out_dtype) << b1b2, + (tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & + kernel_vec[co, dh, dw, ci, vc, b2].astype(out_dtype)) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1].astype(out_dtype) & + ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2, axis=[dh, dw, ci, b1, b2]) return tvm.sum(tvm.popcount( diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 8c023ac46eed..1e5e1bee729a 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -190,7 +190,7 @@ def _instr(index): cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) + shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 @@ -204,7 +204,7 @@ def _instr(index): cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << (bw+bx) + shifted_cnts = cnts << tvm.const(bw+bx, dtype) out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index 3da905f4d21b..6df18483a45f 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -5,7 +5,7 @@ import topi.testing from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from tvm.contrib import rpc, util +from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize def generate_quantized_np(shape, bits, out_dtype): @@ -30,7 +30,6 @@ def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, w_shape = get_const_tuple(W.shape) dtype = A.dtype - @memoize("topi.tests.test_topi_conv2d.verify_conv2d") def get_ref_data(): a_np = 
generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type) w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type) From 85a931f1276e33f52af25006f43019f9daa77d29 Mon Sep 17 00:00:00 2001 From: Meghan Date: Sun, 22 Jul 2018 17:26:08 -0700 Subject: [PATCH 11/11] Fix typos, rasp test case, llvm intrin lookup --- python/tvm/intrin.py | 25 +++++++++ src/codegen/llvm/llvm_module.cc | 9 +++ tests/python/unittest/test_codegen_llvm.py | 11 ++++ topi/python/topi/generic/nn.py | 24 ++------ topi/python/topi/rasp/bitserial_conv2d.py | 16 +++--- .../python/test_topi_bitserial_conv2d_rasp.py | 56 +++++++++++++++++++ 6 files changed, 113 insertions(+), 28 deletions(-) create mode 100644 topi/tests/python/test_topi_bitserial_conv2d_rasp.py diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index 422f2d682d2b..30da873b5dcf 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -154,6 +154,31 @@ def call_extern(dtype, func_name, *args): dtype, func_name, convert(args), _Call.Extern, None, 0) +def call_llvm_intrin(dtype, name, *args): + """Build expression by calling an llvm intrinsic function + + Parameters + ---------- + dtype : str + The data type of the result. + + name : str + The name of the llvm intrinsic function. + + args : list + Poistional arguments. + + Returns + ------- + call : Expr + The call expression. + """ + import tvm + llvm_id = tvm.codegen.llvm_lookup_intrinsic_id(name) + assert llvm_id != 0, "%s is not an LLVM intrinsic" % name + return call_pure_intrin(dtype, 'llvm_intrin', tvm.const(llvm_id, 'uint32'), *args) + + def exp(x): """Take exponetial of input x. diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc index 2bae52b194f5..99740b0dbdca 100644 --- a/src/codegen/llvm/llvm_module.cc +++ b/src/codegen/llvm/llvm_module.cc @@ -282,6 +282,15 @@ class LLVMModuleNode final : public runtime::ModuleNode { std::shared_ptr ctx_; }; +unsigned LookupLLVMIntrinsic(const std::string& name) { + return llvm::Function::lookupIntrinsicID(name); +} + +TVM_REGISTER_API("codegen.llvm_lookup_intrinsic_id") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = static_cast(LookupLLVMIntrinsic(args[0])); + }); + TVM_REGISTER_API("codegen.build_llvm") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr n = std::make_shared(); diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py index f05fad10d273..e07f4aa8f40c 100644 --- a/tests/python/unittest/test_codegen_llvm.py +++ b/tests/python/unittest/test_codegen_llvm.py @@ -17,6 +17,16 @@ def test_llvm_intrin(): func = tvm.ir_pass.MakeAPI(body, "prefetch", [A], 0, True) fcode = tvm.build(func, None, "llvm") +def test_llvm_lookup_intrin(): + ib = tvm.ir_builder.create() + m = tvm.var("m") + A = ib.pointer("uint8x8", name="A") + x = tvm.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.const(1, 'uint32'), A) + ib.emit(x) + body = ib.get() + func = tvm.ir_pass.MakeAPI(body, "ctpop", [A], 1, True) + fcode = tvm.build(func, None, "llvm") + def test_llvm_add_pipeline(): nn = 1024 n = tvm.convert(nn) @@ -324,3 +334,4 @@ def test_alignment(): test_llvm_flip_pipeline() test_llvm_madd_pipeline() test_llvm_temp_space() + test_llvm_lookup_intrin() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 9b70d4aac78b..fe76b9715d59 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -53,22 +53,6 @@ def schedule_conv2d_nhwc(outs): """ return _default_schedule(outs, False) 
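# Illustrative aside (not part of the patch): how the new name-based lookup is meant to be
# used, in the same style as the test_llvm_lookup_intrin unit test above. Requires a TVM
# build with LLVM enabled; the intrinsic names are examples.
import tvm

assert tvm.codegen.llvm_lookup_intrinsic_id("llvm.ctpop.i8") != 0           # known intrinsic
assert tvm.codegen.llvm_lookup_intrinsic_id("llvm.not.a.real.intrinsic") == 0

# call_llvm_intrin resolves the name and asserts the id is nonzero, so a misspelled
# intrinsic fails early instead of emitting a bogus llvm_intrin call. The leading
# tvm.const(1, 'uint32') is the argument-count constant, as in the ARM microkernel.
x = tvm.call_llvm_intrin('uint8', "llvm.ctpop.i8",
                         tvm.const(1, 'uint32'), tvm.const(7, 'uint8'))
print(x)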
-@tvm.target.generic_func -def schedule_qdense(outs): - """Schedule for qdense - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of qdense - in the format of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) @tvm.target.generic_func def schedule_conv2d_NCHWc(num_filter, kernel_size, strides, @@ -161,12 +145,12 @@ def schedule_depthwise_conv2d_nhwc(outs): @tvm.target.generic_func def schedule_bitserial_conv2d_nchw(outs): - """Schedule for qconv2d_nchw + """Schedule for bitserial_conv2d_nchw Parameters ---------- outs: Array of Tensor - The computation graph description of qconv2d_nchw + The computation graph description of bitserial_conv2d_nchw in the format of an array of tensors. Returns @@ -179,12 +163,12 @@ def schedule_bitserial_conv2d_nchw(outs): @tvm.target.generic_func def schedule_bitserial_conv2d_nhwc(outs): - """Schedule for qconv2d_nhwc + """Schedule for bitserial_conv2d_nhwc Parameters ---------- outs: Array of Tensor - The computation graph description of qconv2d_nchw + The computation graph description of bitserial_conv2d_nchw in the format of an array of tensors. Returns diff --git a/topi/python/topi/rasp/bitserial_conv2d.py b/topi/python/topi/rasp/bitserial_conv2d.py index 1e5e1bee729a..7d292db8d298 100644 --- a/topi/python/topi/rasp/bitserial_conv2d.py +++ b/topi/python/topi/rasp/bitserial_conv2d.py @@ -160,8 +160,8 @@ def _intrin_popcount(m, k_i, w_b, x_b): def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] - vpadd_id = tvm.const(647, 'uint32') - vpadalu_id = tvm.const(646, 'uint32') + vpadd = "llvm.arm.neon.vpadd.v8u8" + vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16" args_1 = tvm.const(1, 'uint32') args_2 = tvm.const(2, 'uint32') @@ -184,28 +184,28 @@ def _instr(index): lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m//2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) else: # ki == 8 for i in range(m): ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8') cnts8[i] = tvm.popcount(ands) for i in range(m//2): - cnts4[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_pure_intrin('uint8x8', 'llvm_intrin', vpadd_id, + cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd, args_1, cnts4[i*2], cnts4[i*2+1]) cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1]) shifted_cnts = cnts << tvm.const(bw+bx, dtype) - out = tvm.call_pure_intrin('uint16x8', 'llvm_intrin', vpadalu_id, + out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2, zz.vload(0, 'uint16x8'), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py new file 
mode 100644 index 000000000000..5789c5496205 --- /dev/null +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -0,0 +1,56 @@ +import os +import re +import numpy as np +import tvm +import topi +import topi.testing +from topi.util import get_const_tuple +from tvm.contrib import util + +target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + +def generate_quantized_np(shape, bits, out_dtype): + np.random.seed(0) + min_val = 0 + max_val = 1 << bits + return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) + +# Verify that certain special instructions from the tensorize pass exist +def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, + activation_bits, weight_bits, dorefa): + in_height = in_width = in_size + input_type='uint32' + out_dtype='int32' + + with tvm.target.rasp(): + A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, + layout="NHWC", dorefa=dorefa) + s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + + + func = tvm.build(s, [A, W, B], target) + + assembly = func.get_source('asm') + matches = re.findall("vpadal", assembly) + assert (len(matches) > 0) + matches = re.findall("vcnt", assembly) + assert (len(matches) > 0) + matches = re.findall("vpadd", assembly) + assert (len(matches) > 0) + +def test_bitserial_conv2d(): + in_size = 56 + ic, oc = 64, 64 + k = 3 + stride = 1 + pad = 1 + + + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False) + verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False) + +if __name__ == "__main__": + test_bitserial_conv2d() +
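# Illustrative aside (not part of the patch series): when the assembly checks above come up
# empty, it is usually quicker to look one level higher first. tvm.lower shows whether the
# tensorized popcount microkernel actually replaced the inner reduction before instruction
# selection even runs. The target string and shapes are copied from the test above; the
# tvm.lower call itself is standard API, not something this series adds.
import re
import tvm
import topi

target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'

with tvm.target.rasp():
    A = tvm.placeholder((1, 56, 56, 64), dtype='uint32', name='A')
    W = tvm.placeholder((3, 3, 64, 64), dtype='uint32', name='W')
    B = topi.nn.bitserial_conv2d(A, W, 1, 1, 2, 1, out_dtype='int32',
                                 layout="NHWC", dorefa=False)
    s = topi.generic.schedule_bitserial_conv2d_nhwc([B])

# Lowered IR: the llvm_intrin calls emitted by _intrin_popcount should be visible here.
print(tvm.lower(s, [A, W, B], simple_mode=True))

# Same spot-check as the test: count NEON pairwise-add instructions in the assembly.
func = tvm.build(s, [A, W, B], target)
print(len(re.findall("vpadd", func.get_source('asm'))))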