Commit

[Relay, Quantization, TOPI] int8 dense on CUDA & Dense op quantization (apache#2877)

* Quantize dense layers

* Add out_dtype argument to dense; Add dense_int8 on CUDA

* Add topi unittest of dense int8

* Fix relay

* Fix topi integration

* Fix quantization

* Update dense_rewrite

* Trigger CI

* Change qconfig quantize_dense to quantize_op

* Fix

* Remove quantize_op from qconfig
vinx13 authored and wweic committed May 13, 2019
1 parent c8cb6b7 commit 3c2558e
Showing 20 changed files with 326 additions and 49 deletions.
6 changes: 6 additions & 0 deletions include/tvm/relay/attrs/nn.h
@@ -336,10 +336,16 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
/*! \brief Attributes for dense operator */
struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
IndexExpr units;
DataType out_dtype;

TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") {
TVM_ATTR_FIELD(units)
.describe("Number of hidden units of the dense transformation.");

// use 0 bits to indicate none.
TVM_ATTR_FIELD(out_dtype)
.set_default(NullValue<DataType>())
.describe("Output data type, set to explicit type under mixed precision setting");
}
};

2 changes: 1 addition & 1 deletion python/tvm/autotvm/task/topi_integration.py
@@ -188,7 +188,7 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
def _topi_nn_dense(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, weight, bias = args
data, weight, bias, _ = args
C = topi.nn.dense(*args, **kwargs)
s = topi.generic.schedule_dense([C])
if bias is not None:
4 changes: 3 additions & 1 deletion python/tvm/relay/op/nn/_nn.py
@@ -51,7 +51,9 @@ def schedule_log_softmax(_, outputs, target):
@reg.register_compute("nn.dense")
def compute_dense(attrs, inputs, out_type, target):
"""Compute definition of dense"""
return [topi.nn.dense(inputs[0], inputs[1])]
out_dtype = attrs.out_dtype
out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)]

@reg.register_schedule("nn.dense")
def schedule_dense(attrs, outputs, target):
7 changes: 5 additions & 2 deletions python/tvm/relay/op/nn/nn.py
@@ -475,7 +475,7 @@ def bias_add(data, bias, axis=1):
return _make.bias_add(data, bias, axis)


def dense(data, weight, units=None):
def dense(data, weight, units=None, out_dtype=""):
"""Dense operator.
Applies a linear transformation
@@ -494,12 +494,15 @@ def dense(data, weight, units=None):
units : int, optional
Number of hidden units of the dense transformation.
out_dtype : str, optional
Specifies the output data type for mixed precision dense.
Returns
-------
result : tvm.relay.Expr
The computed result.
"""
return _make.dense(data, weight, units)
return _make.dense(data, weight, units, out_dtype)


def relu(data):
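
A minimal usage sketch of the new argument (not part of this diff; the infer_type helper reflects the relay API of that era and is an assumption): with int8 inputs, out_dtype lets the dense output be accumulated in int32 instead of inheriting the input dtype.

from tvm import relay

data = relay.var("data", shape=(4, 16), dtype="int8")
weight = relay.var("weight", shape=(32, 16), dtype="int8")
# out_dtype overrides the default of reusing the input dtype
net = relay.nn.dense(data, weight, units=32, out_dtype="int32")
func = relay.ir_pass.infer_type(relay.Function([data, weight], net))
# expected body type: Tensor[(4, 32), int32]
print(func.body.checked_type)
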
20 changes: 20 additions & 0 deletions python/tvm/relay/quantize/_annotate.py
@@ -171,6 +171,26 @@ def conv2d_rewrite(ref_call, new_args, ctx):
return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)


@register_annotate_function("nn.dense")
def dense_rewrite(ref_call, new_args, ctx):
"""Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of
dense will be quantized to weight field. Output would be in activation field."""
cnt = _conv_counter()
if cnt < current_qconfig().skip_k_conv:
return None
lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
rhs_expr, rhs_kind = _get_expr_kind(new_args[1])

if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT:
lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)

assert rhs_kind is None
rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)

expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)


@register_annotate_function("multiply")
def multiply_rewrite(ref_call, new_args, ctx):
"""Rewrite function for multiply."""
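
A rough sketch of exercising the new dense_rewrite annotation (the quantize/qconfig entry points shown are assumptions about the API of that era): the rewrite runs as part of relay quantization, so quantizing a function containing nn.dense is enough to trigger it, and it is skipped while the conv counter is still below skip_k_conv.

from tvm import relay

# func and params are assumed to come from an imported model containing nn.dense
with relay.quantize.qconfig(skip_k_conv=0):
    qfunc = relay.quantize.quantize(func, params=params)
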
10 changes: 8 additions & 2 deletions src/relay/op/nn/nn.cc
@@ -131,18 +131,24 @@ bool DenseRel(const Array<Type>& types,
oshape.Set((oshape.size() - 1), wshape[0]);
}

DataType out_dtype = param->out_dtype;
if (out_dtype.bits() == 0) {
out_dtype = data->dtype;
}
// assign output type
reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
return true;
}


// Positional relay function to create dense operator used by frontend FFI.
Expr MakeDense(Expr data,
Expr weight,
IndexExpr units) {
IndexExpr units,
DataType out_dtype) {
auto attrs = make_node<DenseAttrs>();
attrs->units = units;
attrs->out_dtype = out_dtype;
static const Op& op = Op::Get("nn.dense");
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
}
33 changes: 33 additions & 0 deletions src/relay/pass/quantize.cc
@@ -296,6 +296,39 @@ RELAY_REGISTER_OP("nn.conv2d")
.set_attr<FForwardRewrite>("FQRealizeRewrite", Conv2dRealize);


Expr DenseRealize(const Call& ref_call,
const Array<Expr>& new_args,
const NodeRef& ctx) {
const QConfig& cfg = QConfig::Current();
CHECK_EQ(new_args.size(), 2);
if (!new_args[0]->derived_from<TempExprNode>() || !new_args[1]->derived_from<TempExprNode>()) {
return Expr(nullptr);
}
const auto* lhs = new_args[0].as<QRealizeIntExprNode>();
const auto* rhs = new_args[1].as<QRealizeIntExprNode>();

Expr ldata = lhs->data;
if (lhs->dtype != cfg->dtype_input) {
ldata = Cast(ldata, cfg->dtype_input);
}
Expr rdata = Cast(rhs->data, cfg->dtype_weight);

const auto ref_attrs = ref_call->attrs.as<DenseAttrs>();
auto attrs = make_node<DenseAttrs>();
*attrs = *ref_attrs;
DataType out_dtype = cfg->dtype_activation;
attrs->out_dtype = out_dtype;

Expr ret = CallNode::make(ref_call->op,
{ldata, rdata}, Attrs(attrs), ref_call->type_args);
Expr dom_scale = FoldConstant(Multiply(lhs->dom_scale, rhs->dom_scale));
return QRealizeIntExprNode::make(ret, dom_scale, out_dtype);
}

RELAY_REGISTER_OP("nn.dense")
.set_attr<FForwardRewrite>("FQRealizeRewrite", DenseRealize);


Expr MulRealize(const Call& ref_call,
const Array<Expr>& new_args,
const NodeRef& ctx) {
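
A brief sanity check on the scale handling in DenseRealize above (our reading, not stated in the diff): if the realized operands satisfy x ~= sx * qx and w ~= sw * qw, then dense(x, w)[i, j] = sum_k x[i, k] * w[j, k] ~= sx * sw * sum_k qx[i, k] * qw[j, k], so the integer result computed with out_dtype set to dtype_activation carries the scale sx * sw, which is exactly the dom_scale produced by FoldConstant(Multiply(lhs->dom_scale, rhs->dom_scale)).
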
7 changes: 5 additions & 2 deletions topi/include/topi/cuda/dense.h
@@ -44,13 +44,15 @@ namespace cuda {
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense_cuda(const Target& target,
const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -62,6 +64,7 @@ inline tvm::Tensor dense_cuda(const Target& target,
auto out_dim = weight->shape[0];

if (target->libs().count("cublas")) {
CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
auto mm = topi::contrib::cublas_matmul(data, weight, false, true);
if (bias.defined()) {
mm = tvm::compute({ batch, out_dim },
@@ -72,7 +75,7 @@

return mm;
} else {
return topi::nn::dense(data, weight, bias);
return topi::nn::dense(data, weight, bias, out_dtype);
}
}

9 changes: 6 additions & 3 deletions topi/include/topi/nn/dense.h
@@ -40,12 +40,14 @@ using namespace tvm;
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense(const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -60,14 +62,15 @@ inline tvm::Tensor dense(const tvm::Tensor& data,
auto matmul = tvm::compute(
{ batch, out_dim },
[&](Var i, Var j) {
return tvm::sum(data(i, k) * weight(j, k), { k });
return tvm::sum(tvm::cast(out_dtype, data(i, k)) *
tvm::cast(out_dtype, weight(j, k)), { k });
}, "tensor", "dense");

if (bias.defined()) {
matmul = tvm::compute(
{ batch, out_dim },
[&](Var i, Var j) {
return matmul(i, j) + bias(j);
return matmul(i, j) + tvm::cast(out_dtype, bias(j));
}, "tensor", kBroadcast);
}

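
A rough standalone sketch of the int8 path this compute enables (assumes the TVM/TOPI Python APIs of that era and an available CUDA target; schedule dispatch details depend on the registered templates):

import tvm
import topi

data = tvm.placeholder((16, 64), name="data", dtype="int8")
weight = tvm.placeholder((32, 64), name="weight", dtype="int8")
with tvm.target.cuda():
    # each int8 product is cast to the out_dtype before the reduction,
    # mirroring the compute definition above
    out = topi.nn.dense(data, weight, out_dtype="int32")
    s = topi.generic.schedule_dense([out])
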
7 changes: 5 additions & 2 deletions topi/include/topi/rocm/dense.h
@@ -45,13 +45,15 @@ namespace rocm {
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense_rocm(const Target& target,
const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -63,6 +65,7 @@ inline tvm::Tensor dense_rocm(const Target& target,
auto out_dim = weight->shape[0];

if (target->libs().count("rocblas")) {
CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
auto mm = topi::contrib::rocblas_matmul(data, weight, false, true);
if (bias.defined()) {
mm = tvm::compute({ batch, out_dim },
@@ -73,7 +76,7 @@

return mm;
} else {
return topi::nn::dense(data, weight, bias);
return topi::nn::dense(data, weight, bias, out_dtype);
}
}

4 changes: 2 additions & 2 deletions topi/python/topi/cuda/__init__.py
@@ -2,15 +2,15 @@
"""CUDA specific declaration and schedules."""
from __future__ import absolute_import as _abs

from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, group_conv2d_nchw
from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, \
group_conv2d_nchw, dense
from .conv2d_hwcn import schedule_conv2d_hwcn
from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
from .group_conv2d_nchw import schedule_conv2d_nchw_cuda
from .reduction import schedule_reduce
from .softmax import schedule_softmax
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import dense_cuda, schedule_dense
from .pooling import schedule_pool, schedule_global_pool
from .extern import schedule_extern
from .nn import schedule_lrn, schedule_l2_normalize