
[Relay, Quantization, TOPI] int8 dense on CUDA & Dense op quantization #2877

Merged (12 commits, Apr 26, 2019)
6 changes: 6 additions & 0 deletions include/tvm/relay/attrs/nn.h
@@ -336,10 +336,16 @@ struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
/*! \brief Attributes for dense operator */
struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
IndexExpr units;
DataType out_dtype;

TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") {
TVM_ATTR_FIELD(units)
.describe("Number of hidden units of the dense transformation.");

// use 0 bits to indicate none.
TVM_ATTR_FIELD(out_dtype)
.set_default(NullValue<DataType>())
.describe("Output data type, set to explicit type under mixed precision setting");
}
};

2 changes: 1 addition & 1 deletion python/tvm/autotvm/task/topi_integration.py
@@ -188,7 +188,7 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
def _topi_nn_dense(*args, **kwargs):
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, weight, bias = args
data, weight, bias, _ = args
C = topi.nn.dense(*args, **kwargs)
s = topi.generic.schedule_dense([C])
if bias is not None:
4 changes: 3 additions & 1 deletion python/tvm/relay/op/nn/_nn.py
@@ -51,7 +51,9 @@ def schedule_log_softmax(_, outputs, target):
@reg.register_compute("nn.dense")
def compute_dense(attrs, inputs, out_type, target):
"""Compute definition of dense"""
return [topi.nn.dense(inputs[0], inputs[1])]
out_dtype = attrs.out_dtype
out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
return [topi.nn.dense(inputs[0], inputs[1], out_dtype=out_dtype)]

@reg.register_schedule("nn.dense")
def schedule_dense(attrs, outputs, target):
7 changes: 5 additions & 2 deletions python/tvm/relay/op/nn/nn.py
@@ -475,7 +475,7 @@ def bias_add(data, bias, axis=1):
return _make.bias_add(data, bias, axis)


def dense(data, weight, units=None):
def dense(data, weight, units=None, out_dtype=""):
"""Dense operator.
Applies a linear transformation

@@ -494,12 +494,15 @@ def dense(data, weight, units=None, out_dtype=""):
units : int, optional
Number of hidden units of the dense transformation.

out_dtype : str, optional
Specifies the output data type for mixed precision dense.

Returns
-------
result : tvm.relay.Expr
The computed result.
"""
return _make.dense(data, weight, units)
return _make.dense(data, weight, units, out_dtype)


def relu(data):
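Note: a minimal usage sketch of the extended API (shapes and values here are illustrative, not from the PR). Leaving out_dtype at its default keeps the output in the input dtype, which is the fallback both compute_dense above and DenseRel in nn.cc implement; passing an explicit out_dtype yields a mixed-precision dense, e.g. int8 operands accumulated into int32:

from tvm import relay

# Illustrative shapes; int8 x int8 -> int32 accumulation.
data = relay.var("data", shape=(1, 16), dtype="int8")
weight = relay.var("weight", shape=(8, 16), dtype="int8")
out = relay.nn.dense(data, weight, units=8, out_dtype="int32")

func = relay.Function([data, weight], out)
# Using the pre-0.6 pass API of this era:
print(relay.ir_pass.infer_type(func))  # result type: Tensor[(1, 8), int32]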
22 changes: 22 additions & 0 deletions python/tvm/relay/quantize/_annotate.py
@@ -171,6 +171,28 @@ def conv2d_rewrite(ref_call, new_args, ctx):
return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)


@register_annotate_function("nn.dense")
def dense_rewrite(ref_call, new_args, ctx):
"""Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of
dense will be quantized to weight field. Output would be in activation field."""
if not current_qconfig().quantize_dense:
return None
cnt = _conv_counter()
if cnt < current_qconfig().skip_k_conv:
return None
lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
rhs_expr, rhs_kind = _get_expr_kind(new_args[1])

if lhs_kind is None or lhs_kind != QAnnotateKind.INPUT:
lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)

assert rhs_kind is None
rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)

expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)


@register_annotate_function("multiply")
def multiply_rewrite(ref_call, new_args, ctx):
"""Rewrite function for multiply."""
6 changes: 5 additions & 1 deletion python/tvm/relay/quantize/quantize.py
@@ -74,7 +74,8 @@ class QConfig(NodeBase):
"round_for_shift": True,
"store_lowbit_output": True,
"debug_enabled_ops": None,
"use_stop_fusion": True
"use_stop_fusion": True,
"quantize_dense": True
}

# pylint: disable=no-member
@@ -150,6 +151,9 @@ def qconfig(**kwargs):
Whether add stop_fusion when casting to dtype_activation. stop_fusion forces lowbit
results to be stored in memory.

quantize_dense: boolean
Whether to quantize dense layers.

Returns
-------
config: QConfig
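A hedged usage sketch for the new flag (func and params are placeholders for an existing Relay function and its parameters): quantize_dense plugs into the usual qconfig scope, and skip_k_conv is set to 0 here so the counter check in dense_rewrite above does not skip the dense layers:

from tvm import relay

# func/params are assumed to come from an existing model import.
with relay.quantize.qconfig(quantize_dense=True, skip_k_conv=0):
    qfunc = relay.quantize.quantize(func, params=params)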
14 changes: 10 additions & 4 deletions src/relay/op/nn/nn.cc
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -125,18 +125,24 @@ bool DenseRel(const Array<Type>& types,
oshape.Set((oshape.size() - 1), wshape[0]);
}

DataType out_dtype = param->out_dtype;
if (out_dtype.bits() == 0) {
out_dtype = data->dtype;
}
// assign output type
reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
return true;
}


// Positional relay function to create dense operator used by frontend FFI.
Expr MakeDense(Expr data,
Expr weight,
IndexExpr units) {
IndexExpr units,
DataType out_dtype) {
auto attrs = make_node<DenseAttrs>();
attrs->units = units;
attrs->out_dtype = out_dtype;
static const Op& op = Op::Get("nn.dense");
return CallNode::make(op, {data, weight}, Attrs(attrs), {});
}
38 changes: 37 additions & 1 deletion src/relay/pass/quantize.cc
@@ -296,6 +296,41 @@ RELAY_REGISTER_OP("nn.conv2d")
.set_attr<FForwardRewrite>("FQRealizeRewrite", Conv2dRealize);


Expr DenseRealize(const Call& ref_call,
const Array<Expr>& new_args,
const NodeRef& ctx) {
const QConfig& cfg = QConfig::Current();
CHECK_EQ(new_args.size(), 2);
if (!new_args[0]->derived_from<TempExprNode>() && !new_args[1]->derived_from<TempExprNode>()) {
return Expr(nullptr);
}
const auto* lhs = new_args[0].as<QRealizeIntExprNode>();
CHECK(lhs);
const auto* rhs = new_args[1].as<QRealizeIntExprNode>();
CHECK(rhs);

Expr ldata = lhs->data;
if (lhs->dtype != cfg->dtype_input) {
ldata = Cast(ldata, cfg->dtype_input);
}
Expr rdata = Cast(rhs->data, cfg->dtype_weight);

const auto ref_attrs = ref_call->attrs.as<DenseAttrs>();
auto attrs = make_node<DenseAttrs>();
*attrs = *ref_attrs;
DataType out_dtype = cfg->dtype_activation;
attrs->out_dtype = out_dtype;

Expr ret = CallNode::make(ref_call->op,
{ldata, rdata}, Attrs(attrs), ref_call->type_args);
Expr dom_scale = FoldConstant(Multiply(lhs->dom_scale, rhs->dom_scale));
return QRealizeIntExprNode::make(ret, dom_scale, out_dtype);
}

RELAY_REGISTER_OP("nn.dense")
.set_attr<FForwardRewrite>("FQRealizeRewrite", DenseRealize);


Expr MulRealize(const Call& ref_call,
const Array<Expr>& new_args,
const NodeRef& ctx) {
@@ -566,7 +601,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
p->stream << "round_for_shift==" << op->round_for_shift << ", ";
p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", ";
p->stream << "debug_enabled_ops==" << op->debug_enabled_ops << ", ";
p->stream << "use_stop_fusion==" << op->use_stop_fusion;
p->stream << "use_stop_fusion==" << op->use_stop_fusion << ", ";
p->stream << "quantize_dense==" << op->quantize_dense;
p->stream << ")";
});

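The dom_scale product in DenseRealize follows the same identity as the conv2d realize: if the quantized operands represent qx = x / s_x and qw = w / s_w, their integer dense output represents the float result divided by s_x * s_w. A small numpy sketch of that identity (all values made up):

import numpy as np

x = np.random.uniform(-1, 1, (1, 16)).astype("float32")
w = np.random.uniform(-1, 1, (8, 16)).astype("float32")
s_x, s_w = 0.05, 0.02  # hypothetical domain scales

qx = np.round(x / s_x).astype("int32")
qw = np.round(w / s_w).astype("int32")

# Integer dense; rescaling by s_x * s_w recovers the float product,
# which is why the realized scale is Multiply(lhs->dom_scale, rhs->dom_scale).
y_approx = (qx @ qw.T) * (s_x * s_w)
print(np.max(np.abs(y_approx - x @ w.T)))  # small rounding error only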
2 changes: 2 additions & 0 deletions src/relay/pass/quantize.h
@@ -130,6 +130,7 @@ class QConfigNode : public Node {
bool store_lowbit_output = true;
Array<Expr> debug_enabled_ops = Array<Expr>(NodePtr<Node>(nullptr));
bool use_stop_fusion = true;
bool quantize_dense = true;

void VisitAttrs(AttrVisitor* v) final {
v->Visit("nbit_input", &nbit_input);
@@ -144,6 +145,7 @@
v->Visit("store_lowbit_output", &store_lowbit_output);
v->Visit("debug_enabled_ops", &debug_enabled_ops);
v->Visit("use_stop_fusion", &use_stop_fusion);
v->Visit("quantize_dense", &quantize_dense);
}

static constexpr const char* _type_key = "relay.quantize.QConfig";
7 changes: 5 additions & 2 deletions topi/include/topi/cuda/dense.h
@@ -44,13 +44,15 @@ namespace cuda {
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense_cuda(const Target& target,
const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -62,6 +64,7 @@ inline tvm::Tensor dense_cuda(const Target& target,
auto out_dim = weight->shape[0];

if (target->libs().count("cublas")) {
CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
auto mm = topi::contrib::cublas_matmul(data, weight, false, true);
if (bias.defined()) {
mm = tvm::compute({ batch, out_dim },
@@ -72,7 +75,7 @@

return mm;
} else {
return topi::nn::dense(data, weight, bias);
return topi::nn::dense(data, weight, bias, out_dtype);
}
}

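A side note on the cuBLAS branch: it only fires when the target carries -libs=cublas, and the CHECK above rejects mixed-precision requests on that path rather than silently computing in the wrong dtype; without cuBLAS in the target libs, the generic topi::nn::dense branch handles them. A quick sketch of the lib detection, assuming the 2019-era tvm.target API:

import tvm

target = tvm.target.create("cuda -libs=cublas")
print("cublas" in target.libs)  # True -> dense_cuda takes the cuBLAS path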
9 changes: 6 additions & 3 deletions topi/include/topi/nn/dense.h
@@ -40,12 +40,14 @@ using namespace tvm;
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense(const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -60,14 +62,15 @@ inline tvm::Tensor dense(const tvm::Tensor& data,
auto matmul = tvm::compute(
{ batch, out_dim },
[&](Var i, Var j) {
return tvm::sum(data(i, k) * weight(j, k), { k });
return tvm::sum(tvm::cast(out_dtype, data(i, k)) *
tvm::cast(out_dtype, weight(j, k)), { k });
}, "tensor", "dense");

if (bias.defined()) {
matmul = tvm::compute(
{ batch, out_dim },
[&](Var i, Var j) {
return matmul(i, j) + bias(j);
return matmul(i, j) + tvm::cast(out_dtype, bias(j));
}, "tensor", kBroadcast);
}

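The Python TOPI wrapper appears to mirror this signature (the autotvm hook earlier in the diff passes an extra positional argument to topi.nn.dense); a minimal sketch with illustrative shapes, using the tvm.placeholder API of this era, where out_dtype triggers the casts shown in the C++ compute above:

import tvm
import topi

data = tvm.placeholder((1, 16), dtype="int8", name="data")
weight = tvm.placeholder((8, 16), dtype="int8", name="weight")

# int8 operands, int32 accumulation via the new out_dtype argument.
out = topi.nn.dense(data, weight, bias=None, out_dtype="int32")
print(out.dtype)  # int32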
7 changes: 5 additions & 2 deletions topi/include/topi/rocm/dense.h
@@ -45,13 +45,15 @@ namespace rocm {
* \param data Tensor with shape [batch, in_dim]
* \param weight Tensor with shape [out_dim, in_dim]
* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
* \param out_dtype Output data type. Used for mixed precision.
*
* \return Tensor with shape [batch, out_dim]
*/
inline tvm::Tensor dense_rocm(const Target& target,
const tvm::Tensor& data,
const tvm::Tensor& weight,
const tvm::Tensor& bias) {
const tvm::Tensor& bias,
const Type& out_dtype) {
CHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
CHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
if (bias.defined()) {
@@ -63,6 +65,7 @@ inline tvm::Tensor dense_rocm(const Target& target,
auto out_dim = weight->shape[0];

if (target->libs().count("rocblas")) {
CHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
auto mm = topi::contrib::rocblas_matmul(data, weight, false, true);
if (bias.defined()) {
mm = tvm::compute({ batch, out_dim },
@@ -73,7 +76,7 @@

return mm;
} else {
return topi::nn::dense(data, weight, bias);
return topi::nn::dense(data, weight, bias, out_dtype);
}
}

4 changes: 2 additions & 2 deletions topi/python/topi/cuda/__init__.py
@@ -2,15 +2,15 @@
"""CUDA specific declaration and schedules."""
from __future__ import absolute_import as _abs

from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, group_conv2d_nchw
from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, deformable_conv2d, \
group_conv2d_nchw, dense
from .conv2d_hwcn import schedule_conv2d_hwcn
from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
from .group_conv2d_nchw import schedule_conv2d_nchw_cuda
from .reduction import schedule_reduce
from .softmax import schedule_softmax
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import dense_cuda, schedule_dense
from .pooling import schedule_pool, schedule_global_pool
from .extern import schedule_extern
from .nn import schedule_lrn, schedule_l2_normalize