From 06404d3bac98b730fcd50394c39874d0eef85e80 Mon Sep 17 00:00:00 2001
From: Andrew Tulloch
Date: Tue, 23 Jul 2019 14:43:27 -0700
Subject: [PATCH] {relay,topi}.reinterpret support (#3599)

= Motivation

It's useful to expose the tvm::reinterpret functionality to Relay/TOPI users, as this
allows them to build (fused) operators that leverage the bitwise reinterpretation of an
operand. An example is approximate transcendental functions, which can be implemented
along these lines:

```python
def C(x):
    return relay.expr.const(x, "float32")

def approx_exp(x):
    x = relay.minimum(relay.maximum(x, C(-88.0)), C(88.0))
    x = C(127.0) + x * C(1.44269504)
    xf = relay.floor(x)
    i = relay.cast(xf, "int32")
    x = x - xf
    Y = C(0.99992522) + x * (C(0.69583354) + x * (C(0.22606716) + x * C(0.078024523)))
    exponent = relay.left_shift(i, relay.expr.const(23, "int32"))
    exponent = relay.reinterpret(exponent, "float32")
    return exponent * Y

def approx_sigmoid(x):
    # <2.0e-5 absolute error over [-5, 5]
    y = approx_exp(x)
    return y / (y + C(1.0))

def approx_tanh(x):
    # <4.0e-5 absolute error over [-5, 5]
    x = x * C(2.0)
    y = approx_exp(x)
    return (y - C(1.0)) / (y + C(1.0))
```

See the unit tests for implementations of these approximate transcendentals.
---
 docs/api/python/topi.rst                     |  2 +
 docs/langref/relay_op.rst                    |  2 +
 python/tvm/relay/op/_transform.py            |  1 +
 python/tvm/relay/op/transform.py             | 20 ++++++
 src/codegen/codegen_c.cc                     | 11 +++-
 src/relay/op/tensor/transform.cc             | 31 ++++++++++
 tests/python/relay/test_op_level3.py         | 64 ++++++++++++++++++++
 tests/python/unittest/test_codegen_c_host.py | 25 ++++++++
 topi/include/topi/elemwise.h                 | 36 ++++++++---
 topi/python/topi/math.py                     | 18 ++++++
 topi/src/topi.cc                             |  6 ++
 topi/tests/python/test_topi_transform.py     | 36 +++++++++++
 12 files changed, 242 insertions(+), 10 deletions(-)

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 9ac8bb1fd084..8f59e08c0797 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -40,6 +40,7 @@ List of operators
    topi.sigmoid
    topi.clip
    topi.cast
+   topi.reinterpret
    topi.transpose
    topi.flip
    topi.strided_slice
@@ -133,6 +134,7 @@ topi
 .. autofunction:: topi.sigmoid
 .. autofunction:: topi.clip
 .. autofunction:: topi.cast
+.. autofunction:: topi.reinterpret
 .. autofunction:: topi.transpose
 .. autofunction:: topi.flip
 .. autofunction:: topi.strided_slice
diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index dad5eb89a053..61c9b36e1ffd 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -114,6 +114,7 @@ This level enables additional math and transform operators.
    tvm.relay.full
    tvm.relay.full_like
    tvm.relay.cast
+   tvm.relay.reinterpret
    tvm.relay.split
    tvm.relay.arange
    tvm.relay.stack
@@ -263,6 +264,7 @@ Level 3 Definitions
 .. autofunction:: tvm.relay.full
 .. autofunction:: tvm.relay.full_like
 .. autofunction:: tvm.relay.cast
+.. autofunction:: tvm.relay.reinterpret
 .. autofunction:: tvm.relay.split
 .. autofunction:: tvm.relay.arange
 .. autofunction:: tvm.relay.stack
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 0749bbd02f1d..51e761516eed 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -40,6 +40,7 @@
 _reg.register_schedule("repeat", schedule_broadcast)
 _reg.register_schedule("tile", schedule_broadcast)
 _reg.register_schedule("cast", schedule_injective)
+_reg.register_schedule("reinterpret", schedule_injective)
 _reg.register_schedule("strided_slice", schedule_injective)
 _reg.register_schedule("slice_like", schedule_injective)
 _reg.register_schedule("split", schedule_injective)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index 5137a9c469a4..5d8d28006ecb 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -40,6 +40,26 @@ def cast(data, dtype):
     return _relay_make.cast(data, dtype)
 
 
+def reinterpret(data, dtype):
+    """Reinterpret input tensor to data type.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    dtype: str
+        The target data type
+
+    Returns
+    -------
+    result : relay.Expr
+        The reinterpreted result.
+    """
+    from .. import _make as _relay_make
+    return _relay_make.reinterpret(data, dtype)
+
+
 def expand_dims(data, axis, num_newaxis=1):
     """Insert `num_newaxis` axises at the position given by `axis`.
 
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index 19f7a270b865..bbd28baea9b5 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -569,6 +569,13 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) {  // NOLINT(*)
     os << "(";
     this->PrintExpr(op->args[0], os);
     os << " == NULL)";
+  } else if (op->is_intrinsic(Call::reinterpret)) {
+    // generate (*( TYPE *)(&(ARG)))
+    os << "(*(";
+    this->PrintType(op->type, os);
+    os << " *)(&(";
+    this->PrintExpr(op->args[0], os);
+    os << ")))";
   } else {
     if (op->call_type == Call::Intrinsic ||
         op->call_type == Call::PureIntrinsic) {
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 59424884ccfe..0b501e2ff119 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -97,6 +97,37 @@ RELAY_REGISTER_OP("cast")
 .set_attr<TOpPattern>("TOpPattern", kElemWise)
 .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
 
+Array<Tensor> ReinterpretCompute(const Attrs& attrs, const Array<Tensor>& inputs,
+                                 const Type& out_type, const Target& target) {
+  const CastAttrs* param = attrs.as<CastAttrs>();
+  CHECK(param != nullptr);
+  DataType dtype = param->dtype;
+  return {topi::reinterpret(inputs[0], dtype)};
+}
+
+Expr MakeReinterpret(Expr data, DataType dtype) {
+  auto attrs = make_node<CastAttrs>();
+  attrs->dtype = dtype;
+  static const Op& op = Op::Get("reinterpret");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay._make.reinterpret").set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 2>(MakeReinterpret, args, rv);
+});
+
+RELAY_REGISTER_OP("reinterpret")
+    .describe(R"code(Reinterpret the data into a new data type.
+)code" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .set_attrs_type_key("relay.attrs.CastAttrs")
+    .add_argument("data", "Tensor", "The input tensor.")
+    .set_support_level(3)
+    .add_type_rel("Reinterpret", CastRel)
+    .set_attr<FTVMCompute>("FTVMCompute", ReinterpretCompute)
+    .set_attr<TOpPattern>("TOpPattern", kElemWise)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
+
 // relay.expand_dims
 TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
 
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index da3de2b22f74..01c0a120dbcb 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -75,6 +75,7 @@ def test_cast():
     assert "dtype=" in yy.astext()
     assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
 
+
 def test_clip():
     a = relay.var("a", relay.TensorType((10, 4), "float32"))
     y = relay.clip(a, 1., 4.)
@@ -88,6 +89,69 @@ def test_clip():
     np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
 
 
+def test_reinterpret():
+    a = relay.var("a", relay.TensorType((1000, 4), "float32"))
+    y = relay.reinterpret(a, "int32")
+    yy = run_infer_type(y)
+    assert yy.checked_type == relay.TensorType((1000, 4), "int32")
+
+    data = np.random.randn(1000, 4).astype('float32') * 1000
+    intrp = create_executor()
+    op_res = intrp.evaluate(y, {a: relay.const(data)})
+    ref_res = data.view("int32")
+    np.testing.assert_equal(op_res.asnumpy(), ref_res)
+
+
+def test_approximate_transcendental():
+    def C(x):
+        return relay.expr.const(x, "float32")
+
+    def approx_exp(x):
+        # An approximation derived from Opus,
+        # https://github.com/xiph/opus/blob/c1c247/celt/mathops.h#L147-L165
+        x = relay.minimum(relay.maximum(x, C(-88.0)), C(88.0))
+        x = C(127.0) + x * C(1.44269504)
+        xf = relay.floor(x)
+        i = relay.cast(xf, "int32")
+        x = x - xf
+        Y = C(0.99992522) + x * (C(0.69583354) + x * (C(0.22606716) + x * C(0.078024523)))
+        exponent = relay.left_shift(i, relay.expr.const(23, "int32"))
+        exponent = relay.reinterpret(exponent, "float32")
+        return exponent * Y
+
+    def approximate_sigmoid(x):
+        y = approx_exp(x)
+        return y / (y + C(1.0))
+
+    def approximate_tanh(x):
+        x = x * C(2.0)
+        y = approx_exp(x)
+        return (y - C(1.0)) / (y + C(1.0))
+
+    a = relay.var("a", relay.TensorType((1000,), "float32"))
+    y = approximate_sigmoid(a)
+    yy = run_infer_type(y)
+    assert yy.checked_type == relay.TensorType((1000,), "float32")
+    data = np.linspace(-5, 5, 1000).astype("float32")
+    intrp = create_executor()
+    op_res = intrp.evaluate(y, {a: relay.const(data)})
+
+    def reference_sigmoid(x):
+        return np.exp(-np.logaddexp(0, -x))
+    np.testing.assert_allclose(op_res.asnumpy(), reference_sigmoid(data), atol=2e-5, rtol=1e-9)
+
+    y = approximate_tanh(a)
+    yy = run_infer_type(y)
+    assert yy.checked_type == relay.TensorType((1000,), "float32")
+    data = np.linspace(-5, 5, 1000).astype("float32")
+    intrp = create_executor()
+    op_res = intrp.evaluate(y, {a: relay.const(data)})
+
+    def reference_tanh(x):
+        return np.tanh(x)
+    np.testing.assert_allclose(op_res.asnumpy(), reference_tanh(data), atol=4e-5, rtol=1e-9)
+
+
 def test_squeeze():
     def verify_squeeze(shape, dtype, axis):
         x = relay.var("x", relay.TensorType(shape, dtype))
diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py
index 5161c6899db9..70b38e178f69 100644
--- a/tests/python/unittest/test_codegen_c_host.py
+++ b/tests/python/unittest/test_codegen_c_host.py
@@ -95,6 +95,31 @@ def check_c():
     with tvm.build_config(offset_factor=4):
         check_c()
 
+
+def test_reinterpret():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A', dtype="int32")
+    B = tvm.compute(A.shape, lambda *i: tvm.call_pure_intrin("float32", "reinterpret", A(*i)), name='B')
+    s = tvm.create_schedule(B.op)
+
+    def check_c():
+        mhost = tvm.build(s, [A, B], "c", name="reinterpret")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m['reinterpret']
+        ctx = tvm.cpu(0)
+        n = nn
+        a = tvm.nd.array(np.random.randint(-2 ** 30, 2 ** 30, size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        fadd(a, b)
+        tvm.testing.assert_allclose(
+            b.asnumpy(), a.asnumpy().view('float32'))
+    check_c()
+
 if __name__ == "__main__":
     test_add()
     test_add_pipeline()
+    test_reinterpret()
diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h
index b6e6adad0715..000567eeae14 100644
--- a/topi/include/topi/elemwise.h
+++ b/topi/include/topi/elemwise.h
@@ -269,14 +269,34 @@ inline Tensor cast(const Tensor& x,
 }
 
 /*!
-* \brief Creates an operation that sum each element of a tensor
-*
-* \param xs The input tensor array
-* \param name The name of the operation
-* \param tag The tag to mark the operation
-*
-* \return A Tensor whose op member is the sum operation
-*/
+ * \brief Reinterpret each element of x to the given type.
+
+ * \param x The input tensor
+ * \param type The type to cast to
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the reinterpret operation
+ */
+inline Tensor reinterpret(const Tensor& x, Type type, std::string name = "tensor",
+                          std::string tag = kElementWise) {
+  return compute(x->shape,
+                 [&](const Array<Var>& i) {
+                   return tvm::ir::Call::make(type, "reinterpret", {x(i)},
+                                              tvm::ir::Call::PureIntrinsic);
+                 },
+                 name, tag);
+}
+
+/*!
+ * \brief Creates an operation that sum each element of a tensor
+ *
+ * \param xs The input tensor array
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the sum operation
+ */
 inline Tensor elemwise_sum(const Array<Tensor>& xs,
                            std::string name = "T_elemwise_sum",
                            std::string tag = kElementWise) {
diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py
index 406d48969682..87ac06c76c75 100644
--- a/topi/python/topi/math.py
+++ b/topi/python/topi/math.py
@@ -343,3 +343,21 @@ def cast(x, dtype):
         return tvm.compute(
             x.shape, lambda *i: x(*i).astype(dtype), tag=tag.ELEMWISE)
     return tvm.make._cast(dtype, x)
+
+def reinterpret(x, dtype):
+    """Reinterpret input to specified data type.
+
+    Parameters
+    ----------
+    x : tvm.Tensor
+        Input argument.
+
+    dtype : str
+        Data type.
+
+    Returns
+    -------
+    y : tvm.Tensor
+        The result.
+    """
+    return cpp.reinterpret(x, dtype)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 44134d7c2d67..6c5a0b438cb2 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -193,6 +193,12 @@ TVM_REGISTER_GLOBAL("topi.cast")
   *rv = cast(args[0], args[1]);
   });
 
+
+TVM_REGISTER_GLOBAL("topi.reinterpret")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  *rv = reinterpret(args[0], args[1]);
+  });
+
 TVM_REGISTER_GLOBAL("topi.elemwise_sum")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = elemwise_sum(args[0]);
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 7f2c73e00390..f069303461f1 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -45,6 +45,29 @@ def check_device(device):
         check_device(device)
 
 
+def verify_reinterpret(in_shape, in_dtype, out_dtype, generator):
+    A = tvm.placeholder(shape=in_shape, name="A", dtype=in_dtype)
+    B = topi.reinterpret(A, out_dtype)
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            s = topi.generic.schedule_elemwise(B)
+        foo = tvm.build(s, [A, B], device, name="reinterpret")
+        data_npy = generator(in_shape).astype(in_dtype)
+        out_npy = data_npy.view(B.dtype)
+        data_nd = tvm.nd.array(data_npy, ctx)
+        out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), ctx)
+        foo(data_nd, out_nd)
+        np.testing.assert_equal(out_nd.asnumpy(), out_npy)
+
+    for device in get_all_backend():
+        check_device(device)
+
+
 def verify_transpose(in_shape, axes):
     A = tvm.placeholder(shape=in_shape, name="A")
     B = topi.transpose(A, axes)
@@ -434,6 +457,19 @@ def test_expand_dims():
     verify_expand_dims((3, 10), (1, 3, 10), -3, 1)
 
 
+def test_reinterpret():
+    verify_reinterpret((1000,), "float32", "int32",
+                       lambda shape: np.random.randn(*shape) * 1000)
+    verify_reinterpret((1000,), "float16", "int16",
+                       lambda shape: np.random.randn(*shape) * 100)
+    verify_reinterpret((1000,), "int16", "uint16",
+                       lambda shape: np.random.randint(-1000, 1000, size=shape))
+    verify_reinterpret((1000,), "uint32", "int32",
+                       lambda shape: np.random.randint(0, 2 ** 32 - 1, size=shape))
+    verify_reinterpret((1000,), "uint32", "int32",
+                       lambda shape: np.random.randint(0, 2 ** 32 - 1, size=shape))
+
+
 def test_transpose():
     verify_transpose((3, 10, 2), (1, 0, 2))
     verify_transpose((3, 10, 5), (2, 0, 1))
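
= Notes

The exponent trick that approx_exp relies on can be checked without TVM at all: bits 23..30 of an IEEE-754 float32 hold the biased exponent, so writing `i + 127` into that field and viewing the same bits as float32 yields `2.0 ** i`. The snippet below is a NumPy-only sanity check of that step, added here for illustration; it is not part of the patch:

```python
import numpy as np

# Place the biased exponent (i + 127) into bits 23..30 of an int32 and view
# the same bits as float32 -- the bit-level operation relay.reinterpret exposes.
i = np.array([-3, 0, 1, 10], dtype="int32")
bits = (i + 127) << 23
as_float = bits.view("float32")

np.testing.assert_equal(as_float, [0.125, 1.0, 2.0, 1024.0])
```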
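At the Relay level the new operator round-trips bit patterns exactly the way NumPy's `view` does. A minimal sketch, using the same interpreter entry point (`relay.create_executor`) that the new test_reinterpret test above uses; treat it as an illustration rather than part of the patch:

```python
import numpy as np
from tvm import relay

a = relay.var("a", relay.TensorType((4,), "float32"))
y = relay.reinterpret(a, "int32")

data = np.array([1.0, -2.0, 0.5, 3.75], dtype="float32")
intrp = relay.create_executor()
op_res = intrp.evaluate(y, {a: relay.const(data)})

# The op only relabels the bits, so NumPy's view gives the reference result.
np.testing.assert_equal(op_res.asnumpy(), data.view("int32"))
```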
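The TOPI entry point can be driven directly as well, mirroring verify_reinterpret above but pinned to the llvm backend. Again a sketch, assuming an LLVM-enabled TVM build:

```python
import numpy as np
import tvm
import topi

A = tvm.placeholder((1000,), name="A", dtype="float32")
B = topi.reinterpret(A, "int32")
with tvm.target.create("llvm"):
    s = topi.generic.schedule_elemwise(B)
f = tvm.build(s, [A, B], "llvm", name="reinterpret")

# Run on CPU and compare against NumPy's bitwise view of the same data.
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.randn(1000).astype("float32"), ctx)
b = tvm.nd.array(np.empty(1000, dtype="int32"), ctx)
f(a, b)
np.testing.assert_equal(b.asnumpy(), a.asnumpy().view("int32"))
```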