diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h
index a9f8f630471f8..b3681e17da8df 100644
--- a/topi/include/topi/elemwise.h
+++ b/topi/include/topi/elemwise.h
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -31,6 +31,7 @@
 #include "tvm/tvm.h"
 #include "tvm/ir.h"
 #include "tvm/ir_pass.h"
+#include "broadcast.h"
 
 namespace topi {
 using namespace tvm;
@@ -46,7 +47,6 @@ using namespace tvm;
   }
 
 TOPI_DECLARE_UNARY_OP(exp);
-TOPI_DECLARE_UNARY_OP(tanh);
 TOPI_DECLARE_UNARY_OP(sigmoid);
 TOPI_DECLARE_UNARY_OP(sqrt);
 TOPI_DECLARE_UNARY_OP(log);
@@ -56,6 +56,74 @@ TOPI_DECLARE_UNARY_OP(round);
 TOPI_DECLARE_UNARY_OP(trunc);
 TOPI_DECLARE_UNARY_OP(abs);
 
+/*
+ * \brief Fast_tanh_float implementation from Eigen
+ * https://github.com/eigenteam/eigen-git-mirror/blob/master/Eigen/src/Core/MathFunctionsImpl.h#L26
+ */
+inline Tensor fast_tanh_float(const Tensor& in,
+                              std::string name,
+                              std::string tag) {
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  auto x = maximum(minimum(in, make_const(in->dtype, 9.0)), make_const(in->dtype, -9.0));
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  auto alpha_1 = make_const(in->dtype, 4.89352455891786e-03);
+  auto alpha_3 = make_const(in->dtype, 6.37261928875436e-04);
+  auto alpha_5 = make_const(in->dtype, 1.48572235717979e-05);
+  auto alpha_7 = make_const(in->dtype, 5.12229709037114e-08);
+  auto alpha_9 = make_const(in->dtype, -8.60467152213735e-11);
+  auto alpha_11 = make_const(in->dtype, 2.00018790482477e-13);
+  auto alpha_13 = make_const(in->dtype, -2.76076847742355e-16);
+
+  // The monomial coefficients of the denominator polynomial (even).
+  auto beta_0 = make_const(in->dtype, 4.89352518554385e-03);
+  auto beta_2 = make_const(in->dtype, 2.26843463243900e-03);
+  auto beta_4 = make_const(in->dtype, 1.18534705686654e-04);
+  auto beta_6 = make_const(in->dtype, 1.19825839466702e-06);
+
+  return compute(x->shape,
+                 [&](const Array<Var>& i) {
+                   auto x2 = x(i) * x(i);
+                   auto p = x2 * alpha_13 + alpha_11;
+                   p = x2 * p + alpha_9;
+                   p = x2 * p + alpha_7;
+                   p = x2 * p + alpha_5;
+                   p = x2 * p + alpha_3;
+                   p = x2 * p + alpha_1;
+                   p = x(i) * p;
+
+                   auto q = x2 * beta_6 + beta_4;
+                   q = x2 * q + beta_2;
+                   q = x2 * q + beta_0;
+                   return p / q;
+                 },
+                 name, tag);
+}
+
+/*!
+ * \brief Creates an operation that returns the hyperbolic tangent of a given tensor
+ *
+ * \param x The input tensor
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is tanh
+ */
+inline Tensor tanh(const Tensor& x,
+                   std::string name = "T_tanh",
+                   std::string tag = kElementWise) {
+  if (x->dtype == Float(32)) {
+    // Invoke the fast float32 implementation.
+    return fast_tanh_float(x, name, tag);
+  } else {
+    // Fall back to the default implementation.
+    return compute(x->shape, [&](const Array<Var>& i) {
+      return ::tvm::tanh(x(i));
+    }, name, tag);
+  }
+}
+
 /*!
  * \brief Creates an operation that returns identity of a given tensor
  *
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index d6df450628d26..a4b4b3a2ca063 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -29,13 +29,22 @@ def test_util():
 
 
 def test_ewise():
-    m = tvm.var('m')
-    l = tvm.var('l')
-    A = tvm.placeholder((m, l), name='A')
+    def test_apply(
+            func,
+            name,
+            f_numpy,
+            low,
+            high,
+            shape=(20, 3),
+            dtype=tvm.float32,
+            maxulp=1,
+            check_round=False,
+            skip_name_check=False,
+    ):
+        m = tvm.var("m")
+        l = tvm.var("l")
+        A = tvm.placeholder((m, l), dtype=dtype, name="A")
 
-    shape = (20, 3)
-
-    def test_apply(func, name, f_numpy, low, high, check_round=False, skip_name_check=False):
         B = func(A)
         assert tuple(B.shape) == tuple(A.shape)
         if not skip_name_check:
@@ -58,12 +67,11 @@ def check_device(device):
             a = tvm.nd.array(a_np, ctx)
             b = tvm.nd.array(np.zeros_like(b_np), ctx)
             foo(a, b)
-            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+            np.testing.assert_array_max_ulp(b.asnumpy(), b_np, maxulp=maxulp)
 
         for device in get_all_backend():
             check_device(device)
 
-
     test_apply(topi.floor, "floor", np.floor, -100, 100)
     test_apply(topi.ceil, "ceil", np.ceil, -100, 100)
     test_apply(topi.sign, "sign", np.sign, -100, 100, skip_name_check=True)
@@ -71,11 +79,12 @@ def check_device(device):
     test_apply(topi.abs, "fabs", np.abs, -100, 100)
     test_apply(topi.round, "round", np.round, -100, 100, check_round=True)
     test_apply(topi.exp, "exp", np.exp, -1, 1)
-    test_apply(topi.tanh, "tanh", np.tanh, -10, 10)
-    test_apply(topi.sigmoid, "sigmoid", lambda x:1/(1+np.exp(-x)), -1, 1)
+    test_apply(topi.tanh, "tanh", np.tanh, -100, 100, shape=(128, 128), maxulp=1)
+    test_apply(topi.tanh, "tanh", np.tanh, -100, 100, shape=(128, 128), dtype="float64", maxulp=3)
+    test_apply(topi.sigmoid, "sigmoid", lambda x: 1 / (1 + np.exp(-x)), -1, 1)
     test_apply(topi.log, "log", np.log, 0, 100)
     test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100)
-    test_apply(topi.rsqrt, "rsqrt", lambda x:np.ones_like(x)/np.sqrt(x), 0, 100, skip_name_check=True)
+    test_apply(topi.rsqrt, "rsqrt", lambda x: np.ones_like(x) / np.sqrt(x), 0, 100, skip_name_check=True)
 
 
 def test_cast():
@@ -93,7 +102,7 @@ def verify(from_dtype, to_dtype, low=-100, high=100):
         b_np = a_np.astype(to_dtype)
 
         for device in get_all_backend():
-            ctx = tvm.context(device, 0) 
+            ctx = tvm.context(device, 0)
             if not ctx.exist:
                 print("Skip because %s is not enabled" % device)
                 continue
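
Note (editor's illustration, not part of the patch): fast_tanh_float computes tanh(x) ~= p(x) / q(x), where p(x) = x * (alpha_1 + alpha_3 x^2 + ... + alpha_13 x^12) is odd and q(x) = beta_0 + beta_2 x^2 + beta_4 x^4 + beta_6 x^6 is even, both evaluated by Horner's rule after clamping x to [-9, 9]. The NumPy sketch below mirrors that evaluation and measures its deviation from np.tanh in float32 ULPs, the same criterion the updated test enforces through np.testing.assert_array_max_ulp. The name fast_tanh_float_np and the np.spacing-based ULP estimate are choices made for this sketch only; they are not TVM APIs.

import numpy as np

# Numerator (odd, alpha_1 .. alpha_13) and denominator (even, beta_0 .. beta_6)
# coefficients, copied verbatim from the diff above.
ALPHA = [4.89352455891786e-03, 6.37261928875436e-04, 1.48572235717979e-05,
         5.12229709037114e-08, -8.60467152213735e-11, 2.00018790482477e-13,
         -2.76076847742355e-16]
BETA = [4.89352518554385e-03, 2.26843463243900e-03, 1.18534705686654e-04,
        1.19825839466702e-06]

def fast_tanh_float_np(x):
    # Clamp to [-9, 9]; tanh saturates to +/-1.0f just outside this range.
    x = np.clip(x.astype(np.float32), -9.0, 9.0)
    x2 = x * x
    # Horner chain for the numerator, highest-order coefficient first,
    # mirroring the "p = x2 * p + alpha_k" sequence in fast_tanh_float.
    p = np.float32(ALPHA[-1])
    for a in ALPHA[-2::-1]:
        p = x2 * p + np.float32(a)
    p = x * p
    # Horner chain for the denominator ("q = x2 * q + beta_k").
    q = np.float32(BETA[-1])
    for b in BETA[-2::-1]:
        q = x2 * q + np.float32(b)
    return p / q

xs = np.random.uniform(-100, 100, size=(128, 128)).astype(np.float32)
ref = np.tanh(xs)
# Estimate the ULP distance via the float32 spacing at the reference values.
ulp = (np.abs(fast_tanh_float_np(xs).astype(np.float64) - ref.astype(np.float64))
       / np.spacing(np.abs(ref)).astype(np.float64))
print("max deviation from np.tanh: %.2f ULP" % ulp.max())

This is the same shape and input range the new float32 test case uses; the clamp matters because the rational approximation itself diverges outside [-9, 9], while tanh is already within one float32 ULP of +/-1 there.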