TensorFlow: upstream changes from eigen to fix build from changes in last commit.
Vijay Vasudevan committed Dec 2, 2015
1 parent bf6b536 commit bb7a7a8
Showing 8 changed files with 312 additions and 33 deletions.
33 changes: 33 additions & 0 deletions third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
@@ -486,6 +486,39 @@ struct functor_traits<scalar_cube_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };


/** \internal
* \brief Template functor to compute the signum of a scalar
* \sa class CwiseUnaryOp, Cwise::sign()
*/
template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
template<typename Scalar>
struct scalar_sign_op<Scalar,false> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
{
return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
}
};
template<typename Scalar>
struct scalar_sign_op<Scalar,true> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
{
typename NumTraits<Scalar>::Real aa = std::abs(a);
return (aa==0) ? Scalar(0) : (a/aa);
}
};
template<typename Scalar>
struct functor_traits<scalar_sign_op<Scalar> >
{ enum {
Cost =
NumTraits<Scalar>::IsComplex
? ( 8*NumTraits<Scalar>::MulCost ) // roughly
: ( 3*NumTraits<Scalar>::AddCost),
PacketAccess = false,
};
};

} // end namespace internal

} // end namespace Eigen
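
For reference, a minimal stand-alone C++ sketch (not part of the commit) of what the two scalar_sign_op specializations compute: real scalars map to {-1, 0, +1}, while complex scalars map to a/|a|, with 0 mapped to 0.

#include <cassert>
#include <cmath>
#include <complex>

// Real branch: (a > 0) - (a < 0), cast back to the scalar type.
template <typename Scalar>
Scalar real_sign(const Scalar& a) {
  return Scalar((a > Scalar(0)) - (a < Scalar(0)));
}

// Complex branch: a / |a|, with zero input returning zero.
std::complex<double> complex_sign(const std::complex<double>& a) {
  const double aa = std::abs(a);
  return (aa == 0) ? std::complex<double>(0) : (a / aa);
}

int main() {
  assert(real_sign(-2.5) == -1.0);
  assert(real_sign(0.0)  ==  0.0);
  assert(real_sign(7)    ==  1);

  const std::complex<double> s = complex_sign({3.0, 4.0});  // |3+4i| = 5
  assert(std::abs(s.real() - 0.6) < 1e-12 && std::abs(s.imag() - 0.8) < 1e-12);
  return 0;
}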
3 changes: 2 additions & 1 deletion third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -59,7 +59,7 @@
#include <curand_kernel.h>
#endif // defined(__CUDACC__)
#else
#include "perftools/gputools/executor/gcuda.h"
#include "platforms/gpus/gcudacc/runtime/gcudacc_runtime.h"
#ifdef __CUDACC__
#include "third_party/gpus/cuda/curand_device/curand_kernel.h"
#endif // defined(__CUDACC__)
@@ -88,6 +88,7 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"

@@ -80,6 +80,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_opposite_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
sign() const {
return unaryExpr(internal::scalar_sign_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
sqrt() const {
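
A hypothetical usage sketch of the new sign() method added above, evaluated on the CPU; it assumes the unsupported Tensor module is on the include path, and the values are made up.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> t(4);
  t.setValues({-3.5f, 0.0f, 2.0f, 8.0f});

  Eigen::Tensor<float, 1> s = t.sign();                   // elementwise signum
  for (int i = 0; i < 4; ++i) std::cout << s(i) << " ";   // prints: -1 0 1 1
  std::cout << std::endl;
  return 0;
}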
@@ -757,11 +757,17 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) {
}

struct GpuDevice {
GpuDevice()
: stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
allocator_(nullptr),
stream_exec_(stream_->parent()),
device_descr_(&(stream_exec_->GetDeviceDescription())) {}
// Default constructor: Get [cached] device 0 and its default stream.
GpuDevice() : allocator_(nullptr) {
perftools::gputools::Platform* platform =
perftools::gputools::MultiPlatformManager::PlatformWithName("cuda")
.ValueOrDie();
stream_exec_ = platform->ExecutorForDevice(0).ValueOrDie();
// TODO(rspringer): If we ever pull from an executor aside from 0, this will
// need to be preceded by a call to SetDevice(N);
stream_ = platforms::gpus::gcudacc::GetDefaultStream();
device_descr_ = &(stream_exec_->GetDeviceDescription());
}

GpuDevice(perftools::gputools::Stream* stream,
const Allocator* alloc = nullptr)
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
@@ -438,11 +440,13 @@
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
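
A small stand-alone sketch of the new launch-configuration arithmetic (the device figures below are made up; the real code uses numext::mini/maxi and the CUDA device queries shown above):

#include <algorithm>
#include <cassert>

int num_blocks_for(long long size, int multiprocessors, int threads_per_mp,
                   int block_size) {
  const long long max_blocks =
      (long long)multiprocessors * threads_per_mp / block_size;
  const long long wanted = (size + block_size - 1) / block_size;  // ceil(size / block_size)
  // Clamp to the occupancy bound, but never launch fewer than one block,
  // so size-0 tensors no longer produce an invalid zero-block launch.
  return (int)std::max<long long>(std::min<long long>(max_blocks, wanted), 1);
}

int main() {
  // Hypothetical device: 16 SMs, 2048 threads per SM, 1024 threads per block.
  assert(num_blocks_for(0, 16, 2048, 1024) == 1);         // empty tensor -> 1 block
  assert(num_blocks_for(5000, 16, 2048, 1024) == 5);      // ceil(5000 / 1024)
  assert(num_blocks_for(1 << 20, 16, 2048, 1024) == 32);  // capped at max_blocks
  return 0;
}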
@@ -59,13 +59,8 @@ namespace {

template <typename T>
struct DividerTraits {
#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
static const int N = sizeof(T) * 8;
#else
typedef uint32_t type;
static const int N = 32;
#endif
};


@@ -78,40 +73,39 @@ namespace {
#endif
}

template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
return __umul64hi(a, b);
}
#else
template <typename T>
EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
#else
EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
return (a * b) >> 32;
return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
#endif
}
#endif

template <int N, typename T>
struct DividerHelper {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
}
};

#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
return static_cast<uint64_t>(result);
#endif
}
};
#endif

}
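
The TensorUInt128 fallback above exists to obtain the high 64 bits of a 64x64-bit product on targets without __umul64hi or __uint128_t. Below is a stand-alone sketch of that computation using 32-bit halves (not the Eigen implementation), checked against __uint128_t where available:

#include <cassert>
#include <cstdint>

// High 64 bits of a*b, computed from 32-bit halves with explicit carry handling.
static uint64_t mul_high_64(uint64_t a, uint64_t b) {
  const uint64_t a_lo = a & 0xffffffffu, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xffffffffu, b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xffffffffu) + lo_hi;  // cannot overflow
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
}

int main() {
  const uint64_t a = 0x123456789abcdef0ull, b = 0xfedcba9876543210ull;
#if defined(__SIZEOF_INT128__)
  const uint64_t ref = (uint64_t)(((__uint128_t)a * (__uint128_t)b) >> 64);
  assert(mul_high_64(a, b) == ref);
#endif
  (void)a; (void)b;
  return 0;
}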


@@ -141,6 +141,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
m_unshuffledInputStrides[NumDims - 1] = 1;
@@ -149,6 +150,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}

@@ -319,14 +321,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[0];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
Expand All @@ -338,6 +340,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
Dimensions m_dimensions;
array<Index, NumDims> m_inverseShuffle;
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_unshuffledInputStrides;
TensorEvaluator<ArgType, Device> m_impl;
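
For context, a usage sketch of the shuffling op whose index mapping the cached divisors accelerate (dimensions are arbitrary; per the Tensor docs, output.dimension(i) == input.dimension(shuffle[i])):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(20, 30, 50);
  input.setRandom();

  Eigen::array<Eigen::DenseIndex, 3> shuffle{{1, 2, 0}};
  Eigen::Tensor<float, 3> output = input.shuffle(shuffle);

  // Each coefficient lookup in the evaluator divides the output index by the
  // output strides; TensorIntDivisor turns those divisions into multiplies.
  eigen_assert(output.dimension(0) == 30);
  eigen_assert(output.dimension(1) == 50);
  eigen_assert(output.dimension(2) == 20);
  return 0;
}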