TensorFlow: upstream changes from eigen to fix build from changes in last commit.
Vijay Vasudevan committed Dec 2, 2015
1 parent bf6b536 commit bb7a7a8
Showing 8 changed files with 312 additions and 33 deletions.
33 changes: 33 additions & 0 deletions third_party/eigen3/Eigen/src/Core/functors/UnaryFunctors.h
@@ -486,6 +486,39 @@ struct functor_traits<scalar_cube_op<Scalar> >
{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };


/** \internal
* \brief Template functor to compute the signum of a scalar
* \sa class CwiseUnaryOp, Cwise::sign()
*/
template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
template<typename Scalar>
struct scalar_sign_op<Scalar,false> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
{
return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
}
};
template<typename Scalar>
struct scalar_sign_op<Scalar,true> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
{
typename NumTraits<Scalar>::Real aa = std::abs(a);
return (aa==0) ? Scalar(0) : (a/aa);
}
};
template<typename Scalar>
struct functor_traits<scalar_sign_op<Scalar> >
{ enum {
Cost =
NumTraits<Scalar>::IsComplex
? ( 8*NumTraits<Scalar>::MulCost ) // roughly
: ( 3*NumTraits<Scalar>::AddCost),
PacketAccess = false,
};
};

} // end namespace internal

} // end namespace Eigen
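
For reference, a minimal stand-alone C++ sketch (not part of the commit) of what the two scalar_sign_op specializations compute: real scalars map to {-1, 0, +1}, while complex scalars map to a/|a|, with 0 mapped to 0.

#include <cassert>
#include <cmath>
#include <complex>

// Real branch: (a > 0) - (a < 0), cast back to the scalar type.
template <typename Scalar>
Scalar real_sign(const Scalar& a) {
  return Scalar((a > Scalar(0)) - (a < Scalar(0)));
}

// Complex branch: a / |a|, with zero input returning zero.
std::complex<double> complex_sign(const std::complex<double>& a) {
  const double aa = std::abs(a);
  return (aa == 0) ? std::complex<double>(0) : (a / aa);
}

int main() {
  assert(real_sign(-2.5) == -1.0);
  assert(real_sign(0.0)  ==  0.0);
  assert(real_sign(7)    ==  1);

  const std::complex<double> s = complex_sign({3.0, 4.0});  // |3+4i| = 5
  assert(std::abs(s.real() - 0.6) < 1e-12 && std::abs(s.imag() - 0.8) < 1e-12);
  return 0;
}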
3 changes: 2 additions & 1 deletion third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -59,7 +59,7 @@
#include <curand_kernel.h>
#endif // defined(__CUDACC__)
#else
#include "perftools/gputools/executor/gcuda.h"
#include "platforms/gpus/gcudacc/runtime/gcudacc_runtime.h"
#ifdef __CUDACC__
#include "third_party/gpus/cuda/curand_device/curand_kernel.h"
#endif // defined(__CUDACC__)
@@ -88,6 +88,7 @@
#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"

@@ -80,6 +80,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
return unaryExpr(internal::scalar_opposite_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
sign() const {
return unaryExpr(internal::scalar_sign_op<Scalar>());
}

EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
sqrt() const {
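
A hypothetical usage sketch of the new sign() method added above, evaluated on the CPU; it assumes the unsupported Tensor module is on the include path, and the values are made up.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> t(4);
  t.setValues({-3.5f, 0.0f, 2.0f, 8.0f});

  Eigen::Tensor<float, 1> s = t.sign();                   // elementwise signum
  for (int i = 0; i < 4; ++i) std::cout << s(i) << " ";   // prints: -1 0 1 1
  std::cout << std::endl;
  return 0;
}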
@@ -757,11 +757,17 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) {
}

struct GpuDevice {
GpuDevice()
: stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
allocator_(nullptr),
stream_exec_(stream_->parent()),
device_descr_(&(stream_exec_->GetDeviceDescription())) {}
// Default constructor: Get [cached] device 0 and its default stream.
GpuDevice() : allocator_(nullptr) {
perftools::gputools::Platform* platform =
perftools::gputools::MultiPlatformManager::PlatformWithName("cuda")
.ValueOrDie();
stream_exec_ = platform->ExecutorForDevice(0).ValueOrDie();
// TODO(rspringer): If we ever pull from an executor aside from 0, this will
// need to be preceded by a call to SetDevice(N);
stream_ = platforms::gpus::gcudacc::GetDefaultStream();
device_descr_ = &(stream_exec_->GetDeviceDescription());
}

GpuDevice(perftools::gputools::Stream* stream,
const Allocator* alloc = nullptr)
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
@@ -438,11 +440,13 @@
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const int num_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() /
device.maxCudaThreadsPerBlock();
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);

LAUNCH_CUDA_KERNEL(
(EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
Index>),
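
A small stand-alone sketch of the new launch-configuration arithmetic (the device figures below are made up; the real code uses numext::mini/maxi and the CUDA device queries shown above):

#include <algorithm>
#include <cassert>

int num_blocks_for(long long size, int multiprocessors, int threads_per_mp,
                   int block_size) {
  const long long max_blocks =
      (long long)multiprocessors * threads_per_mp / block_size;
  const long long wanted = (size + block_size - 1) / block_size;  // ceil(size / block_size)
  // Clamp to the occupancy bound, but never launch fewer than one block,
  // so size-0 tensors no longer produce an invalid zero-block launch.
  return (int)std::max<long long>(std::min<long long>(max_blocks, wanted), 1);
}

int main() {
  // Hypothetical device: 16 SMs, 2048 threads per SM, 1024 threads per block.
  assert(num_blocks_for(0, 16, 2048, 1024) == 1);         // empty tensor -> 1 block
  assert(num_blocks_for(5000, 16, 2048, 1024) == 5);      // ceil(5000 / 1024)
  assert(num_blocks_for(1 << 20, 16, 2048, 1024) == 32);  // capped at max_blocks
  return 0;
}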
@@ -59,13 +59,8 @@ namespace {

template <typename T>
struct DividerTraits {
#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
static const int N = sizeof(T) * 8;
#else
typedef uint32_t type;
static const int N = 32;
#endif
};


@@ -78,40 +73,39 @@ namespace {
#endif
}

template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__CUDA_ARCH__)
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
return __umul64hi(a, b);
}
#else
template <typename T>
EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
#elif defined(__SIZEOF_INT128__)
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
#else
EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
return (a * b) >> 32;
return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
#endif
}
#endif

template <int N, typename T>
struct DividerHelper {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
}
};

#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
return static_cast<uint64_t>(result);
#endif
}
};
#endif

}
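
The TensorUInt128 fallback above exists to obtain the high 64 bits of a 64x64-bit product on targets without __umul64hi or __uint128_t. Below is a stand-alone sketch of that computation using 32-bit halves (not the Eigen implementation), checked against __uint128_t where available:

#include <cassert>
#include <cstdint>

// High 64 bits of a*b, computed from 32-bit halves with explicit carry handling.
static uint64_t mul_high_64(uint64_t a, uint64_t b) {
  const uint64_t a_lo = a & 0xffffffffu, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xffffffffu, b_hi = b >> 32;
  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;
  const uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xffffffffu) + lo_hi;  // cannot overflow
  return hi_hi + (hi_lo >> 32) + (cross >> 32);
}

int main() {
  const uint64_t a = 0x123456789abcdef0ull, b = 0xfedcba9876543210ull;
#if defined(__SIZEOF_INT128__)
  const uint64_t ref = (uint64_t)(((__uint128_t)a * (__uint128_t)b) >> 64);
  assert(mul_high_64(a, b) == ref);
#endif
  (void)a; (void)b;
  return 0;
}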


@@ -141,6 +141,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
m_unshuffledInputStrides[NumDims - 1] = 1;
@@ -149,6 +150,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}

@@ -319,14 +321,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[0];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
Expand All @@ -338,6 +340,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
Dimensions m_dimensions;
array<Index, NumDims> m_inverseShuffle;
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_unshuffledInputStrides;
TensorEvaluator<ArgType, Device> m_impl;
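
For context, a usage sketch of the shuffling op whose index mapping the cached divisors accelerate (dimensions are arbitrary; per the Tensor docs, output.dimension(i) == input.dimension(shuffle[i])):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> input(20, 30, 50);
  input.setRandom();

  Eigen::array<Eigen::DenseIndex, 3> shuffle{{1, 2, 0}};
  Eigen::Tensor<float, 3> output = input.shuffle(shuffle);

  // Each coefficient lookup in the evaluator divides the output index by the
  // output strides; TensorIntDivisor turns those divisions into multiplies.
  eigen_assert(output.dimension(0) == 30);
  eigen_assert(output.dimension(1) == 50);
  eigen_assert(output.dimension(2) == 20);
  return 0;
}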