diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 35a3849404d32..c2103d20fd4ba 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -3568,6 +3568,19 @@ kwarg_only: True - double p ]] +[[ + name: _cpu_bernoulli_ + backends: + - CPU + cname: bernoulli + return: self + arguments: + - THTensor* self + - arg: THGenerator* generator + default: nullptr + kwarg_only: True + - double p +]] [[ name: _th_bernoulli types: diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 1186c76a8be1e..4b7781057be9a 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -146,13 +146,7 @@ Tensor& bernoulli_(Tensor& self, const Tensor& p_, Generator* gen) { Tensor& bernoulli_(Tensor& self, double p, Generator* gen) { if (!self.is_cuda()) { - AT_DISPATCH_ALL_TYPES(self.type(), "bernoulli_", [&] { - THGenerator* generator = get_generator(gen); - std::lock_guard lock(generator->mutex); - CPU_tensor_apply1(self, [generator, p](scalar_t& ret_val) { - ret_val = (scalar_t)THRandom_bernoulli(generator, p); - }); - }); + self._cpu_bernoulli_(p, gen); return self; } Tensor probs = self.type().toScalarType(kDouble).tensor({}).fill_(p); diff --git a/aten/src/TH/THGeneral.h.in b/aten/src/TH/THGeneral.h.in index 16b55307f7205..b828737797e81 100644 --- a/aten/src/TH/THGeneral.h.in +++ b/aten/src/TH/THGeneral.h.in @@ -17,6 +17,10 @@ #include #include +#ifdef TH_BLAS_MKL +#include +#endif + #cmakedefine USE_BLAS #cmakedefine USE_LAPACK #cmakedefine BLAS_F2C diff --git a/aten/src/TH/THTensorApply.h b/aten/src/TH/THTensorApply.h index 55730b437a31d..8a14902616e2c 100644 --- a/aten/src/TH/THTensorApply.h +++ b/aten/src/TH/THTensorApply.h @@ -369,7 +369,7 @@ TYPE1 *rp = TENSOR1->storage->data()+TENSOR1->storageOffset; \ TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ ptrdiff_t iter = 0; \ - if(tp != rp) { \ + if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) firstprivate(rp, tp)) \ for (iter = 0; iter < SIZE; iter++) { \ @@ -449,7 +449,7 @@ TYPE2 *tp = TENSOR2->storage->data()+TENSOR2->storageOffset; \ TYPE3 *srcp = TENSOR3->storage->data()+TENSOR3->storageOffset; \ ptrdiff_t iter = 0;\ - if (rp != tp) { \ + if(tp != (TYPE2*)rp) { \ PRAGMA(ivdep) \ PRAGMA( omp parallel for if (SIZE > OMP_THRESHOLD * 10) ) \ for (iter = 0; iter < SIZE; iter++) {\ diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 564c365804695..88f4e3c9daaa9 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -2,6 +2,12 @@ #define TH_GENERIC_FILE "generic/THTensorRandom.cpp" #else +#ifdef _OPENMP +#include +#endif + +#include + #include "THGenerator.hpp" void THTensor_(random)(THTensor *self, THGenerator *_generator) @@ -51,10 +57,93 @@ void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_geometric(_generator, p);); } +#ifdef TH_BLAS_MKL +#define BERNOULLI_OMP 800 +#define TH_OMP_OVERHEAD_THRESHOLD_COPY 20000 + +void iBernoulli_generate_copy(THTensor *self, THGenerator *_generator, const double p) +{ + int64_t seed = THRandom_random(_generator); + int64_t n = THTensor_(nElement)(self); + int contig = THTensor_(isContiguous)(self); + int *tmp = NULL; + THIntTensor* intTensor = NULL; + + if (contig) { +#ifdef TH_REAL_IS_INT + tmp = THIntTensor_data(self); +#else + tmp = (int*)THAlloc(n*sizeof(int)); +#endif + } else { + intTensor = THIntTensor_new(); + THIntTensor_resizeNd(intTensor, self->nDimension, self->size, NULL); + tmp = THIntTensor_data(intTensor); + } + +#ifdef _OPENMP + size_t nthr = !omp_in_parallel() && n >= BERNOULLI_OMP ? omp_get_num_threads() : 1; +#pragma omp parallel num_threads(nthr) firstprivate(nthr) + { + size_t tid = omp_get_thread_num(); + int64_t seg_len_tmp = n / nthr; + int64_t line_index_offset = tid * seg_len_tmp; + int64_t line_seg_len = (tid == nthr - 1)? (n-line_index_offset) : seg_len_tmp; +#else + { + int64_t line_index_offset = 0; + int64_t line_seg_len = n; +#endif + + if (line_seg_len > 0) { + VSLStreamStatePtr stream; + vslNewStream(&stream, VSL_BRNG_MCG31, seed); + vslSkipAheadStream(stream, line_index_offset); + viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, line_seg_len, + tmp + line_index_offset, p); + vslDeleteStream(&stream); + +#ifndef TH_REAL_IS_INT + if (contig) { + real* self_seg = THTensor_(data)(self) + line_index_offset; + int* tmp_seg = tmp + line_index_offset; + THVector_(cvtFromInt)(self_seg, tmp_seg, line_seg_len); + } +#endif + } + } + + if(contig) { +#ifndef TH_REAL_IS_INT + THFree(tmp); +#endif + } else { +#ifdef _OPENMP + TH_TENSOR_APPLY2_OMP(n, 1, 0, int, intTensor, real, self, *self_data = *intTensor_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY) +#else + TH_TENSOR_APPLY2(int, intTensor, real, self, *self_data = *intTensor_data;) +#endif + THIntTensor_free(intTensor); + } + +} + +#endif + void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) { +#ifdef TH_BLAS_MKL + if(cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { + std::lock_guard lock(_generator->mutex); + iBernoulli_generate_copy(self, _generator, p); + } else { + std::lock_guard lock(_generator->mutex); + TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); + } +#else std::lock_guard lock(_generator->mutex); TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); +#endif } void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p) diff --git a/aten/src/TH/generic/THVector.h b/aten/src/TH/generic/THVector.h index 887482c52867f..3a5c5c26fdc4f 100644 --- a/aten/src/TH/generic/THVector.h +++ b/aten/src/TH/generic/THVector.h @@ -19,6 +19,9 @@ TH_API void THVector_(normal_fill)(real *data, struct THGenerator *generator, const real mean, const real stddev); +#ifndef TH_REAL_IS_INT +TH_API void THVector_(cvtFromInt)(real *y, const int *x, const ptrdiff_t n); +#endif #if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) TH_API void THVector_(abs)(real *y, const real *x, const ptrdiff_t n); diff --git a/aten/src/TH/generic/THVectorDefault.cpp b/aten/src/TH/generic/THVectorDefault.cpp index 51652ffa01c8a..c77580761bbb8 100644 --- a/aten/src/TH/generic/THVectorDefault.cpp +++ b/aten/src/TH/generic/THVectorDefault.cpp @@ -130,6 +130,24 @@ void THVector_(divs_DEFAULT)(real *y, const real *x, const real c, const ptrdiff y[i] = x[i] / c; } +#ifndef TH_REAL_IS_INT +void THVector_(cvtFromInt_DEFAULT)(real *y, const int *x, const ptrdiff_t n) +{ + ptrdiff_t i = 0; + + for(; i