From 272de5ebed0254193cd6355bf37dc9c46eac49da Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 30 Mar 2021 16:40:55 +0900 Subject: [PATCH 1/3] fixed compilation error with double type (#17) fixed conflicts with SpecialFunctionsPacketMath (#18) --- EigenRand/Dists/Discrete.h | 6 +- EigenRand/Dists/GammaPoisson.h | 4 +- EigenRand/Dists/NormalExp.h | 3 +- EigenRand/MorePacketMath.h | 757 ++++++++++++++++++++++++++++----- EigenRand/PacketRandomEngine.h | 30 -- TestAccuracy.vcxproj | 2 +- test/accuracy.cpp | 133 ++++++ test/benchmark.cpp | 84 +++- 8 files changed, 878 insertions(+), 141 deletions(-) diff --git a/EigenRand/Dists/Discrete.h b/EigenRand/Dists/Discrete.h index 667054f..27b5788 100644 --- a/EigenRand/Dists/Discrete.h +++ b/EigenRand/Dists/Discrete.h @@ -825,7 +825,7 @@ namespace Eigen fres = ptruncate(padd(pmul(psqrt_tmean, yx), pmean)); auto p1 = pmul(padd(pmul(yx, yx), pset1(1)), pset1(0.9)); - auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma(padd(fres, pset1(1)))), pg1)); + auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma_approx(padd(fres, pset1(1)))), pg1)); auto c1 = pcmple(pset1(0), fres); auto c2 = pcmple(ur.template packetOp(rng), pmul(p1, p2)); @@ -964,8 +964,8 @@ namespace Eigen auto p1 = pmul(pmul(pset1(1.2), psqrt_v), padd(pset1(1), pmul(ys, ys))); auto p2 = pexp( padd(padd(psub( - psub(pg1, plgamma(padd(fres, pset1(1)))), - plgamma(psub(padd(ptrials, pset1(1)), fres)) + psub(pg1, plgamma_approx(padd(fres, pset1(1)))), + plgamma_approx(psub(padd(ptrials, pset1(1)), fres)) ), pmul(fres, plog_small_p)), pmul(psub(ptrials, fres), plog_small_q)) ); diff --git a/EigenRand/Dists/GammaPoisson.h b/EigenRand/Dists/GammaPoisson.h index f157f8d..af4bf4f 100644 --- a/EigenRand/Dists/GammaPoisson.h +++ b/EigenRand/Dists/GammaPoisson.h @@ -81,7 +81,7 @@ namespace Eigen const PacketType ppi = pset1(constant::pi), psqrt_tmean = psqrt(pmul(pset1(2), mean)), plog_mean = plog(mean), - pg1 = psub(pmul(mean, plog_mean), plgamma(padd(mean, pset1(1)))); + pg1 = psub(pmul(mean, plog_mean), plgamma_approx(padd(mean, pset1(1)))); while (1) { PacketType fres, yx, psin, pcos; @@ -90,7 +90,7 @@ namespace Eigen fres = ptruncate(padd(pmul(psqrt_tmean, yx), mean)); auto p1 = pmul(padd(pmul(yx, yx), pset1(1)), pset1(0.9)); - auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma(padd(fres, pset1(1)))), pg1)); + auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma_approx(padd(fres, pset1(1)))), pg1)); auto c1 = pcmple(pset1(0), fres); auto c2 = pcmple(ur.template packetOp(rng), pmul(p1, p2)); diff --git a/EigenRand/Dists/NormalExp.h b/EigenRand/Dists/NormalExp.h index 2bc8218..e31df99 100644 --- a/EigenRand/Dists/NormalExp.h +++ b/EigenRand/Dists/NormalExp.h @@ -238,8 +238,7 @@ namespace Eigen psub(pexp(pmul(plog(u1), pset1(-2 / n))), pset1(1)) )); auto theta = pmul(pset1(2 * constant::pi), u2); - Packet sintheta, costheta; - + //Packet sintheta, costheta; //psincos(theta, sintheta, costheta); return pmul(radius, psin(theta)); } diff --git a/EigenRand/MorePacketMath.h b/EigenRand/MorePacketMath.h index 49403fc..99e1047 100644 --- a/EigenRand/MorePacketMath.h +++ b/EigenRand/MorePacketMath.h @@ -18,6 +18,51 @@ namespace Eigen { namespace internal { + template + struct IsIntPacket : std::false_type {}; + + template + struct IsFloatPacket : std::false_type {}; + + template + struct IsDoublePacket : std::false_type {}; + + template + struct HalfPacket; + +#ifdef EIGEN_VECTORIZE_AVX2 + template<> + struct IsIntPacket : std::true_type {}; + + template<> + struct HalfPacket + { + using type = Packet4i; + }; +#endif +#ifdef EIGEN_VECTORIZE_AVX + template<> + struct IsFloatPacket : std::true_type {}; + + template<> + struct IsDoublePacket : std::true_type {}; +#endif +#ifdef EIGEN_VECTORIZE_SSE2 + template<> + struct IsIntPacket : std::true_type {}; + + template<> + struct IsFloatPacket : std::true_type {}; + + template<> + struct IsDoublePacket : std::true_type {}; + + template<> + struct HalfPacket + { + using type = uint64_t; + }; +#endif template struct reinterpreter { @@ -47,6 +92,15 @@ namespace Eigen template EIGEN_STRONG_INLINE Packet pseti64(uint64_t a); + template + EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a); + template EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b); @@ -65,6 +119,28 @@ namespace Eigen template EIGEN_STRONG_INLINE int pmovemask(const Packet& a); + template + EIGEN_STRONG_INLINE typename std::enable_if< + IsFloatPacket::value, Packet + >::type pext_sign(const Packet& a) + { + using IntPacket = decltype(reinterpret_to_int(a)); + return reinterpret_to_float( + pand(reinterpret_to_int(a), pset1(0x80000000)) + ); + } + + template + EIGEN_STRONG_INLINE typename std::enable_if< + IsDoublePacket::value, Packet + >::type pext_sign(const Packet& a) + { + using IntPacket = decltype(reinterpret_to_int(a)); + return reinterpret_to_double( + pand(reinterpret_to_int(a), pseti64(0x8000000000000000)) + ); + } + template<> EIGEN_STRONG_INLINE uint64_t psll64(const uint64_t& a, int b) { @@ -77,8 +153,136 @@ namespace Eigen return a >> b; } + // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) + template + EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x) + { + auto x_3 = padd(x, pset1(3)); + auto ret = pmul(padd(x_3, pset1(-0.5)), plog(x_3)); + ret = psub(ret, x_3); + ret = padd(ret, pset1(0.9189385332046727)); + ret = padd(ret, pdiv(pset1(1 / 12.), x_3)); + ret = psub(ret, plog(pmul( + pmul(psub(x_3, pset1(1)), psub(x_3, pset1(2))), x))); + return ret; + } + + template + EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket); + + template + EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index); + + template + EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval())); + + template + EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval())); + + template + EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a); + + template + EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b); + + template + EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c); + + template + EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x)) + { + using FloatPacket = decltype(reinterpret_to_float(x)); + + const IntPacket lower = pset1(0x7FFFFF), + upper = pset1(127 << 23); + const FloatPacket one = pset1(1); + + return psub(reinterpret_to_float(por(pand(x, lower), upper)), one); + } + + template + EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x)) + { + using DoublePacket = decltype(reinterpret_to_double(x)); + + const IntPacket lower = pseti64(0xFFFFFFFFFFFFFull), + upper = pseti64(1023ull << 52); + const DoublePacket one = pset1(1); + + return psub(reinterpret_to_double(por(pand(x, lower), upper)), one); + } + + template + struct bit_scalar; + + template<> + struct bit_scalar + { + float to_ur(uint32_t x) + { + union + { + uint32_t u; + float f; + }; + u = (x & 0x7FFFFF) | (127 << 23); + return f - 1.f; + } + + float to_nzur(uint32_t x) + { + return to_ur(x) + std::numeric_limits::epsilon() / 8; + } + }; + + template<> + struct bit_scalar + { + double to_ur(uint64_t x) + { + union + { + uint64_t u; + double f; + }; + u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52); + return f - 1.; + } + + double to_nzur(uint64_t x) + { + return to_ur(x) + std::numeric_limits::epsilon() / 8; + } + }; + + + struct float2 + { + float f[2]; + }; + + EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x) + { + bit_scalar bs; + float2 ret; + ret.f[0] = bs.to_ur(x & 0xFFFFFFFF); + ret.f[1] = bs.to_ur(x >> 32); + return ret; + } + template - EIGEN_STRONG_INLINE void psincos(Packet x, Packet &s, Packet &c) + EIGEN_STRONG_INLINE typename std::enable_if< + IsFloatPacket::value + >::type psincos(Packet x, Packet& s, Packet& c) { Packet xmm1, xmm2, xmm3 = pset1(0), sign_bit_sin, y; using IntPacket = decltype(reinterpret_to_int(x)); @@ -88,9 +292,7 @@ namespace Eigen /* take the absolute value */ x = pabs(x); /* extract the sign bit (upper one) */ - sign_bit_sin = reinterpret_to_float( - pand(reinterpret_to_int(sign_bit_sin), pset1(0x80000000)) - ); + sign_bit_sin = pext_sign(sign_bit_sin); /* scale by 4/Pi */ y = pmul(x, pset1(1.27323954473516)); @@ -175,127 +377,190 @@ namespace Eigen c = pxor(xmm2, sign_bit_cos); } - // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2)) template - EIGEN_STRONG_INLINE Packet plgamma(const Packet& x) + EIGEN_STRONG_INLINE typename std::enable_if< + IsDoublePacket::value + >::type psincos(Packet x, Packet& s, Packet& c) { - auto x_3 = padd(x, pset1(3)); - auto ret = pmul(padd(x_3, pset1(-0.5)), plog(x_3)); - ret = psub(ret, x_3); - ret = padd(ret, pset1(0.9189385332046727)); - ret = padd(ret, pdiv(pset1(1 / 12.), x_3)); - ret = psub(ret, plog(pmul( - pmul(psub(x_3, pset1(1)), psub(x_3, pset1(2))), x))); - return ret; - } + Packet xmm1, xmm2, xmm3 = pset1(0), sign_bit_sin, y; + using IntPacket = decltype(reinterpret_to_int(x)); + IntPacket emm0, emm2, emm4; - template - EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b); + sign_bit_sin = x; + /* take the absolute value */ + x = pabs(x); + /* extract the sign bit (upper one) */ + sign_bit_sin = pext_sign(sign_bit_sin); - template - EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b); + /* scale by 4/Pi */ + y = pmul(x, pset1(1.27323954473516)); - template - EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket); + /* store the integer part of y in emm2 */ + emm2 = pcast64(y); - template - EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = padd64(emm2, pseti64(1)); + emm2 = pand(emm2, pseti64(~1ll)); + y = pcast64(emm2); - template - EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval())); + emm4 = emm2; - template - EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval())); + /* get the swap sign flag for the sine */ + emm0 = pand(emm2, pseti64(4)); + emm0 = psll64(emm0, 61); + Packet swap_sign_bit_sin = reinterpret_to_double(emm0); - template - EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a); + /* get the polynom selection mask for the sine*/ + emm2 = pand(emm2, pseti64(2)); - template - EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b); + emm2 = pcmpeq64(emm2, pseti64(0)); + Packet poly_mask = reinterpret_to_double(emm2); - template - EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c); + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = pset1(-0.78515625); + xmm2 = pset1(-2.4187564849853515625e-4); + xmm3 = pset1(-3.77489497744594108e-8); + xmm1 = pmul(y, xmm1); + xmm2 = pmul(y, xmm2); + xmm3 = pmul(y, xmm3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); - template - EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x)) - { - using FloatPacket = decltype(reinterpret_to_float(x)); + emm4 = psub64(emm4, pseti64(2)); + emm4 = pandnot(emm4, pseti64(4)); + emm4 = psll64(emm4, 61); + Packet sign_bit_cos = reinterpret_to_double(emm4); + sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin); - const IntPacket lower = pset1(0x7FFFFF), - upper = pset1(127 << 23); - const FloatPacket one = pset1(1); - return psub(reinterpret_to_float(por(pand(x, lower), upper)), one); + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + Packet z = pmul(x, x); + y = pset1(2.443315711809948E-005); + + y = pmul(y, z); + y = padd(y, pset1(-1.388731625493765E-003)); + y = pmul(y, z); + y = padd(y, pset1(4.166664568298827E-002)); + y = pmul(y, z); + y = pmul(y, z); + Packet tmp = pmul(z, pset1(0.5)); + y = psub(y, tmp); + y = padd(y, pset1(1)); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + Packet y2 = pset1(-1.9515295891E-4); + y2 = pmul(y2, z); + y2 = padd(y2, pset1(8.3321608736E-3)); + y2 = pmul(y2, z); + y2 = padd(y2, pset1(-1.6666654611E-1)); + y2 = pmul(y2, z); + y2 = pmul(y2, x); + y2 = padd(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + Packet ysin2 = pand(xmm3, y2); + Packet ysin1 = pandnot(xmm3, y); + y2 = psub(y2, ysin2); + y = psub(y, ysin1); + + xmm1 = padd(ysin1, ysin2); + xmm2 = padd(y, y2); + + /* update the sign */ + s = pxor(xmm1, sign_bit_sin); + c = pxor(xmm2, sign_bit_cos); } - template - EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x)) + template + EIGEN_STRONG_INLINE typename std::enable_if< + IsDoublePacket::value, Packet + >::type _psin(Packet x) { - using DoublePacket = decltype(reinterpret_to_double(x)); + Packet xmm1, xmm2, xmm3 = pset1(0), sign_bit_sin, y; + using IntPacket = decltype(reinterpret_to_int(x)); + IntPacket emm0, emm2; - const IntPacket lower = pseti64(0xFFFFFFFFFFFFFull), - upper = pseti64(1023ull << 52); - const DoublePacket one = pset1(1); + sign_bit_sin = x; + /* take the absolute value */ + x = pabs(x); + /* extract the sign bit (upper one) */ + sign_bit_sin = pext_sign(sign_bit_sin); - return psub(reinterpret_to_double(por(pand(x, lower), upper)), one); - } + /* scale by 4/Pi */ + y = pmul(x, pset1(1.27323954473516)); - template - struct bit_scalar; + /* store the integer part of y in emm2 */ + emm2 = pcast64(y); - template<> - struct bit_scalar - { - float to_ur(uint32_t x) - { - union - { - uint32_t u; - float f; - }; - u = (x & 0x7FFFFF) | (127 << 23); - return f - 1.f; - } + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = padd64(emm2, pseti64(1)); + emm2 = pand(emm2, pseti64(~1ll)); + y = pcast64(emm2); - float to_nzur(uint32_t x) - { - return to_ur(x) + std::numeric_limits::epsilon() / 8; - } - }; + /* get the swap sign flag for the sine */ + emm0 = pand(emm2, pseti64(4)); + emm0 = psll64(emm0, 61); + Packet swap_sign_bit_sin = reinterpret_to_double(emm0); - template<> - struct bit_scalar - { - double to_ur(uint64_t x) - { - union - { - uint64_t u; - double f; - }; - u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52); - return f - 1.; - } + /* get the polynom selection mask for the sine*/ + emm2 = pand(emm2, pseti64(2)); - double to_nzur(uint64_t x) - { - return to_ur(x) + std::numeric_limits::epsilon() / 8; - } - }; + emm2 = pcmpeq64(emm2, pseti64(0)); + Packet poly_mask = reinterpret_to_double(emm2); + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = pset1(-0.78515625); + xmm2 = pset1(-2.4187564849853515625e-4); + xmm3 = pset1(-3.77489497744594108e-8); + xmm1 = pmul(y, xmm1); + xmm2 = pmul(y, xmm2); + xmm3 = pmul(y, xmm3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); - struct float2 - { - float f[2]; - }; + sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin); - EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x) - { - bit_scalar bs; - float2 ret; - ret.f[0] = bs.to_ur(x & 0xFFFFFFFF); - ret.f[1] = bs.to_ur(x >> 32); - return ret; + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + Packet z = pmul(x, x); + y = pset1(2.443315711809948E-005); + + y = pmul(y, z); + y = padd(y, pset1(-1.388731625493765E-003)); + y = pmul(y, z); + y = padd(y, pset1(4.166664568298827E-002)); + y = pmul(y, z); + y = pmul(y, z); + Packet tmp = pmul(z, pset1(0.5)); + y = psub(y, tmp); + y = padd(y, pset1(1)); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + Packet y2 = pset1(-1.9515295891E-4); + y2 = pmul(y2, z); + y2 = padd(y2, pset1(8.3321608736E-3)); + y2 = pmul(y2, z); + y2 = padd(y2, pset1(-1.6666654611E-1)); + y2 = pmul(y2, z); + y2 = pmul(y2, x); + y2 = padd(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + Packet ysin2 = pand(xmm3, y2); + Packet ysin1 = pandnot(xmm3, y); + + xmm1 = padd(ysin1, ysin2); + + /* update the sign */ + return pxor(xmm1, sign_bit_sin); } } } @@ -403,6 +668,32 @@ namespace Eigen return _mm256_set1_epi64x(a); } + template<> + EIGEN_STRONG_INLINE Packet8i padd64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi64(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i psub64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi64(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2)); +#endif + } + template<> EIGEN_STRONG_INLINE Packet8i pcmpeq(const Packet8i& a, const Packet8i& b) { @@ -537,6 +828,19 @@ namespace Eigen #endif } + template<> + EIGEN_STRONG_INLINE Packet8i pcmplt64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpgt_epi64(b, a); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2)); +#endif + } + template<> EIGEN_STRONG_INLINE Packet8f pcmplt(const Packet8f& a, const Packet8f& b) { @@ -694,6 +998,122 @@ namespace Eigen u[3] = u[3] * b + c; return _mm256_loadu_si256((__m256i*)u); } + + EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) { + auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000)); + return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000)); + } + + EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) { + x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); + return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000)); + } + + EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) { + x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000)); + return _mm256_sub_epi64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcast64(const Packet4d& a) + { + return double_to_int64(a); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pcast64(const Packet8i& a) + { + return int64_to_double(a); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet4d psin(const Packet4d& x) + { + return _psin(x); + } + + template <> + EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d + plog(const Packet4d& _x) { + Packet4d x = _x; + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + auto inv_mant_mask = _mm256_castsi256_pd(pseti64(~0x7ff0000000000000)); + auto min_norm_pos = _mm256_castsi256_pd(pseti64(0x10000000000000)); + auto minus_inf = _mm256_castsi256_pd(pseti64(0xfff0000000000000)); + + // Polynomial coefficients. + _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375); + + Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater equal is true if x is NaN + Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ); + + // Truncate input values to the minimum positive normal. + x = pmax(x, min_norm_pos); + + Packet4d emm0 = uint64_to_double(_mm256_srli_epi64(_mm256_castpd_si256(x), 52)); + Packet4d e = psub(emm0, pset1(1022)); + + // Set the exponents to -1, i.e. x are in the range [0.5,1). + x = _mm256_and_pd(x, inv_mant_mask); + x = _mm256_or_pd(x, p4d_half); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ); + Packet4d tmp = _mm256_and_pd(x, mask); + x = psub(x, p4d_1); + e = psub(e, _mm256_and_pd(p4d_1, mask)); + x = padd(x, tmp); + + Packet4d x2 = pmul(x, x); + Packet4d x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet4d y, y1, y2; + y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1); + y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4); + y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7); + y = pmadd(y, x, p4d_cephes_log_p2); + y1 = pmadd(y1, x, p4d_cephes_log_p5); + y2 = pmadd(y2, x, p4d_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, p4d_cephes_log_q1); + tmp = pmul(x2, p4d_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4d_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask)); + } } } #endif @@ -790,6 +1210,18 @@ namespace Eigen return _mm_set1_epi64x(a); } + template<> + EIGEN_STRONG_INLINE Packet4i padd64(const Packet4i& a, const Packet4i& b) + { + return _mm_add_epi64(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i psub64(const Packet4i& a, const Packet4i& b) + { + return _mm_sub_epi64(a, b); + } + template<> EIGEN_STRONG_INLINE Packet4i pcmpeq(const Packet4i& a, const Packet4i& b) { @@ -827,6 +1259,19 @@ namespace Eigen return _mm_cmplt_epi32(a, b); } + template<> + EIGEN_STRONG_INLINE Packet4i pcmplt64(const Packet4i& a, const Packet4i& b) + { +#ifdef EIGEN_VECTORIZE_SSE4_2 + return _mm_cmpgt_epi64(b, a); +#else + int64_t u[2], v[2]; + _mm_storeu_si128((__m128i*)u, a); + _mm_storeu_si128((__m128i*)v, b); + return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0); +#endif + } + template<> EIGEN_STRONG_INLINE Packet4f pcmplt(const Packet4f& a, const Packet4f& b) { @@ -1003,6 +1448,122 @@ namespace Eigen u[1] = u[1] * b + c; return _mm_loadu_si128((__m128i*)u); } + + EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) { + x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))); + return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000)); + } + + EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) { + x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))); + return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000)); + } + + EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) { + x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000)); + return _mm_sub_epi64( + _mm_castpd_si128(x), + _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)) + ); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcast64(const Packet2d& a) + { + return double_to_int64(a); + } + + template<> + EIGEN_STRONG_INLINE Packet2d pcast64(const Packet4i& a) + { + return int64_to_double(a); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet2d psin(const Packet2d& x) + { + return _psin(x); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet2d plog(const Packet2d& _x) + { + Packet2d x = _x; + _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + + auto inv_mant_mask = _mm_castsi128_pd(pseti64(~0x7ff0000000000000)); + auto min_norm_pos = _mm_castsi128_pd(pseti64(0x10000000000000)); + auto minus_inf = _mm_castsi128_pd(pseti64(0xfff0000000000000)); + + /* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 + */ + _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375); + + + Packet4i emm0; + + Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); // not greater equal is true if x is NaN + Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd()); + + x = pmax(x, min_norm_pos); /* cut off denormalized stuff */ + emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52); + + /* keep only the fractional part */ + x = _mm_and_pd(x, inv_mant_mask); + x = _mm_or_pd(x, p2d_half); + + Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1(1022)); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF); + Packet2d tmp = pand(x, mask); + x = psub(x, p2d_1); + e = psub(e, pand(p2d_1, mask)); + x = padd(x, tmp); + + Packet2d x2 = pmul(x, x); + Packet2d x3 = pmul(x2, x); + + Packet2d y, y1, y2; + y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1); + y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4); + y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7); + y = pmadd(y, x, p2d_cephes_log_p2); + y1 = pmadd(y1, x, p2d_cephes_log_p5); + y2 = pmadd(y2, x, p2d_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y1 = pmul(e, p2d_cephes_log_q1); + tmp = pmul(x2, p2d_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p2d_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + // negative arg will be NAN, 0 will be -INF + return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask)); + } } } #endif diff --git a/EigenRand/PacketRandomEngine.h b/EigenRand/PacketRandomEngine.h index b333211..63cf320 100644 --- a/EigenRand/PacketRandomEngine.h +++ b/EigenRand/PacketRandomEngine.h @@ -20,36 +20,6 @@ namespace Eigen { - namespace internal - { - template - struct IsIntPacket : std::false_type {}; - - template - struct HalfPacket; - -#ifdef EIGEN_VECTORIZE_AVX2 - template<> - struct IsIntPacket : std::true_type {}; - - template<> - struct HalfPacket - { - using type = Packet4i; - }; -#endif -#ifdef EIGEN_VECTORIZE_SSE2 - template<> - struct IsIntPacket : std::true_type {}; - - template<> - struct HalfPacket - { - using type = uint64_t; - }; -#endif - } - namespace Rand { namespace detail diff --git a/TestAccuracy.vcxproj b/TestAccuracy.vcxproj index 94f1196..a21dced 100644 --- a/TestAccuracy.vcxproj +++ b/TestAccuracy.vcxproj @@ -222,7 +222,7 @@ true USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true - AdvancedVectorExtensions2 + StreamingSIMDExtensions2 Console diff --git a/test/accuracy.cpp b/test/accuracy.cpp index 153fccc..fa450ae 100644 --- a/test/accuracy.cpp +++ b/test/accuracy.cpp @@ -197,48 +197,93 @@ std::map test_eigenrand_cont(size_t size, size_t step, size arr = Eigen::Rand::normalLike(arr, urng); ret["normal"] = calc_emd_with_cdf(arr, normal_cdf, step); + arrd = Eigen::Rand::normalLike(arrd, urng); + ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step); + arr = Eigen::Rand::lognormalLike(arr, urng); ret["lognormal"] = calc_emd_with_cdf(arr, lognormal_cdf, step); + arrd = Eigen::Rand::lognormalLike(arrd, urng); + ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step); + arr = Eigen::Rand::gammaLike(arr, urng, 1, 1); ret["gamma(1,1)"] = calc_emd_with_pdf(arr, gamma11_pdf, step); + arrd = Eigen::Rand::gammaLike(arrd, urng, 1, 1); + ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step); + arr = Eigen::Rand::gammaLike(arr, urng, 5, 1); ret["gamma(5,1)"] = calc_emd_with_pdf(arr, gamma51_pdf, step); + arrd = Eigen::Rand::gammaLike(arrd, urng, 5, 1); + ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step); + arr = Eigen::Rand::gammaLike(arr, urng, 0.2, 1); ret["gamma(0.2,1)"] = calc_emd_with_pdf(arr, gamma21_pdf, step); + arrd = Eigen::Rand::gammaLike(arrd, urng, 0.2, 1); + ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step); + arr = Eigen::Rand::exponentialLike(arr, urng); ret["exponential"] = calc_emd_with_cdf(arr, exp_cdf, step); + arrd = Eigen::Rand::exponentialLike(arrd, urng); + ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step); + arr = Eigen::Rand::weibullLike(arr, urng, 2); ret["weibull(2,1)"] = calc_emd_with_cdf(arr, weibull_cdf, step); + arrd = Eigen::Rand::weibullLike(arrd, urng, 2); + ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step); + arr = Eigen::Rand::extremeValueLike(arr, urng, 1, 1); ret["extremeValue(1,1)"] = calc_emd_with_cdf(arr, extreme_value_cdf, step); + arrd = Eigen::Rand::extremeValueLike(arrd, urng, 1, 1); + ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step); + arr = Eigen::Rand::chiSquaredLike(arr, urng, 7); ret["chiSquared(7)"] = calc_emd_with_pdf(arr, chisquared_pdf, step); + arrd = Eigen::Rand::chiSquaredLike(arrd, urng, 7); + ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step); + arr = Eigen::Rand::cauchyLike(arr, urng); ret["cauchy"] = calc_emd_with_cdf(arr, cauchy_cdf, step); + arrd = Eigen::Rand::cauchyLike(arrd, urng); + ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + arr = Eigen::Rand::studentTLike(arr, urng, 1); ret["studentT(1)"] = calc_emd_with_cdf(arr, cauchy_cdf, step); + arrd = Eigen::Rand::studentTLike(arrd, urng, 1); + ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + arr = Eigen::Rand::studentTLike(arr, urng, 5); ret["studentT(5)"] = calc_emd_with_pdf(arr, student5_pdf, step); + arrd = Eigen::Rand::studentTLike(arrd, urng, 5); + ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step); + arr = Eigen::Rand::studentTLike(arr, urng, 20); ret["studentT(20)"] = calc_emd_with_pdf(arr, student20_pdf, step); + arrd = Eigen::Rand::studentTLike(arrd, urng, 20); + ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step); + arr = Eigen::Rand::fisherFLike(arr, urng, 1, 1); ret["fisherF(1,1)"] = calc_emd_with_cdf(arr, fisher11_cdf, step); + arrd = Eigen::Rand::fisherFLike(arrd, urng, 1, 1); + ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step); + arr = Eigen::Rand::fisherFLike(arr, urng, 5, 5); ret["fisherF(5,5)"] = calc_emd_with_pdf(arr, fisher55_pdf, step); + arrd = Eigen::Rand::fisherFLike(arrd, urng, 5, 5); + ret["fisherF(5,5)/double"] = calc_emd_with_pdf(arrd, fisher55_pdf, step); + return ret; } @@ -322,6 +367,11 @@ std::map test_cpp11_cont(size_t size, size_t step, size_t s } ret["normal"] = calc_emd_with_cdf(arr, normal_cdf, step); + { + std::normal_distribution dist; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step); { std::lognormal_distribution<> dist; @@ -329,6 +379,11 @@ std::map test_cpp11_cont(size_t size, size_t step, size_t s } ret["lognormal"] = calc_emd_with_cdf(arr, lognormal_cdf, step); + { + std::lognormal_distribution dist; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step); { std::gamma_distribution<> dist{ 1, 1 }; @@ -336,77 +391,155 @@ std::map test_cpp11_cont(size_t size, size_t step, size_t s } ret["gamma(1,1)"] = calc_emd_with_pdf(arr, gamma11_pdf, step); + { + std::gamma_distribution dist{ 1, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step); + { std::gamma_distribution<> dist{ 5, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["gamma(5,1)"] = calc_emd_with_pdf(arr, gamma51_pdf, step); + { + std::gamma_distribution dist{ 5, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step); + { std::gamma_distribution<> dist{ 0.2, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["gamma(0.2,1)"] = calc_emd_with_pdf(arr, gamma21_pdf, step); + { + std::gamma_distribution dist{ 0.2, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step); + { std::exponential_distribution<> dist; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["exponential"] = calc_emd_with_cdf(arr, exp_cdf, step); + { + std::exponential_distribution dist; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step); + { std::weibull_distribution<> dist{ 2, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["weibull(2,1)"] = calc_emd_with_cdf(arr, weibull_cdf, step); + { + std::weibull_distribution dist{ 2, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step); + { std::extreme_value_distribution<> dist{ 1, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["extremeValue(1,1)"] = calc_emd_with_cdf(arr, extreme_value_cdf, step); + { + std::extreme_value_distribution dist{ 1, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step); + { std::chi_squared_distribution<> dist{ 7 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["chiSquared(7)"] = calc_emd_with_pdf(arr, chisquared_pdf, step); + { + std::chi_squared_distribution dist{ 7 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step); + { std::cauchy_distribution<> dist{ 0, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["cauchy"] = calc_emd_with_cdf(arr, cauchy_cdf, step); + { + std::cauchy_distribution dist{ 0, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + { std::student_t_distribution<> dist{ 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["studentT(1)"] = calc_emd_with_cdf(arr, cauchy_cdf, step); + { + std::student_t_distribution dist{ 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + { std::student_t_distribution<> dist{ 5 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["studentT(5)"] = calc_emd_with_pdf(arr, student5_pdf, step); + { + std::student_t_distribution dist{ 5 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step); + { std::student_t_distribution<> dist{ 20 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["studentT(20)"] = calc_emd_with_pdf(arr, student20_pdf, step); + { + std::student_t_distribution dist{ 20 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step); + { std::fisher_f_distribution<> dist{ 1, 1 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["fisherF(1,1)"] = calc_emd_with_cdf(arr, fisher11_cdf, step); + { + std::fisher_f_distribution dist{ 1, 1 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step); + { std::fisher_f_distribution<> dist{ 5, 5 }; arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); }); } ret["fisherF(5,5)"] = calc_emd_with_pdf(arr, fisher55_pdf, step); + + { + std::fisher_f_distribution dist{ 5, 5 }; + arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); }); + } + ret["fisherF(5,5)/double"] = calc_emd_with_pdf(arrd, fisher55_pdf, step); return ret; } diff --git a/test/benchmark.cpp b/test/benchmark.cpp index 3e56ac1..d1996de 100644 --- a/test/benchmark.cpp +++ b/test/benchmark.cpp @@ -259,6 +259,11 @@ std::map test_eigenrand(size_t size, const std::string& suf x = Eigen::Rand::normalLike(x, urng); } + { + auto scope = bh.measure("normal(0,1)/double" + suffix, xd); + xd = Eigen::Rand::normalLike(xd, urng); + } + { auto scope = bh.measure("normal(0,1) square" + suffix, x); x = Eigen::Rand::normalLike(x, urng).square(); @@ -269,78 +274,137 @@ std::map test_eigenrand(size_t size, const std::string& suf x = Eigen::Rand::normalLike(x, urng, 2, 3); } + { + auto scope = bh.measure("normal(2,3)/double" + suffix, xd); + xd = Eigen::Rand::normalLike(xd, urng, 2, 3); + } + { auto scope = bh.measure("lognormal(0,1)" + suffix, x); x = Eigen::Rand::lognormalLike(x, urng); } + + { + auto scope = bh.measure("lognormal(0,1)/double" + suffix, xd); + xd = Eigen::Rand::lognormalLike(xd, urng); + } { auto scope = bh.measure("exponential(1)" + suffix, x); x = Eigen::Rand::exponentialLike(x, urng); } + { + auto scope = bh.measure("exponential(1)/double" + suffix, xd); + xd = Eigen::Rand::exponentialLike(xd, urng); + } + { auto scope = bh.measure("gamma(1,2)" + suffix, x); x = Eigen::Rand::gammaLike(x, urng, 1, 2); } + { + auto scope = bh.measure("gamma(1,2)/double" + suffix, xd); + xd = Eigen::Rand::gammaLike(xd, urng, 1, 2); + } + { auto scope = bh.measure("gamma(5,3)" + suffix, x); x = Eigen::Rand::gammaLike(x, urng, 5, 3); } - { auto scope = bh.measure("gamma(5,3)/gen" + suffix, x); Eigen::Rand::GammaGen gen{ 5, 3 }; x = gen.generateLike(x, urng); } - /*{ + { auto scope = bh.measure("gamma(5,3)/double" + suffix, xd); xd = Eigen::Rand::gammaLike(xd, urng, 5, 3); - }*/ + } { auto scope = bh.measure("gamma(0.2,1)" + suffix, x); x = Eigen::Rand::gammaLike(x, urng, 0.2, 1); } + { + auto scope = bh.measure("gamma(0.2,1)/double" + suffix, xd); + xd = Eigen::Rand::gammaLike(xd, urng, 0.2, 1); + } + { auto scope = bh.measure("gamma(10.5,1)" + suffix, x); x = Eigen::Rand::gammaLike(x, urng, 10.5, 1); } + { + auto scope = bh.measure("gamma(10.5,1)/double" + suffix, xd); + xd = Eigen::Rand::gammaLike(xd, urng, 10.5, 1); + } + { auto scope = bh.measure("weibull(2,3)" + suffix, x); x = Eigen::Rand::weibullLike(x, urng, 2, 3); } + { + auto scope = bh.measure("weibull(2,3)/double" + suffix, xd); + xd = Eigen::Rand::weibullLike(xd, urng, 2, 3); + } + { auto scope = bh.measure("extremeValue(0,1)" + suffix, x); x = Eigen::Rand::extremeValueLike(x, urng, 0, 1); } + { + auto scope = bh.measure("extremeValue(0,1)/double" + suffix, xd); + xd = Eigen::Rand::extremeValueLike(xd, urng, 0, 1); + } + { auto scope = bh.measure("chiSquared(15)" + suffix, x); x = Eigen::Rand::chiSquaredLike(x, urng, 15); } + { + auto scope = bh.measure("chiSquared(15)/double" + suffix, xd); + xd = Eigen::Rand::chiSquaredLike(xd, urng, 15); + } + { auto scope = bh.measure("cauchy" + suffix, x); x = Eigen::Rand::cauchyLike(x, urng); } + { + auto scope = bh.measure("cauchy/double" + suffix, xd); + xd = Eigen::Rand::cauchyLike(xd, urng); + } + { auto scope = bh.measure("studentT(1)" + suffix, x); x = Eigen::Rand::studentTLike(x, urng, 1); } + { + auto scope = bh.measure("studentT(1)/double" + suffix, xd); + xd = Eigen::Rand::studentTLike(xd, urng, 1); + } + { auto scope = bh.measure("studentT(5)" + suffix, x); x = Eigen::Rand::studentTLike(x, urng, 5); } + { + auto scope = bh.measure("studentT(5)/double" + suffix, xd); + xd = Eigen::Rand::studentTLike(xd, urng, 5); + } + { auto scope = bh.measure("studentT(20)" + suffix, x); x = Eigen::Rand::studentTLike(x, urng, 20); @@ -351,6 +415,11 @@ std::map test_eigenrand(size_t size, const std::string& suf x = Eigen::Rand::fisherFLike(x, urng, 1, 1); } + { + auto scope = bh.measure("fisherF(1,1)/double" + suffix, xd); + xd = Eigen::Rand::fisherFLike(xd, urng, 1, 1); + } + { auto scope = bh.measure("fisherF(5,1)" + suffix, x); x = Eigen::Rand::fisherFLike(x, urng, 5, 1); @@ -366,6 +435,11 @@ std::map test_eigenrand(size_t size, const std::string& suf x = Eigen::Rand::fisherFLike(x, urng, 5, 5); } + { + auto scope = bh.measure("fisherF(5,5)/double" + suffix, xd); + xd = Eigen::Rand::fisherFLike(xd, urng, 5, 5); + } + { auto scope = bh.measure("poisson(1)" + suffix, xi); xi = Eigen::Rand::poissonLike(xi, urng, 1); @@ -746,7 +820,7 @@ int main(int argc, char** argv) for (size_t i = 0; i < repeat; ++i) { - for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results)) + /*for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results)) { time[p.first] += p.second; timeSq[p.first] += p.second * p.second; @@ -792,7 +866,7 @@ int main(int argc, char** argv) { time[p.first] += p.second; timeSq[p.first] += p.second * p.second; - } + }*/ for (auto& p : test_eigenrand(size, "\t:ERand", results)) { From a335914c4f30b32dfb9343719add81a2cf8e9ad9 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 30 Mar 2021 17:10:14 +0900 Subject: [PATCH 2/3] fixed for avx & doc update --- EigenRand/Core.h | 6 +++--- EigenRand/Dists/Basic.h | 6 +++--- EigenRand/Dists/Discrete.h | 6 +++--- EigenRand/Dists/GammaPoisson.h | 6 +++--- EigenRand/Dists/NormalExp.h | 6 +++--- EigenRand/EigenRand | 6 +++--- EigenRand/Macro.h | 8 ++++---- EigenRand/MorePacketMath.h | 8 ++++---- EigenRand/MvDists/Multinomial.h | 6 +++--- EigenRand/MvDists/MvNormal.h | 6 +++--- EigenRand/PacketFilter.h | 6 +++--- EigenRand/PacketRandomEngine.h | 6 +++--- EigenRand/RandUtils.h | 6 +++--- 13 files changed, 41 insertions(+), 41 deletions(-) diff --git a/EigenRand/Core.h b/EigenRand/Core.h index 100e3d4..dd23126 100644 --- a/EigenRand/Core.h +++ b/EigenRand/Core.h @@ -2,10 +2,10 @@ * @file Core.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/Dists/Basic.h b/EigenRand/Dists/Basic.h index b45af31..ccd609c 100644 --- a/EigenRand/Dists/Basic.h +++ b/EigenRand/Dists/Basic.h @@ -2,10 +2,10 @@ * @file Basic.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/Dists/Discrete.h b/EigenRand/Dists/Discrete.h index 27b5788..599e565 100644 --- a/EigenRand/Dists/Discrete.h +++ b/EigenRand/Dists/Discrete.h @@ -2,10 +2,10 @@ * @file Discrete.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/Dists/GammaPoisson.h b/EigenRand/Dists/GammaPoisson.h index af4bf4f..ea16c5c 100644 --- a/EigenRand/Dists/GammaPoisson.h +++ b/EigenRand/Dists/GammaPoisson.h @@ -2,10 +2,10 @@ * @file GammaPoisson.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/Dists/NormalExp.h b/EigenRand/Dists/NormalExp.h index e31df99..6e90a98 100644 --- a/EigenRand/Dists/NormalExp.h +++ b/EigenRand/Dists/NormalExp.h @@ -2,10 +2,10 @@ * @file NormalExp.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/EigenRand b/EigenRand/EigenRand index 984d9a7..28d218c 100644 --- a/EigenRand/EigenRand +++ b/EigenRand/EigenRand @@ -2,10 +2,10 @@ * @file EigenRand * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/Macro.h b/EigenRand/Macro.h index 356a8e2..a20fcd8 100644 --- a/EigenRand/Macro.h +++ b/EigenRand/Macro.h @@ -2,10 +2,10 @@ * @file Macro.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ @@ -14,7 +14,7 @@ #define EIGENRAND_WORLD_VERSION 0 #define EIGENRAND_MAJOR_VERSION 3 -#define EIGENRAND_MINOR_VERSION 2 +#define EIGENRAND_MINOR_VERSION 3 #if EIGEN_VERSION_AT_LEAST(3,3,7) #else diff --git a/EigenRand/MorePacketMath.h b/EigenRand/MorePacketMath.h index 99e1047..4d422d8 100644 --- a/EigenRand/MorePacketMath.h +++ b/EigenRand/MorePacketMath.h @@ -2,10 +2,10 @@ * @file MorePacketMath.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ @@ -1011,7 +1011,7 @@ namespace Eigen EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) { x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000)); - return _mm256_sub_epi64( + return psub64( _mm256_castpd_si256(x), _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) ); diff --git a/EigenRand/MvDists/Multinomial.h b/EigenRand/MvDists/Multinomial.h index 17de6ec..3840fb9 100644 --- a/EigenRand/MvDists/Multinomial.h +++ b/EigenRand/MvDists/Multinomial.h @@ -2,10 +2,10 @@ * @file Multinomial.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/MvDists/MvNormal.h b/EigenRand/MvDists/MvNormal.h index b26986d..604b512 100644 --- a/EigenRand/MvDists/MvNormal.h +++ b/EigenRand/MvDists/MvNormal.h @@ -2,10 +2,10 @@ * @file MvNormal.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/PacketFilter.h b/EigenRand/PacketFilter.h index 411efb9..692bb31 100644 --- a/EigenRand/PacketFilter.h +++ b/EigenRand/PacketFilter.h @@ -2,10 +2,10 @@ * @file PacketFilter.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/PacketRandomEngine.h b/EigenRand/PacketRandomEngine.h index 63cf320..055f0a7 100644 --- a/EigenRand/PacketRandomEngine.h +++ b/EigenRand/PacketRandomEngine.h @@ -2,10 +2,10 @@ * @file PacketRandomEngine.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ diff --git a/EigenRand/RandUtils.h b/EigenRand/RandUtils.h index e7304b2..155a5a2 100644 --- a/EigenRand/RandUtils.h +++ b/EigenRand/RandUtils.h @@ -2,10 +2,10 @@ * @file RandUtils.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.0 - * @date 2020-10-07 + * @version 0.3.3 + * @date 2021-03-31 * - * @copyright Copyright (c) 2020 + * @copyright Copyright (c) 2020-2021 * */ From da7a7ba1f692c50d10ed41569ee6f6588d1d78ee Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Tue, 30 Mar 2021 17:30:55 +0900 Subject: [PATCH 3/3] fixed for avx --- EigenRand/MorePacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EigenRand/MorePacketMath.h b/EigenRand/MorePacketMath.h index 4d422d8..b16e9bb 100644 --- a/EigenRand/MorePacketMath.h +++ b/EigenRand/MorePacketMath.h @@ -1066,7 +1066,7 @@ namespace Eigen // Truncate input values to the minimum positive normal. x = pmax(x, min_norm_pos); - Packet4d emm0 = uint64_to_double(_mm256_srli_epi64(_mm256_castpd_si256(x), 52)); + Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52)); Packet4d e = psub(emm0, pset1(1022)); // Set the exponents to -1, i.e. x are in the range [0.5,1).