diff --git a/EigenRand/Core.h b/EigenRand/Core.h
index 100e3d4..dd23126 100644
--- a/EigenRand/Core.h
+++ b/EigenRand/Core.h
@@ -2,10 +2,10 @@
  * @file Core.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
diff --git a/EigenRand/Dists/Basic.h b/EigenRand/Dists/Basic.h
index b45af31..ccd609c 100644
--- a/EigenRand/Dists/Basic.h
+++ b/EigenRand/Dists/Basic.h
@@ -2,10 +2,10 @@
  * @file Basic.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
diff --git a/EigenRand/Dists/Discrete.h b/EigenRand/Dists/Discrete.h
index 667054f..599e565 100644
--- a/EigenRand/Dists/Discrete.h
+++ b/EigenRand/Dists/Discrete.h
@@ -2,10 +2,10 @@
  * @file Discrete.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  *
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
@@ -825,7 +825,7 @@ namespace Eigen
 						fres = ptruncate(padd(pmul(psqrt_tmean, yx), pmean));
 
 						auto p1 = pmul(padd(pmul(yx, yx), pset1<PacketType>(1)), pset1<PacketType>(0.9));
-						auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma(padd(fres, pset1<PacketType>(1)))), pg1));
+						auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma_approx(padd(fres, pset1<PacketType>(1)))), pg1));
 
 						auto c1 = pcmple(pset1<PacketType>(0), fres);
 						auto c2 = pcmple(ur.template packetOp<PacketType>(rng), pmul(p1, p2));
@@ -964,8 +964,8 @@ namespace Eigen
 						auto p1 = pmul(pmul(pset1<PacketType>(1.2), psqrt_v), padd(pset1<PacketType>(1), pmul(ys, ys)));
 						auto p2 = pexp(
 							padd(padd(psub(
-								psub(pg1, plgamma(padd(fres, pset1<PacketType>(1)))),
-								plgamma(psub(padd(ptrials, pset1<PacketType>(1)), fres))
+								psub(pg1, plgamma_approx(padd(fres, pset1<PacketType>(1)))),
+								plgamma_approx(psub(padd(ptrials, pset1<PacketType>(1)), fres))
 							), pmul(fres, plog_small_p)), pmul(psub(ptrials, fres), plog_small_q))
 						);
 
diff --git a/EigenRand/Dists/GammaPoisson.h b/EigenRand/Dists/GammaPoisson.h
index f157f8d..ea16c5c 100644
--- a/EigenRand/Dists/GammaPoisson.h
+++ b/EigenRand/Dists/GammaPoisson.h
@@ -2,10 +2,10 @@
  * @file GammaPoisson.h
  * @author bab2min (bab2min@gmail.com)
  * @brief
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  *
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  *
  */
 
@@ -81,7 +81,7 @@ namespace Eigen
 					const PacketType ppi = pset1<PacketType>(constant::pi),
 						psqrt_tmean = psqrt(pmul(pset1<PacketType>(2), mean)),
 						plog_mean = plog(mean),
-						pg1 = psub(pmul(mean, plog_mean), plgamma(padd(mean, pset1<PacketType>(1))));
+						pg1 = psub(pmul(mean, plog_mean), plgamma_approx(padd(mean, pset1<PacketType>(1))));
 					while (1)
 					{
 						PacketType fres, yx, psin, pcos;
@@ -90,7 +90,7 @@ namespace Eigen
 						fres = ptruncate(padd(pmul(psqrt_tmean, yx), mean));
 
 						auto p1 = pmul(padd(pmul(yx, yx), pset1<PacketType>(1)), pset1<PacketType>(0.9));
-						auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma(padd(fres, pset1<PacketType>(1)))), pg1));
+						auto p2 = pexp(psub(psub(pmul(fres, plog_mean), plgamma_approx(padd(fres, pset1<PacketType>(1)))), pg1));
 
 						auto c1 = pcmple(pset1<PacketType>(0), fres);
 						auto c2 = pcmple(ur.template packetOp<PacketType>(rng), pmul(p1, p2));
diff --git a/EigenRand/Dists/NormalExp.h b/EigenRand/Dists/NormalExp.h
index 2bc8218..6e90a98 100644
--- a/EigenRand/Dists/NormalExp.h
+++ b/EigenRand/Dists/NormalExp.h
@@ -2,10 +2,10 @@
  * @file NormalExp.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
@@ -238,8 +238,7 @@ namespace Eigen
 					psub(pexp(pmul(plog(u1), pset1<Packet>(-2 / n))), pset1<Packet>(1))
 				));
 				auto theta = pmul(pset1<Packet>(2 * constant::pi), u2);
-				Packet sintheta, costheta;
-
+				//Packet sintheta, costheta;
 				//psincos(theta, sintheta, costheta);
 				return pmul(radius, psin(theta));
 			}
diff --git a/EigenRand/EigenRand b/EigenRand/EigenRand
index 984d9a7..28d218c 100644
--- a/EigenRand/EigenRand
+++ b/EigenRand/EigenRand
@@ -2,10 +2,10 @@
  * @file EigenRand
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
diff --git a/EigenRand/Macro.h b/EigenRand/Macro.h
index 356a8e2..a20fcd8 100644
--- a/EigenRand/Macro.h
+++ b/EigenRand/Macro.h
@@ -2,10 +2,10 @@
  * @file Macro.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
@@ -14,7 +14,7 @@
 
 #define EIGENRAND_WORLD_VERSION 0
 #define EIGENRAND_MAJOR_VERSION 3
-#define EIGENRAND_MINOR_VERSION 2
+#define EIGENRAND_MINOR_VERSION 3
 
 #if EIGEN_VERSION_AT_LEAST(3,3,7)
 #else
diff --git a/EigenRand/MorePacketMath.h b/EigenRand/MorePacketMath.h
index 49403fc..b16e9bb 100644
--- a/EigenRand/MorePacketMath.h
+++ b/EigenRand/MorePacketMath.h
@@ -2,10 +2,10 @@
  * @file MorePacketMath.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
@@ -18,6 +18,51 @@ namespace Eigen
 {
 	namespace internal
 	{
+		template<typename Ty>
+		struct IsIntPacket : std::false_type {};
+
+		template<typename Ty>
+		struct IsFloatPacket : std::false_type {};
+
+		template<typename Ty>
+		struct IsDoublePacket : std::false_type {};
+
+		template<typename Ty>
+		struct HalfPacket;
+
+#ifdef EIGEN_VECTORIZE_AVX2
+		template<>
+		struct IsIntPacket<Packet8i> : std::true_type {};
+
+		template<>
+		struct HalfPacket<Packet8i>
+		{
+			using type = Packet4i;
+		};
+#endif
+#ifdef EIGEN_VECTORIZE_AVX
+		template<>
+		struct IsFloatPacket<Packet8f> : std::true_type {};
+
+		template<>
+		struct IsDoublePacket<Packet4d> : std::true_type {};
+#endif
+#ifdef EIGEN_VECTORIZE_SSE2
+		template<>
+		struct IsIntPacket<Packet4i> : std::true_type {};
+
+		template<>
+		struct IsFloatPacket<Packet4f> : std::true_type {};
+
+		template<>
+		struct IsDoublePacket<Packet2d> : std::true_type {};
+
+		template<>
+		struct HalfPacket<Packet4i>
+		{
+			using type = uint64_t;
+		};
+#endif
 		template<typename Packet>
 		struct reinterpreter
 		{
@@ -47,6 +92,15 @@ namespace Eigen
 		template<typename Packet>
 		EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
 
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
+
+		template <typename SrcPacket, typename TgtPacket>
+		EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
+
 		template<typename Packet>
 		EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
 
@@ -65,6 +119,28 @@ namespace Eigen
 		template<typename Packet>
 		EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
 
+		template<typename Packet>
+		EIGEN_STRONG_INLINE typename std::enable_if<
+			IsFloatPacket<Packet>::value, Packet
+		>::type pext_sign(const Packet& a)
+		{
+			using IntPacket = decltype(reinterpret_to_int(a));
+			return reinterpret_to_float(
+				pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
+			);
+		}
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE typename std::enable_if<
+			IsDoublePacket<Packet>::value, Packet
+		>::type pext_sign(const Packet& a)
+		{
+			using IntPacket = decltype(reinterpret_to_int(a));
+			return reinterpret_to_double(
+				pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
+			);
+		}
+
 		template<>
 		EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
 		{
@@ -77,8 +153,136 @@ namespace Eigen
 			return a >> b;
 		}
 
+		// approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
+		{
+			auto x_3 = padd(x, pset1<Packet>(3));
+			auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
+			ret = psub(ret, x_3);
+			ret = padd(ret, pset1<Packet>(0.9189385332046727));
+			ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
+			ret = psub(ret, plog(pmul(
+				pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
+			return ret;
+		}
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
+
+		template<typename PacketIf, typename Packet>
+		EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
+
+		template<typename Packet>
+		EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
+
+		template<typename IntPacket>
+		EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x))
+		{
+			using FloatPacket = decltype(reinterpret_to_float(x));
+
+			const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
+				upper = pset1<IntPacket>(127 << 23);
+			const FloatPacket one = pset1<FloatPacket>(1);
+
+			return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
+		}
+
+		template<typename IntPacket>
+		EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x))
+		{
+			using DoublePacket = decltype(reinterpret_to_double(x));
+
+			const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
+				upper = pseti64<IntPacket>(1023ull << 52);
+			const DoublePacket one = pset1<DoublePacket>(1);
+
+			return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
+		}
+
+		template<typename _Scalar>
+		struct bit_scalar;
+
+		template<>
+		struct bit_scalar<float>
+		{
+			float to_ur(uint32_t x)
+			{
+				union
+				{
+					uint32_t u;
+					float f;
+				};
+				u = (x & 0x7FFFFF) | (127 << 23);
+				return f - 1.f;
+			}
+
+			float to_nzur(uint32_t x)
+			{
+				return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
+			}
+		};
+
+		template<>
+		struct bit_scalar<double>
+		{
+			double to_ur(uint64_t x)
+			{
+				union
+				{
+					uint64_t u;
+					double f;
+				};
+				u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
+				return f - 1.;
+			}
+
+			double to_nzur(uint64_t x)
+			{
+				return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
+			}
+		};
+
+
+		struct float2
+		{
+			float f[2];
+		};
+
+		EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
+		{
+			bit_scalar<float> bs;
+			float2 ret;
+			ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
+			ret.f[1] = bs.to_ur(x >> 32);
+			return ret;
+		}
+
 		template<typename Packet>
-		EIGEN_STRONG_INLINE void psincos(Packet x, Packet &s, Packet &c)
+		EIGEN_STRONG_INLINE typename std::enable_if<
+			IsFloatPacket<Packet>::value
+		>::type psincos(Packet x, Packet& s, Packet& c)
 		{
 			Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
 			using IntPacket = decltype(reinterpret_to_int(x));
@@ -88,9 +292,7 @@ namespace Eigen
 			/* take the absolute value */
 			x = pabs(x);
 			/* extract the sign bit (upper one) */
-			sign_bit_sin = reinterpret_to_float(
-				pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
-			);
+			sign_bit_sin = pext_sign(sign_bit_sin);
 
 			/* scale by 4/Pi */
 			y = pmul(x, pset1<Packet>(1.27323954473516));
@@ -175,127 +377,190 @@ namespace Eigen
 			c = pxor(xmm2, sign_bit_cos);
 		}
 
-		// approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
 		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet plgamma(const Packet& x)
+		EIGEN_STRONG_INLINE typename std::enable_if<
+			IsDoublePacket<Packet>::value
+		>::type psincos(Packet x, Packet& s, Packet& c)
 		{
-			auto x_3 = padd(x, pset1<Packet>(3));
-			auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
-			ret = psub(ret, x_3);
-			ret = padd(ret, pset1<Packet>(0.9189385332046727));
-			ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
-			ret = psub(ret, plog(pmul(
-				pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
-			return ret;
-		}
+			Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
+			using IntPacket = decltype(reinterpret_to_int(x));
+			IntPacket emm0, emm2, emm4;
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b);
+			sign_bit_sin = x;
+			/* take the absolute value */
+			x = pabs(x);
+			/* extract the sign bit (upper one) */
+			sign_bit_sin = pext_sign(sign_bit_sin);
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
+			/* scale by 4/Pi */
+			y = pmul(x, pset1<Packet>(1.27323954473516));
 
-		template<typename PacketIf, typename Packet>
-		EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
+			/* store the integer part of y in emm2 */
+			emm2 = pcast64<Packet, IntPacket>(y);
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index);
+			/* j=(j+1) & (~1) (see the cephes sources) */
+			emm2 = padd64(emm2, pseti64<IntPacket>(1));
+			emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
+			y = pcast64<IntPacket, Packet>(emm2);
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
+			emm4 = emm2;
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
+			/* get the swap sign flag for the sine */
+			emm0 = pand(emm2, pseti64<IntPacket>(4));
+			emm0 = psll64(emm0, 61);
+			Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a);
+			/* get the polynom selection mask for the sine*/
+			emm2 = pand(emm2, pseti64<IntPacket>(2));
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
+			emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
+			Packet poly_mask = reinterpret_to_double(emm2);
 
-		template<typename Packet>
-		EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
+			/* The magic pass: "Extended precision modular arithmetic"
+			x = ((x - y * DP1) - y * DP2) - y * DP3; */
+			xmm1 = pset1<Packet>(-0.78515625);
+			xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
+			xmm3 = pset1<Packet>(-3.77489497744594108e-8);
+			xmm1 = pmul(y, xmm1);
+			xmm2 = pmul(y, xmm2);
+			xmm3 = pmul(y, xmm3);
+			x = padd(x, xmm1);
+			x = padd(x, xmm2);
+			x = padd(x, xmm3);
 
-		template<typename IntPacket>
-		EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x))
-		{
-			using FloatPacket = decltype(reinterpret_to_float(x));
+			emm4 = psub64(emm4, pseti64<IntPacket>(2));
+			emm4 = pandnot(emm4, pseti64<IntPacket>(4));
+			emm4 = psll64(emm4, 61);
+			Packet sign_bit_cos = reinterpret_to_double(emm4);
+			sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
 
-			const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
-				upper = pset1<IntPacket>(127 << 23);
-			const FloatPacket one = pset1<FloatPacket>(1);
 
-			return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
+			/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+			Packet z = pmul(x, x);
+			y = pset1<Packet>(2.443315711809948E-005);
+
+			y = pmul(y, z);
+			y = padd(y, pset1<Packet>(-1.388731625493765E-003));
+			y = pmul(y, z);
+			y = padd(y, pset1<Packet>(4.166664568298827E-002));
+			y = pmul(y, z);
+			y = pmul(y, z);
+			Packet tmp = pmul(z, pset1<Packet>(0.5));
+			y = psub(y, tmp);
+			y = padd(y, pset1<Packet>(1));
+
+			/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+			Packet y2 = pset1<Packet>(-1.9515295891E-4);
+			y2 = pmul(y2, z);
+			y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
+			y2 = pmul(y2, z);
+			y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
+			y2 = pmul(y2, z);
+			y2 = pmul(y2, x);
+			y2 = padd(y2, x);
+
+			/* select the correct result from the two polynoms */
+			xmm3 = poly_mask;
+			Packet ysin2 = pand(xmm3, y2);
+			Packet ysin1 = pandnot(xmm3, y);
+			y2 = psub(y2, ysin2);
+			y = psub(y, ysin1);
+
+			xmm1 = padd(ysin1, ysin2);
+			xmm2 = padd(y, y2);
+
+			/* update the sign */
+			s = pxor(xmm1, sign_bit_sin);
+			c = pxor(xmm2, sign_bit_cos);
 		}
 
-		template<typename IntPacket>
-		EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x))
+		template<typename Packet>
+		EIGEN_STRONG_INLINE typename std::enable_if<
+			IsDoublePacket<Packet>::value, Packet
+		>::type _psin(Packet x)
 		{
-			using DoublePacket = decltype(reinterpret_to_double(x));
+			Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
+			using IntPacket = decltype(reinterpret_to_int(x));
+			IntPacket emm0, emm2;
 
-			const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
-				upper = pseti64<IntPacket>(1023ull << 52);
-			const DoublePacket one = pset1<DoublePacket>(1);
+			sign_bit_sin = x;
+			/* take the absolute value */
+			x = pabs(x);
+			/* extract the sign bit (upper one) */
+			sign_bit_sin = pext_sign(sign_bit_sin);
 
-			return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
-		}
+			/* scale by 4/Pi */
+			y = pmul(x, pset1<Packet>(1.27323954473516));
 
-		template<typename _Scalar>
-		struct bit_scalar;
+			/* store the integer part of y in emm2 */
+			emm2 = pcast64<Packet, IntPacket>(y);
 
-		template<>
-		struct bit_scalar<float>
-		{
-			float to_ur(uint32_t x)
-			{
-				union
-				{
-					uint32_t u;
-					float f;
-				};
-				u = (x & 0x7FFFFF) | (127 << 23);
-				return f - 1.f;
-			}
+			/* j=(j+1) & (~1) (see the cephes sources) */
+			emm2 = padd64(emm2, pseti64<IntPacket>(1));
+			emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
+			y = pcast64<IntPacket, Packet>(emm2);
 
-			float to_nzur(uint32_t x)
-			{
-				return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
-			}
-		};
+			/* get the swap sign flag for the sine */
+			emm0 = pand(emm2, pseti64<IntPacket>(4));
+			emm0 = psll64(emm0, 61);
+			Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
 
-		template<>
-		struct bit_scalar<double>
-		{
-			double to_ur(uint64_t x)
-			{
-				union
-				{
-					uint64_t u;
-					double f;
-				};
-				u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
-				return f - 1.;
-			}
+			/* get the polynom selection mask for the sine*/
+			emm2 = pand(emm2, pseti64<IntPacket>(2));
 
-			double to_nzur(uint64_t x)
-			{
-				return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
-			}
-		};
+			emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
+			Packet poly_mask = reinterpret_to_double(emm2);
 
+			/* The magic pass: "Extended precision modular arithmetic"
+			x = ((x - y * DP1) - y * DP2) - y * DP3; */
+			xmm1 = pset1<Packet>(-0.78515625);
+			xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
+			xmm3 = pset1<Packet>(-3.77489497744594108e-8);
+			xmm1 = pmul(y, xmm1);
+			xmm2 = pmul(y, xmm2);
+			xmm3 = pmul(y, xmm3);
+			x = padd(x, xmm1);
+			x = padd(x, xmm2);
+			x = padd(x, xmm3);
 
-		struct float2
-		{
-			float f[2];
-		};
+			sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
 
-		EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
-		{
-			bit_scalar<float> bs;
-			float2 ret;
-			ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
-			ret.f[1] = bs.to_ur(x >> 32);
-			return ret;
+
+			/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+			Packet z = pmul(x, x);
+			y = pset1<Packet>(2.443315711809948E-005);
+
+			y = pmul(y, z);
+			y = padd(y, pset1<Packet>(-1.388731625493765E-003));
+			y = pmul(y, z);
+			y = padd(y, pset1<Packet>(4.166664568298827E-002));
+			y = pmul(y, z);
+			y = pmul(y, z);
+			Packet tmp = pmul(z, pset1<Packet>(0.5));
+			y = psub(y, tmp);
+			y = padd(y, pset1<Packet>(1));
+
+			/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+			Packet y2 = pset1<Packet>(-1.9515295891E-4);
+			y2 = pmul(y2, z);
+			y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
+			y2 = pmul(y2, z);
+			y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
+			y2 = pmul(y2, z);
+			y2 = pmul(y2, x);
+			y2 = padd(y2, x);
+
+			/* select the correct result from the two polynoms */
+			xmm3 = poly_mask;
+			Packet ysin2 = pand(xmm3, y2);
+			Packet ysin1 = pandnot(xmm3, y);
+
+			xmm1 = padd(ysin1, ysin2);
+
+			/* update the sign */
+			return pxor(xmm1, sign_bit_sin);
 		}
 	}
 }
@@ -403,6 +668,32 @@ namespace Eigen
 			return _mm256_set1_epi64x(a);
 		}
 
+		template<>
+		EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(const Packet8i& a, const Packet8i& b)
+		{
+#ifdef EIGEN_VECTORIZE_AVX2
+			return _mm256_add_epi64(a, b);
+#else
+			Packet4i a1, a2, b1, b2;
+			split_two(a, a1, a2);
+			split_two(b, b1, b2);
+			return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
+#endif
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(const Packet8i& a, const Packet8i& b)
+		{
+#ifdef EIGEN_VECTORIZE_AVX2
+			return _mm256_sub_epi64(a, b);
+#else
+			Packet4i a1, a2, b1, b2;
+			split_two(a, a1, a2);
+			split_two(b, b1, b2);
+			return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
+#endif
+		}
+
 		template<>
 		EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
 		{
@@ -537,6 +828,19 @@ namespace Eigen
 #endif
 		}
 
+		template<>
+		EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(const Packet8i& a, const Packet8i& b)
+		{
+#ifdef EIGEN_VECTORIZE_AVX2
+			return _mm256_cmpgt_epi64(b, a);
+#else
+			Packet4i a1, a2, b1, b2;
+			split_two(a, a1, a2);
+			split_two(b, b1, b2);
+			return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
+#endif
+		}
+
 		template<>
 		EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
 		{
@@ -694,6 +998,122 @@ namespace Eigen
 			u[3] = u[3] * b + c;
 			return _mm256_loadu_si256((__m256i*)u);
 		}
+
+		EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
+			auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
+			return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
+		}
+
+		EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
+			x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
+			return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
+		}
+
+		EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
+			x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000));
+			return psub64(
+				_mm256_castpd_si256(x),
+				_mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
+			);
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(const Packet4d& a)
+		{
+			return double_to_int64(a);
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(const Packet8i& a)
+		{
+			return int64_to_double(a);
+		}
+
+		template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+			Packet4d psin<Packet4d>(const Packet4d& x)
+		{
+			return _psin(x);
+		}
+
+		template <>
+		EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
+			plog<Packet4d>(const Packet4d& _x) {
+			Packet4d x = _x;
+			_EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+			_EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+			auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
+			auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
+			auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));
+
+			// Polynomial coefficients.
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
+			_EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);
+
+			Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
+			Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
+
+			// Truncate input values to the minimum positive normal.
+			x = pmax(x, min_norm_pos);
+
+			Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52));
+			Packet4d e = psub(emm0, pset1<Packet4d>(1022));
+
+			// Set the exponents to -1, i.e. x are in the range [0.5,1).
+			x = _mm256_and_pd(x, inv_mant_mask);
+			x = _mm256_or_pd(x, p4d_half);
+
+			// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+			// and shift by -1. The values are then centered around 0, which improves
+			// the stability of the polynomial evaluation.
+			//   if( x < SQRTHF ) {
+			//     e -= 1;
+			//     x = x + x - 1.0;
+			//   } else { x = x - 1.0; }
+			Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
+			Packet4d tmp = _mm256_and_pd(x, mask);
+			x = psub(x, p4d_1);
+			e = psub(e, _mm256_and_pd(p4d_1, mask));
+			x = padd(x, tmp);
+
+			Packet4d x2 = pmul(x, x);
+			Packet4d x3 = pmul(x2, x);
+
+			// Evaluate the polynomial approximant of degree 8 in three parts, probably
+			// to improve instruction-level parallelism.
+			Packet4d y, y1, y2;
+			y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
+			y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
+			y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
+			y = pmadd(y, x, p4d_cephes_log_p2);
+			y1 = pmadd(y1, x, p4d_cephes_log_p5);
+			y2 = pmadd(y2, x, p4d_cephes_log_p8);
+			y = pmadd(y, x3, y1);
+			y = pmadd(y, x3, y2);
+			y = pmul(y, x3);
+
+			// Add the logarithm of the exponent back to the result of the interpolation.
+			y1 = pmul(e, p4d_cephes_log_q1);
+			tmp = pmul(x2, p4d_half);
+			y = padd(y, y1);
+			x = psub(x, tmp);
+			y2 = pmul(e, p4d_cephes_log_q2);
+			x = padd(x, y);
+			x = padd(x, y2);
+
+			// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+			return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
+		}
 	}
 }
 #endif
@@ -790,6 +1210,18 @@ namespace Eigen
 			return _mm_set1_epi64x(a);
 		}
 
+		template<>
+		EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
+		{
+			return _mm_add_epi64(a, b);
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
+		{
+			return _mm_sub_epi64(a, b);
+		}
+
 		template<>
 		EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
 		{
@@ -827,6 +1259,19 @@ namespace Eigen
 			return _mm_cmplt_epi32(a, b);
 		}
 
+		template<>
+		EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(const Packet4i& a, const Packet4i& b)
+		{
+#ifdef EIGEN_VECTORIZE_SSE4_2
+			return _mm_cmpgt_epi64(b, a);
+#else
+			int64_t u[2], v[2];
+			_mm_storeu_si128((__m128i*)u, a);
+			_mm_storeu_si128((__m128i*)v, b);
+			return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
+#endif
+		}
+
 		template<>
 		EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
 		{
@@ -1003,6 +1448,122 @@ namespace Eigen
 			u[1] = u[1] * b + c;
 			return _mm_loadu_si128((__m128i*)u);
 		}
+
+		EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
+			x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
+			return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
+		}
+
+		EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
+			x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
+			return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
+		}
+
+		EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
+			x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
+			return _mm_sub_epi64(
+				_mm_castpd_si128(x),
+				_mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
+			);
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
+		{
+			return double_to_int64(a);
+		}
+
+		template<>
+		EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
+		{
+			return int64_to_double(a);
+		}
+
+		template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+			Packet2d psin<Packet2d>(const Packet2d& x)
+		{
+			return _psin(x);
+		}
+
+		template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+			Packet2d plog<Packet2d>(const Packet2d& _x)
+		{
+			Packet2d x = _x;
+			_EIGEN_DECLARE_CONST_Packet2d(1, 1.0f);
+			_EIGEN_DECLARE_CONST_Packet2d(half, 0.5f);
+			_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+			auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
+			auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
+			auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));
+
+			/* natural logarithm computed for 4 simultaneous float
+			  return NaN for x <= 0
+			*/
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
+			_EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);
+
+
+			Packet4i emm0;
+
+			Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); // not greater equal is true if x is NaN
+			Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
+
+			x = pmax(x, min_norm_pos);  /* cut off denormalized stuff */
+			emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52);
+
+			/* keep only the fractional part */
+			x = _mm_and_pd(x, inv_mant_mask);
+			x = _mm_or_pd(x, p2d_half);
+
+			Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));
+
+			/* part2:
+			   if( x < SQRTHF ) {
+				 e -= 1;
+				 x = x + x - 1.0;
+			   } else { x = x - 1.0; }
+			*/
+			Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
+			Packet2d tmp = pand(x, mask);
+			x = psub(x, p2d_1);
+			e = psub(e, pand(p2d_1, mask));
+			x = padd(x, tmp);
+
+			Packet2d x2 = pmul(x, x);
+			Packet2d x3 = pmul(x2, x);
+
+			Packet2d y, y1, y2;
+			y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
+			y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
+			y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
+			y = pmadd(y, x, p2d_cephes_log_p2);
+			y1 = pmadd(y1, x, p2d_cephes_log_p5);
+			y2 = pmadd(y2, x, p2d_cephes_log_p8);
+			y = pmadd(y, x3, y1);
+			y = pmadd(y, x3, y2);
+			y = pmul(y, x3);
+
+			y1 = pmul(e, p2d_cephes_log_q1);
+			tmp = pmul(x2, p2d_half);
+			y = padd(y, y1);
+			x = psub(x, tmp);
+			y2 = pmul(e, p2d_cephes_log_q2);
+			x = padd(x, y);
+			x = padd(x, y2);
+			// negative arg will be NAN, 0 will be -INF
+			return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));
+		}
 	}
 }
 #endif
diff --git a/EigenRand/MvDists/Multinomial.h b/EigenRand/MvDists/Multinomial.h
index 17de6ec..3840fb9 100644
--- a/EigenRand/MvDists/Multinomial.h
+++ b/EigenRand/MvDists/Multinomial.h
@@ -2,10 +2,10 @@
  * @file Multinomial.h
  * @author bab2min (bab2min@gmail.com)
  * @brief
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  *
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  *
  */
 
diff --git a/EigenRand/MvDists/MvNormal.h b/EigenRand/MvDists/MvNormal.h
index b26986d..604b512 100644
--- a/EigenRand/MvDists/MvNormal.h
+++ b/EigenRand/MvDists/MvNormal.h
@@ -2,10 +2,10 @@
  * @file MvNormal.h
  * @author bab2min (bab2min@gmail.com)
  * @brief
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  *
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  *
  */
 
diff --git a/EigenRand/PacketFilter.h b/EigenRand/PacketFilter.h
index 411efb9..692bb31 100644
--- a/EigenRand/PacketFilter.h
+++ b/EigenRand/PacketFilter.h
@@ -2,10 +2,10 @@
  * @file PacketFilter.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
diff --git a/EigenRand/PacketRandomEngine.h b/EigenRand/PacketRandomEngine.h
index b333211..055f0a7 100644
--- a/EigenRand/PacketRandomEngine.h
+++ b/EigenRand/PacketRandomEngine.h
@@ -2,10 +2,10 @@
  * @file PacketRandomEngine.h
  * @author bab2min (bab2min@gmail.com)
  * @brief
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  *
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  *
  */
 
@@ -20,36 +20,6 @@
 
 namespace Eigen
 {
-	namespace internal
-	{
-		template<typename Ty>
-		struct IsIntPacket : std::false_type {};
-
-		template<typename Ty>
-		struct HalfPacket;
-
-#ifdef EIGEN_VECTORIZE_AVX2
-		template<>
-		struct IsIntPacket<Packet8i> : std::true_type {};
-
-		template<>
-		struct HalfPacket<Packet8i>
-		{
-			using type = Packet4i;
-		};
-#endif
-#ifdef EIGEN_VECTORIZE_SSE2
-		template<>
-		struct IsIntPacket<Packet4i> : std::true_type {};
-
-		template<>
-		struct HalfPacket<Packet4i>
-		{
-			using type = uint64_t;
-		};
-#endif
-	}
-
 	namespace Rand
 	{
 		namespace detail
diff --git a/EigenRand/RandUtils.h b/EigenRand/RandUtils.h
index e7304b2..155a5a2 100644
--- a/EigenRand/RandUtils.h
+++ b/EigenRand/RandUtils.h
@@ -2,10 +2,10 @@
  * @file RandUtils.h
  * @author bab2min (bab2min@gmail.com)
  * @brief 
- * @version 0.3.0
- * @date 2020-10-07
+ * @version 0.3.3
+ * @date 2021-03-31
  * 
- * @copyright Copyright (c) 2020
+ * @copyright Copyright (c) 2020-2021
  * 
  */
 
diff --git a/TestAccuracy.vcxproj b/TestAccuracy.vcxproj
index 94f1196..a21dced 100644
--- a/TestAccuracy.vcxproj
+++ b/TestAccuracy.vcxproj
@@ -222,7 +222,7 @@
       <SDLCheck>true</SDLCheck>
       <PreprocessorDefinitions>USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
-      <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
diff --git a/test/accuracy.cpp b/test/accuracy.cpp
index 153fccc..fa450ae 100644
--- a/test/accuracy.cpp
+++ b/test/accuracy.cpp
@@ -197,48 +197,93 @@ std::map<std::string, double> test_eigenrand_cont(size_t size, size_t step, size
 	arr = Eigen::Rand::normalLike(arr, urng);
 	ret["normal"] = calc_emd_with_cdf(arr, normal_cdf, step);
 
+	arrd = Eigen::Rand::normalLike(arrd, urng);
+	ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step);
+
 	arr = Eigen::Rand::lognormalLike(arr, urng);
 	ret["lognormal"] = calc_emd_with_cdf(arr, lognormal_cdf, step);
 
+	arrd = Eigen::Rand::lognormalLike(arrd, urng);
+	ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step);
+
 	arr = Eigen::Rand::gammaLike(arr, urng, 1, 1);
 	ret["gamma(1,1)"] = calc_emd_with_pdf(arr, gamma11_pdf, step);
 
+	arrd = Eigen::Rand::gammaLike(arrd, urng, 1, 1);
+	ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step);
+
 	arr = Eigen::Rand::gammaLike(arr, urng, 5, 1);
 	ret["gamma(5,1)"] = calc_emd_with_pdf(arr, gamma51_pdf, step);
 
+	arrd = Eigen::Rand::gammaLike(arrd, urng, 5, 1);
+	ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step);
+
 	arr = Eigen::Rand::gammaLike(arr, urng, 0.2, 1);
 	ret["gamma(0.2,1)"] = calc_emd_with_pdf(arr, gamma21_pdf, step);
 
+	arrd = Eigen::Rand::gammaLike(arrd, urng, 0.2, 1);
+	ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step);
+
 	arr = Eigen::Rand::exponentialLike(arr, urng);
 	ret["exponential"] = calc_emd_with_cdf(arr, exp_cdf, step);
 
+	arrd = Eigen::Rand::exponentialLike(arrd, urng);
+	ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step);
+
 	arr = Eigen::Rand::weibullLike(arr, urng, 2);
 	ret["weibull(2,1)"] = calc_emd_with_cdf(arr, weibull_cdf, step);
 
+	arrd = Eigen::Rand::weibullLike(arrd, urng, 2);
+	ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step);
+
 	arr = Eigen::Rand::extremeValueLike(arr, urng, 1, 1);
 	ret["extremeValue(1,1)"] = calc_emd_with_cdf(arr, extreme_value_cdf, step);
 
+	arrd = Eigen::Rand::extremeValueLike(arrd, urng, 1, 1);
+	ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step);
+
 	arr = Eigen::Rand::chiSquaredLike(arr, urng, 7);
 	ret["chiSquared(7)"] = calc_emd_with_pdf(arr, chisquared_pdf, step);
 
+	arrd = Eigen::Rand::chiSquaredLike(arrd, urng, 7);
+	ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step);
+
 	arr = Eigen::Rand::cauchyLike(arr, urng);
 	ret["cauchy"] = calc_emd_with_cdf(arr, cauchy_cdf, step);
 
+	arrd = Eigen::Rand::cauchyLike(arrd, urng);
+	ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step);
+
 	arr = Eigen::Rand::studentTLike(arr, urng, 1);
 	ret["studentT(1)"] = calc_emd_with_cdf(arr, cauchy_cdf, step);
 
+	arrd = Eigen::Rand::studentTLike(arrd, urng, 1);
+	ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step);
+
 	arr = Eigen::Rand::studentTLike(arr, urng, 5);
 	ret["studentT(5)"] = calc_emd_with_pdf(arr, student5_pdf, step);
 
+	arrd = Eigen::Rand::studentTLike(arrd, urng, 5);
+	ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step);
+
 	arr = Eigen::Rand::studentTLike(arr, urng, 20);
 	ret["studentT(20)"] = calc_emd_with_pdf(arr, student20_pdf, step);
 
+	arrd = Eigen::Rand::studentTLike(arrd, urng, 20);
+	ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step);
+
 	arr = Eigen::Rand::fisherFLike(arr, urng, 1, 1);
 	ret["fisherF(1,1)"] = calc_emd_with_cdf(arr, fisher11_cdf, step);
 
+	arrd = Eigen::Rand::fisherFLike(arrd, urng, 1, 1);
+	ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step);
+
 	arr = Eigen::Rand::fisherFLike(arr, urng, 5, 5);
 	ret["fisherF(5,5)"] = calc_emd_with_pdf(arr, fisher55_pdf, step);
 
+	arrd = Eigen::Rand::fisherFLike(arrd, urng, 5, 5);
+	ret["fisherF(5,5)/double"] = calc_emd_with_pdf(arrd, fisher55_pdf, step);
+
 	return ret;
 }
 
@@ -322,6 +367,11 @@ std::map<std::string, double> test_cpp11_cont(size_t size, size_t step, size_t s
 	}
 	ret["normal"] = calc_emd_with_cdf(arr, normal_cdf, step);
 
+	{
+		std::normal_distribution<double> dist;
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step);
 
 	{
 		std::lognormal_distribution<> dist;
@@ -329,6 +379,11 @@ std::map<std::string, double> test_cpp11_cont(size_t size, size_t step, size_t s
 	}
 	ret["lognormal"] = calc_emd_with_cdf(arr, lognormal_cdf, step);
 
+	{
+		std::lognormal_distribution<double> dist;
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step);
 
 	{
 		std::gamma_distribution<> dist{ 1, 1 };
@@ -336,77 +391,155 @@ std::map<std::string, double> test_cpp11_cont(size_t size, size_t step, size_t s
 	}
 	ret["gamma(1,1)"] = calc_emd_with_pdf(arr, gamma11_pdf, step);
 
+	{
+		std::gamma_distribution<double> dist{ 1, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step);
+
 	{
 		std::gamma_distribution<> dist{ 5, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["gamma(5,1)"] = calc_emd_with_pdf(arr, gamma51_pdf, step);
 
+	{
+		std::gamma_distribution<double> dist{ 5, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step);
+
 	{
 		std::gamma_distribution<> dist{ 0.2, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["gamma(0.2,1)"] = calc_emd_with_pdf(arr, gamma21_pdf, step);
 
+	{
+		std::gamma_distribution<double> dist{ 0.2, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step);
+
 	{
 		std::exponential_distribution<> dist;
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["exponential"] = calc_emd_with_cdf(arr, exp_cdf, step);
 
+	{
+		std::exponential_distribution<double> dist;
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step);
+
 	{
 		std::weibull_distribution<> dist{ 2, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["weibull(2,1)"] = calc_emd_with_cdf(arr, weibull_cdf, step);
 
+	{
+		std::weibull_distribution<double> dist{ 2, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step);
+
 	{
 		std::extreme_value_distribution<> dist{ 1, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["extremeValue(1,1)"] = calc_emd_with_cdf(arr, extreme_value_cdf, step);
 
+	{
+		std::extreme_value_distribution<double> dist{ 1, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step);
+
 	{
 		std::chi_squared_distribution<> dist{ 7 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["chiSquared(7)"] = calc_emd_with_pdf(arr, chisquared_pdf, step);
 
+	{
+		std::chi_squared_distribution<double> dist{ 7 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step);
+
 	{
 		std::cauchy_distribution<> dist{ 0, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["cauchy"] = calc_emd_with_cdf(arr, cauchy_cdf, step);
 
+	{
+		std::cauchy_distribution<double> dist{ 0, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step);
+
 	{
 		std::student_t_distribution<> dist{ 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["studentT(1)"] = calc_emd_with_cdf(arr, cauchy_cdf, step);
 
+	{
+		std::student_t_distribution<double> dist{ 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step);
+
 	{
 		std::student_t_distribution<> dist{ 5 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["studentT(5)"] = calc_emd_with_pdf(arr, student5_pdf, step);
 
+	{
+		std::student_t_distribution<double> dist{ 5 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step);
+
 	{
 		std::student_t_distribution<> dist{ 20 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["studentT(20)"] = calc_emd_with_pdf(arr, student20_pdf, step);
 
+	{
+		std::student_t_distribution<double> dist{ 20 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step);
+
 	{
 		std::fisher_f_distribution<> dist{ 1, 1 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["fisherF(1,1)"] = calc_emd_with_cdf(arr, fisher11_cdf, step);
 
+	{
+		std::fisher_f_distribution<double> dist{ 1, 1 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step);
+
 	{
 		std::fisher_f_distribution<> dist{ 5, 5 };
 		arr = Eigen::ArrayXf::NullaryExpr(size, [&]() { return dist(urng); });
 	}
 	ret["fisherF(5,5)"] = calc_emd_with_pdf(arr, fisher55_pdf, step);
+
+	{
+		std::fisher_f_distribution<double> dist{ 5, 5 };
+		arrd = Eigen::ArrayXd::NullaryExpr(size, [&]() { return dist(urng); });
+	}
+	ret["fisherF(5,5)/double"] = calc_emd_with_pdf(arrd, fisher55_pdf, step);
 	return ret;
 }
 
diff --git a/test/benchmark.cpp b/test/benchmark.cpp
index 3e56ac1..d1996de 100644
--- a/test/benchmark.cpp
+++ b/test/benchmark.cpp
@@ -259,6 +259,11 @@ std::map<std::string, double> test_eigenrand(size_t size, const std::string& suf
 		x = Eigen::Rand::normalLike(x, urng);
 	}
 
+	{
+		auto scope = bh.measure("normal(0,1)/double" + suffix, xd);
+		xd = Eigen::Rand::normalLike(xd, urng);
+	}
+
 	{
 		auto scope = bh.measure("normal(0,1) square" + suffix, x);
 		x = Eigen::Rand::normalLike(x, urng).square();
@@ -269,78 +274,137 @@ std::map<std::string, double> test_eigenrand(size_t size, const std::string& suf
 		x = Eigen::Rand::normalLike(x, urng, 2, 3);
 	}
 
+	{
+		auto scope = bh.measure("normal(2,3)/double" + suffix, xd);
+		xd = Eigen::Rand::normalLike(xd, urng, 2, 3);
+	}
+
 	{
 		auto scope = bh.measure("lognormal(0,1)" + suffix, x);
 		x = Eigen::Rand::lognormalLike(x, urng);
 	}
+	
+	{
+		auto scope = bh.measure("lognormal(0,1)/double" + suffix, xd);
+		xd = Eigen::Rand::lognormalLike(xd, urng);
+	}
 
 	{
 		auto scope = bh.measure("exponential(1)" + suffix, x);
 		x = Eigen::Rand::exponentialLike(x, urng);
 	}
 
+	{
+		auto scope = bh.measure("exponential(1)/double" + suffix, xd);
+		xd = Eigen::Rand::exponentialLike(xd, urng);
+	}
+
 	{
 		auto scope = bh.measure("gamma(1,2)" + suffix, x);
 		x = Eigen::Rand::gammaLike(x, urng, 1, 2);
 	}
 
+	{
+		auto scope = bh.measure("gamma(1,2)/double" + suffix, xd);
+		xd = Eigen::Rand::gammaLike(xd, urng, 1, 2);
+	}
+
 	{
 		auto scope = bh.measure("gamma(5,3)" + suffix, x);
 		x = Eigen::Rand::gammaLike(x, urng, 5, 3);
 	}
 
-
 	{
 		auto scope = bh.measure("gamma(5,3)/gen" + suffix, x);
 		Eigen::Rand::GammaGen<float> gen{ 5, 3 };
 		x = gen.generateLike(x, urng);
 	}
 
-	/*{
+	{
 		auto scope = bh.measure("gamma(5,3)/double" + suffix, xd);
 		xd = Eigen::Rand::gammaLike(xd, urng, 5, 3);
-	}*/
+	}
 
 	{
 		auto scope = bh.measure("gamma(0.2,1)" + suffix, x);
 		x = Eigen::Rand::gammaLike(x, urng, 0.2, 1);
 	}
 
+	{
+		auto scope = bh.measure("gamma(0.2,1)/double" + suffix, xd);
+		xd = Eigen::Rand::gammaLike(xd, urng, 0.2, 1);
+	}
+
 	{
 		auto scope = bh.measure("gamma(10.5,1)" + suffix, x);
 		x = Eigen::Rand::gammaLike(x, urng, 10.5, 1);
 	}
 
+	{
+		auto scope = bh.measure("gamma(10.5,1)/double" + suffix, xd);
+		xd = Eigen::Rand::gammaLike(xd, urng, 10.5, 1);
+	}
+
 	{
 		auto scope = bh.measure("weibull(2,3)" + suffix, x);
 		x = Eigen::Rand::weibullLike(x, urng, 2, 3);
 	}
 
+	{
+		auto scope = bh.measure("weibull(2,3)/double" + suffix, xd);
+		xd = Eigen::Rand::weibullLike(xd, urng, 2, 3);
+	}
+
 	{
 		auto scope = bh.measure("extremeValue(0,1)" + suffix, x);
 		x = Eigen::Rand::extremeValueLike(x, urng, 0, 1);
 	}
 
+	{
+		auto scope = bh.measure("extremeValue(0,1)/double" + suffix, xd);
+		xd = Eigen::Rand::extremeValueLike(xd, urng, 0, 1);
+	}
+
 	{
 		auto scope = bh.measure("chiSquared(15)" + suffix, x);
 		x = Eigen::Rand::chiSquaredLike(x, urng, 15);
 	}
 
+	{
+		auto scope = bh.measure("chiSquared(15)/double" + suffix, xd);
+		xd = Eigen::Rand::chiSquaredLike(xd, urng, 15);
+	}
+
 	{
 		auto scope = bh.measure("cauchy" + suffix, x);
 		x = Eigen::Rand::cauchyLike(x, urng);
 	}
 
+	{
+		auto scope = bh.measure("cauchy/double" + suffix, xd);
+		xd = Eigen::Rand::cauchyLike(xd, urng);
+	}
+
 	{
 		auto scope = bh.measure("studentT(1)" + suffix, x);
 		x = Eigen::Rand::studentTLike(x, urng, 1);
 	}
 
+	{
+		auto scope = bh.measure("studentT(1)/double" + suffix, xd);
+		xd = Eigen::Rand::studentTLike(xd, urng, 1);
+	}
+
 	{
 		auto scope = bh.measure("studentT(5)" + suffix, x);
 		x = Eigen::Rand::studentTLike(x, urng, 5);
 	}
 
+	{
+		auto scope = bh.measure("studentT(5)/double" + suffix, xd);
+		xd = Eigen::Rand::studentTLike(xd, urng, 5);
+	}
+
 	{
 		auto scope = bh.measure("studentT(20)" + suffix, x);
 		x = Eigen::Rand::studentTLike(x, urng, 20);
@@ -351,6 +415,11 @@ std::map<std::string, double> test_eigenrand(size_t size, const std::string& suf
 		x = Eigen::Rand::fisherFLike(x, urng, 1, 1);
 	}
 
+	{
+		auto scope = bh.measure("fisherF(1,1)/double" + suffix, xd);
+		xd = Eigen::Rand::fisherFLike(xd, urng, 1, 1);
+	}
+
 	{
 		auto scope = bh.measure("fisherF(5,1)" + suffix, x);
 		x = Eigen::Rand::fisherFLike(x, urng, 5, 1);
@@ -366,6 +435,11 @@ std::map<std::string, double> test_eigenrand(size_t size, const std::string& suf
 		x = Eigen::Rand::fisherFLike(x, urng, 5, 5);
 	}
 
+	{
+		auto scope = bh.measure("fisherF(5,5)/double" + suffix, xd);
+		xd = Eigen::Rand::fisherFLike(xd, urng, 5, 5);
+	}
+
 	{
 		auto scope = bh.measure("poisson(1)" + suffix, xi);
 		xi = Eigen::Rand::poissonLike(xi, urng, 1);
@@ -746,7 +820,7 @@ int main(int argc, char** argv)
 
 	for (size_t i = 0; i < repeat; ++i)
 	{
-		for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results))
+		/*for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results))
 		{
 			time[p.first] += p.second;
 			timeSq[p.first] += p.second * p.second;
@@ -792,7 +866,7 @@ int main(int argc, char** argv)
 		{
 			time[p.first] += p.second;
 			timeSq[p.first] += p.second * p.second;
-		}
+		}*/
 
 		for (auto& p : test_eigenrand<std::mt19937_64>(size, "\t:ERand", results))
 		{