diff --git a/pythran/pythonic/include/numpy/arange.hpp b/pythran/pythonic/include/numpy/arange.hpp index 77c91a0b3f..577c74f06a 100644 --- a/pythran/pythonic/include/numpy/arange.hpp +++ b/pythran/pythonic/include/numpy/arange.hpp @@ -14,7 +14,7 @@ namespace numpy #ifdef USE_XSIMD template struct arange_simd_iterator { - using vector_type = xsimd::simd_type; + using vector_type = xsimd::batch; vector_type curr_; vector_type step_; long index_; diff --git a/pythran/pythonic/include/numpy/conjugate.hpp b/pythran/pythonic/include/numpy/conjugate.hpp index 4194828916..08fcbdeabb 100644 --- a/pythran/pythonic/include/numpy/conjugate.hpp +++ b/pythran/pythonic/include/numpy/conjugate.hpp @@ -18,14 +18,14 @@ namespace numpy { return std::conj(v); } - template - xsimd::batch, N> - conjugate(xsimd::batch, N> const &v) + template + xsimd::batch, A> + conjugate(xsimd::batch, A> const &v) { return xsimd::conj(v); } - template - xsimd::batch conjugate(xsimd::batch const &v) + template + xsimd::batch conjugate(xsimd::batch const &v) { return v; } diff --git a/pythran/pythonic/include/types/nditerator.hpp b/pythran/pythonic/include/types/nditerator.hpp index e08bc73a79..e74f19c2b5 100644 --- a/pythran/pythonic/include/types/nditerator.hpp +++ b/pythran/pythonic/include/types/nditerator.hpp @@ -98,9 +98,9 @@ namespace types template struct const_simd_nditerator : public std::iterator> { + xsimd::batch> { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; typename E::dtype const *data; static const std::size_t vector_size = vector_type::size; @@ -116,7 +116,7 @@ namespace types bool operator==(const_simd_nditerator const &other) const; bool operator<(const_simd_nditerator const &other) const; const_simd_nditerator &operator=(const_simd_nditerator const &other); - void store(xsimd::simd_type const &); + void store(xsimd::batch const &); }; template struct const_simd_nditerator_nostep : const_simd_nditerator { diff --git a/pythran/pythonic/include/types/numpy_broadcast.hpp b/pythran/pythonic/include/types/numpy_broadcast.hpp index a8386da722..4bc7e639c4 100644 --- a/pythran/pythonic/include/types/numpy_broadcast.hpp +++ b/pythran/pythonic/include/types/numpy_broadcast.hpp @@ -176,7 +176,7 @@ namespace types template struct broadcast_base { dtype _value; - xsimd::simd_type _splated; + xsimd::batch _splated; broadcast_base() = default; template diff --git a/pythran/pythonic/include/types/numpy_expr.hpp b/pythran/pythonic/include/types/numpy_expr.hpp index a68457f96e..26a47dd6b2 100644 --- a/pythran/pythonic/include/types/numpy_expr.hpp +++ b/pythran/pythonic/include/types/numpy_expr.hpp @@ -271,10 +271,12 @@ namespace types auto _dereference(utils::index_sequence) const -> decltype(Op{}(*std::get(iters_)...)) { - return Op{}(((std::get(steps_)) - ? (*std::get(iters_)) - : (xsimd::simd_type(iters_))>( - *std::get(siters_))))...); + return Op{}(( + (std::get(steps_)) + ? 
(*std::get(iters_)) + : (xsimd::batch< + typename std::decay(siters_))>::type>( + *std::get(siters_))))...); } auto operator*() const -> decltype( diff --git a/pythran/pythonic/numpy/argminmax.hpp b/pythran/pythonic/numpy/argminmax.hpp index c0a96f7b9e..39bbf61ed3 100644 --- a/pythran/pythonic/numpy/argminmax.hpp +++ b/pythran/pythonic/numpy/argminmax.hpp @@ -99,7 +99,7 @@ namespace numpy long>::type _argminmax(E const &elts, T &minmax_elts, utils::int_<1>) { - using vT = xsimd::simd_type; + using vT = xsimd::batch; using iT = xsimd::as_integer_t; static const size_t vN = vT::size; const long n = elts.size(); @@ -117,9 +117,9 @@ namespace numpy iT iota[vN] = {0}; for (long i = 0; i < (long)vN; ++i) iota[i] = i; - auto curr = xsimd::load_unaligned(iota); - xsimd::simd_type indices = curr; - xsimd::simd_type step{vN}; + xsimd::batch curr = xsimd::load_unaligned(iota); + xsimd::batch indices = curr; + xsimd::batch step(vN); for (++viter; viter != vend; ++viter) { curr += step; diff --git a/pythran/pythonic/numpy/reduce.hpp b/pythran/pythonic/numpy/reduce.hpp index 8b97fde776..a42f3bc332 100644 --- a/pythran/pythonic/numpy/reduce.hpp +++ b/pythran/pythonic/numpy/reduce.hpp @@ -76,7 +76,7 @@ namespace numpy F vreduce(E e, F acc) { using T = typename E::dtype; - using vT = xsimd::simd_type; + using vT = xsimd::batch; static const size_t vN = vT::size; const long n = e.size(); auto viter = vectorizer::vbegin(e), vend = vectorizer::vend(e); diff --git a/pythran/pythonic/types/list.hpp b/pythran/pythonic/types/list.hpp index 9a991b3768..614656b6a5 100644 --- a/pythran/pythonic/types/list.hpp +++ b/pythran/pythonic/types/list.hpp @@ -210,7 +210,7 @@ namespace types typename sliced_list::simd_iterator sliced_list::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {data->data() + slicing.lower + long(size() / vector_size * vector_size)}; @@ -497,7 +497,7 @@ namespace types template typename list::simd_iterator list::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {data->data() + long(size() / vector_size * vector_size)}; } diff --git a/pythran/pythonic/types/ndarray.hpp b/pythran/pythonic/types/ndarray.hpp index cd60f48121..8062ff9882 100644 --- a/pythran/pythonic/types/ndarray.hpp +++ b/pythran/pythonic/types/ndarray.hpp @@ -689,7 +689,7 @@ namespace types template typename ndarray::simd_iterator ndarray::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {buffer + long(std::get<0>(_shape) / vector_size * vector_size)}; } diff --git a/pythran/pythonic/types/nditerator.hpp b/pythran/pythonic/types/nditerator.hpp index 1bc276c7eb..e809776c0e 100644 --- a/pythran/pythonic/types/nditerator.hpp +++ b/pythran/pythonic/types/nditerator.hpp @@ -211,8 +211,8 @@ namespace types } template - void const_simd_nditerator::store( - xsimd::simd_type const &val) + void + const_simd_nditerator::store(xsimd::batch const &val) { val.store_unaligned(const_cast(data)); } diff --git a/pythran/pythonic/types/numpy_broadcast.hpp b/pythran/pythonic/types/numpy_broadcast.hpp index 08b6496fb2..cc4976f277 100644 --- a/pythran/pythonic/types/numpy_broadcast.hpp +++ b/pythran/pythonic/types/numpy_broadcast.hpp @@ -77,7 +77,7 @@ 
namespace types template template broadcast_base::broadcast_base(V v) - : _value(v), _splated(xsimd::simd_type(_value)) + : _value(v), _splated(xsimd::batch(_value)) { } diff --git a/pythran/pythonic/types/numpy_gexpr.hpp b/pythran/pythonic/types/numpy_gexpr.hpp index 25051955b7..61fddab265 100644 --- a/pythran/pythonic/types/numpy_gexpr.hpp +++ b/pythran/pythonic/types/numpy_gexpr.hpp @@ -660,7 +660,7 @@ namespace types typename numpy_gexpr::simd_iterator numpy_gexpr::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {buffer + long(size() / vector_size * vector_size)}; } diff --git a/pythran/pythonic/types/numpy_iexpr.hpp b/pythran/pythonic/types/numpy_iexpr.hpp index 62d92af2fb..30e95ff9f7 100644 --- a/pythran/pythonic/types/numpy_iexpr.hpp +++ b/pythran/pythonic/types/numpy_iexpr.hpp @@ -331,7 +331,7 @@ namespace types typename numpy_iexpr::simd_iterator numpy_iexpr::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {buffer + long(size() / vector_size * vector_size)}; } diff --git a/pythran/pythonic/types/tuple.hpp b/pythran/pythonic/types/tuple.hpp index ad4cdea992..5d0ebab5f2 100644 --- a/pythran/pythonic/types/tuple.hpp +++ b/pythran/pythonic/types/tuple.hpp @@ -246,7 +246,7 @@ namespace types typename array_base::simd_iterator array_base::vend(vectorizer) const { - using vector_type = typename xsimd::simd_type; + using vector_type = typename xsimd::batch; static const std::size_t vector_size = vector_type::size; return {&buffer[long(size() / vector_size * vector_size)]}; } diff --git a/pythran/pythonic/utils/broadcast_copy.hpp b/pythran/pythonic/utils/broadcast_copy.hpp index 71ad6d39cf..47c345457c 100644 --- a/pythran/pythonic/utils/broadcast_copy.hpp +++ b/pythran/pythonic/utils/broadcast_copy.hpp @@ -198,7 +198,7 @@ namespace utils void vbroadcast_copy(E &&self, F const &other) { using T = typename F::dtype; - using vT = xsimd::simd_type; + using vT = xsimd::batch; static const std::size_t vN = vT::size; @@ -428,7 +428,7 @@ namespace utils void vbroadcast_update(E &&self, F const &other) { using T = typename F::dtype; - using vT = typename xsimd::simd_type; + using vT = typename xsimd::batch; long other_size = std::distance(other.begin(), other.end()); static const std::size_t vN = vT::size; diff --git a/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp new file mode 100644 index 0000000000..a927ba9757 --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -0,0 +1,101 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_GENERIC_ARITHMETIC_HPP +#define XSIMD_GENERIC_ARITHMETIC_HPP + +#include +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd { + + namespace kernel { + + using namespace types; + + // bitwise_lshift + template::value, void>::type*/> + batch bitwise_lshift(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) { return x << y;}, self, other); + } + + // bitwise_rshift + template::value, void>::type*/> + batch bitwise_rshift(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) { return x >> y;}, self, other); + } + + // div + template::value, void>::type> + batch div(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) -> T { return x / y;}, self, other); + } + + // fma + template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { + return x * y + z; + } + + template batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { + auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return {res_r, res_i}; + } + + // fms + template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { + return x * y - z; + } + + template batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { + auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return {res_r, res_i}; + } + + // fnma + template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { + return -x * y + z; + } + + template batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { + auto res_r = - fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = - fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return {res_r, res_i}; + } + + // fnms + template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { + return -x * y - z; + } + + template batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { + auto res_r = - fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = - fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return {res_r, res_i}; + } + + + + // mul + template::value, void>::type*/> + batch mul(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) -> T { return x * y;}, self, other); + } + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp b/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp new file mode 100644 index 0000000000..79fd0f1e7f --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp @@ -0,0 +1,86 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_GENERIC_COMPLEX_HPP +#define XSIMD_GENERIC_COMPLEX_HPP + +#include + +#include "./xsimd_generic_details.hpp" + +namespace xsimd { + + namespace kernel { + + using namespace types; + + // real + template + batch real(batch const& self, requires_arch) { + return self; + } + + template + batch real(batch, A> const& self, requires_arch) { + return self.real(); + } + + // imag + template + batch imag(batch const& /*self*/, requires_arch) { + return batch(T(0)); + } + + template + batch imag(batch, A> const& self, requires_arch) { + return self.imag(); + } + + // arg + template + real_batch_type_t> arg(batch const& self, requires_arch) { + return atan2(imag(self), real(self)); + } + + // conj + template + complex_batch_type_t> conj(batch const& self, requires_arch) { + return {real(self), - imag(self)}; + } + + // norm + template + real_batch_type_t> norm(batch const& self, requires_arch) { + return {fma(real(self), real(self), imag(self) * imag(self))}; + } + + // proj + template + complex_batch_type_t> proj(batch const& self, requires_arch) { + using batch_type = complex_batch_type_t>; + using real_batch = typename batch_type::real_batch; + using real_value_type = typename real_batch::value_type; + auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); + return select(cond, + batch_type(constants::infinity(), + copysign(real_batch(real_value_type(0)), imag(self))), + batch_type(self)); + } + + template + batch_bool isnan(batch, A> const& self, requires_arch) { + return batch_bool(isnan(self.real()) || isnan(self.imag())); + } + } +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/arch/generic/xsimd_generic_details.hpp new file mode 100644 index 0000000000..c14ba3d85e --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_details.hpp @@ -0,0 +1,217 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_GENERIC_DETAILS_HPP +#define XSIMD_GENERIC_DETAILS_HPP + +#include + +#include "../../types/xsimd_generic_arch.hpp" +#include "../../types/xsimd_utils.hpp" +#include "../../math/xsimd_rem_pio2.hpp" +#include "../xsimd_constants.hpp" + +namespace xsimd { + // Forward declaration. Should we put them in a separate file? 
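// --------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of the original patch): the
// entry points forward-declared below are templated on an architecture type
// `A` (xsimd::batch<T, A>), which this diff adopts in place of the older
// xsimd::simd_type<T> / xsimd::batch<T, N> spelling used on the Pythran side.
// A minimal usage sketch follows, assuming only batch<T>::size,
// load_unaligned / store_unaligned and xsimd::fma, all of which appear
// elsewhere in this patch. The function name `axpy` is made up for the
// example; the snippet is meant as a standalone translation unit, not as
// part of this header.
#if 0 // illustration only
#include "xsimd/xsimd.hpp"
#include <cstddef>

void axpy(float a, float const* x, float* y, std::size_t n)
{
    using batch = xsimd::batch<float>;       // arch parameter defaults to the best enabled architecture
    constexpr std::size_t inc = batch::size; // number of lanes, known at compile time
    std::size_t const vn = n - n % inc;      // size of the vectorizable prefix
    for (std::size_t i = 0; i < vn; i += inc) {
        batch bx = batch::load_unaligned(x + i);
        batch by = batch::load_unaligned(y + i);
        // Falls back to the generic kernels in these arch/generic headers
        // when the target architecture provides no dedicated implementation.
        xsimd::fma(batch(a), bx, by).store_unaligned(y + i);
    }
    for (std::size_t i = vn; i < n; ++i) // scalar tail
        y[i] = a * x[i] + y[i];
}
#endif
// --------------------------------------------------------------------------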
+ template + batch abs(batch const& self); + template + batch abs(batch, A> const& self); + template + bool any(batch_bool const& self); + template + batch atan2(batch const& self, batch const& other); + template + batch bitofsign(batch const& self); + template + B bitwise_cast(batch const& self); + template + batch_bool bool_cast(batch_bool const& self); + template + batch_bool bool_cast(batch_bool const& self); + template + batch_bool bool_cast(batch_bool const& self); + template + batch_bool bool_cast(batch_bool const& self); + template + batch cos(batch const& self); + template + batch cosh(batch const& self); + template + batch exp(batch const& self); + template + batch fma(batch const& x, batch const& y, batch const& z); + template + batch fms(batch const& x, batch const& y, batch const& z); + template + batch frexp(const batch& x, const batch, A>& e); + template + T hadd(batch const&); + template + batch horner(const batch& self); + template + batch hypot(const batch& self); + template + batch_bool is_even(batch const& self); + template + batch_bool is_flint(batch const& self); + template + batch_bool is_odd(batch const& self); + template + batch_bool isinf(batch const& self); + template + typename batch::batch_bool_type isnan(batch const& self); + template + batch ldexp(const batch& x, const batch, A>& e); + template + batch log(batch const& self); + template + batch nearbyint(batch const& self); + template + batch select(batch_bool const&, batch const& , batch const& ); + template + batch, A> select(batch_bool const&, batch, A> const& , batch, A> const& ); + template + batch sign(batch const& self); + template + batch signnz(batch const& self); + template + batch sin(batch const& self); + template + batch sinh(batch const& self); + template + std::pair, batch> sincos(batch const& self); + template + batch sqrt(batch const& self); + template + batch tan(batch const& self); + template + batch, A> to_float(batch const& self); + template + batch, A> to_int(batch const& self); + template + batch trunc(batch const& self); + + + namespace kernel { + + namespace detail { + template + batch apply(F&& func, batch const& self, batch const& other) { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + alignas(A::alignment()) T other_buffer[size]; + self.store_aligned(&self_buffer[0]); + other.store_aligned(&other_buffer[0]); + for(std::size_t i = 0; i < size; ++i) { + self_buffer[i] = func(self_buffer[i], other_buffer[i]); + } + return batch::load_aligned(self_buffer); + } + } + + namespace detail { + // Generic conversion handling machinery. Each architecture must define + // conversion function when such conversions exits in the form of + // intrinsic. Then we use that information to automatically decide whether + // to use scalar or vector conversion when doing load / store / batch_cast + struct with_fast_conversion{}; + struct with_slow_conversion{}; + + template + struct conversion_type_impl + { + using type = with_slow_conversion; + }; + + using xsimd::detail::void_t; + + template + struct conversion_type_impl(), std::declval(), std::declval()))>> + { + using type = with_fast_conversion; + }; + + template + using conversion_type = typename conversion_type_impl::type; + } + + namespace detail { + /* origin: boost/simdfunction/horn.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B coef() noexcept + { + using value_type = typename B::value_type; + return B(bit_cast(as_unsigned_integer_t(c))); + } + template + inline B horner(const B&) noexcept + { + return B(typename B::value_type(0.)); + } + + template + inline B horner(const B&) noexcept + { + return coef(); + } + + template + inline B horner(const B& self) noexcept + { + return fma(self, horner(self), coef()); + } + + /* origin: boost/simdfunction/horn1.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B horner1(const B&) noexcept + { + return B(1.); + } + + template + inline B horner1(const B& x) noexcept + { + return x + detail::coef(); + } + + template + inline B horner1(const B& x) noexcept + { + return fma(x, horner1(x), detail::coef()); + } + } + + + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp b/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp new file mode 100644 index 0000000000..fb8110b974 --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp @@ -0,0 +1,107 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_GENERIC_LOGICAL_HPP +#define XSIMD_GENERIC_LOGICAL_HPP + +#include "./xsimd_generic_details.hpp" + + +namespace xsimd { + + namespace kernel { + + using namespace types; + + // ge + template batch_bool ge(batch const& self, batch const& other, requires_arch) { + return other <= self; + } + + // gt + template batch_bool gt(batch const& self, batch const& other, requires_arch) { + return other < self; + } + + // is_even + template batch_bool is_even(batch const& self, requires_arch) { + return is_flint(self * T(0.5)); + } + + // is_flint + template batch_bool is_flint(batch const& self, requires_arch) { + auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); + return frac == T(0.); + } + + // is_odd + template batch_bool is_odd(batch const& self, requires_arch) { + return is_even(self - T(1.)); + } + + // isinf + template::value, void>::type> + batch_bool isinf(batch const& , requires_arch) { + return batch_bool(false); + } + template batch_bool isinf(batch const& self, requires_arch) { + return abs(self) == std::numeric_limits::infinity(); + } + template batch_bool isinf(batch const& self, requires_arch) { + return abs(self) == std::numeric_limits::infinity(); + } + + // isfinite + template::value, void>::type> + batch_bool isfinite(batch const& , requires_arch) { + return batch_bool(true); + } + template batch_bool isfinite(batch const& self, requires_arch) { + return (self - self) == 0; + } + template batch_bool isfinite(batch const& self, requires_arch) { + return (self - self) == 0; + } + + // isnan + template::value, void>::type> + batch_bool isnan(batch const& , requires_arch) { + return batch_bool(false); + } + + // le + template::value, void>::type> + batch_bool le(batch const& self, batch const& other, requires_arch) { + return (self < other) || (self == other); + } + + + // neq + template batch_bool neq(batch const& self, batch const& other, requires_arch) { + return !(other == self); + } + + // logical_and + template + batch logical_and(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) { return x && y;}, self, other); + } + + // logical_or + template + batch logical_or(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) { return x || y;}, self, other); + } + } +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/arch/generic/xsimd_generic_math.hpp new file mode 100644 index 0000000000..2b8562b73b --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_math.hpp @@ -0,0 +1,2215 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_GENERIC_MATH_HPP +#define XSIMD_GENERIC_MATH_HPP + +#include + +#include "./xsimd_generic_details.hpp" +#include "./xsimd_generic_trigo.hpp" +#include "../xsimd_scalar.hpp" + + +namespace xsimd { + + namespace kernel { + + using namespace types; + // abs + template::value, void>::type*/> + batch abs(batch const& self, requires_arch) + { + if(std::is_unsigned::value) + return self; + else { + auto sign = bitofsign(self); + auto inv = self ^ sign; + return inv - sign; + } + } + + template + batch abs(batch, A> const& z, requires_arch) { + return hypot(z.real(), z.imag()); + } + + // batch_cast + template batch batch_cast(batch const& self, batch const&, requires_arch) { + return self; + } + + namespace detail { + template + batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) { + return fast_cast(self, out, A{}); + } + template + batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) { + static_assert(!std::is_same::value, "there should be no conversion for this type combination"); + using batch_type_in = batch; + using batch_type_out = batch; + static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); + alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; + alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; + self.store_aligned(&buffer_in[0]); + std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); + return batch_type_out::load_aligned(buffer_out); + } + + } + + template + batch batch_cast(batch const& self, batch const& out, requires_arch) { + return detail::batch_cast(self, out, A{}, detail::conversion_type{}); + } + + // bitofsign + template batch bitofsign(batch const& self, requires_arch) { + static_assert(std::is_integral::value, "int type implementation"); + if(std::is_unsigned::value) + return batch(0); + else + return self >> (T)(8 * sizeof(T) - 1); + } + + template batch bitofsign(batch const& self, requires_arch) { + return self & constants::minuszero>(); + } + template batch bitofsign(batch const& self, requires_arch) { + return self & constants::minuszero>(); + } + + + // cbrt + /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch cbrt(batch const& self, requires_arch) { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2 (bit_cast(0x3fa14518)); + const batch_type CBRT4 (bit_cast(0x3fcb2ff5)); + const batch_type CBRT2I (bit_cast(0x3f4b2ff5)); + const batch_type CBRT4I (bit_cast(0x3f214518)); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= i_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1.f / 3.f); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) || isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + template batch cbrt(batch const& self, requires_arch) { + using batch_type = batch; + batch_type z = abs(self); +#ifndef XSIMD_NO_DENORMALS + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); +#endif + const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); + const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); + const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); + const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= typename i_type::value_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1. / 3.); + x -= (x - z / (x * x)) * batch_type(1. / 3.); +#ifndef XSIMD_NO_DENORMALS + x = (x | bitofsign(self)) * f; +#else + x = x | bitofsign(self); +#endif +#ifndef XSIMD_NO_INFINITIES + return select(self == batch_type(0.) || isinf(self), self, x); +#else + return select(self == batch_type(0.), self, x); +#endif + } + + // clip + template batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) { + return min(hi, max(self, lo)); + } + + + // copysign + template batch copysign(batch const& self, batch const& other, requires_arch) { + return abs(self) | bitofsign(other); + } + + + // erf + + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct erf_kernel; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 2/3 + static inline batch_type erf1(const batch_type& x) + { + return detail::horner(x); + } + + // computes erfc(x)*exp(sqr(x)) + // x >= 2/3 + static inline batch_type erfc2(const batch_type& x) + { + return detail::horner(x); + } + + static inline batch_type erfc3(const batch_type& x) + { + return (batch_type(1.) - x) * detail::horner(x); + } + }; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 0.65 + static inline batch_type erf1(const batch_type& x) + { + return detail::horner(x) / + detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 0.65 <= abs(x) <= 2.2 + static inline batch_type erfc2(const batch_type& x) + { + return detail::horner(x) / + detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 2.2 <= abs(x) <= 6 + static inline batch_type erfc3(const batch_type& x) + { + return detail::horner(x) / + detail::horner(x); + } + + // computes erfc(rx)*exp(rx*rx) + // x >= 6 rx = 1/x + static inline batch_type erfc4(const batch_type& x) + { + return detail::horner(x); + } + }; + } + /* origin: boost/simd/arch/common/simd/function/erf.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + batch erf(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + batch_type r1(0.); + auto test1 = x < batch_type(2.f / 3.f); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(x * x); + if (all(test1)) + return r1; + } + batch_type z = x / (batch_type(1.) + x); + z -= batch_type(0.4f); + batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel::erfc2(z); + r2 = select(self < batch_type(0.), -r2, r2); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(xsimd::isinf(self), sign(self), r1); +#endif + return r1; + } + template batch erf(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1 (0.65); + batch_type lim2 (2.2); + auto test1 = x < lim1; + batch_type r1 (0.); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(xx); + if (all(test1)) + return r1; + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); + batch_type r2 = select(self < batch_type(0.), -z, z); + r1 = select(test1, r1, r2); + if (all(test1 || test3)) + return r1; + } + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); + z = select(self < batch_type(0.), -z, z); +#ifndef XSIMD_NO_INFINITIES + z = select(xsimd::isinf(self), sign(self), z); +#endif + return select(test2, r1, z); + } + + // erfc + template batch erfc(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto test0 = self < batch_type(0.); + batch_type r1 (0.); + auto test1 = x < batch_type(2.f / 3.f); + batch_type z = x / (batch_type(1.) 
+ x); + if (any(test1)) + { + r1 = detail::erf_kernel::erfc3(z); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + z -= batch_type(0.4f); + batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + template batch erfc(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1 (0.65); + batch_type lim2 (2.2); + auto test0 = self < batch_type(0.); + auto test1 = x < lim1; + batch_type r1 (0.); + if (any(test1)) + { + r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = ex * detail::erf_kernel::erfc2(x); + r1 = select(test1, r1, z); + if (all(test1 || test3)) + return select(test0, batch_type(2.) - r1, r1); + } + batch_type z = ex * detail::erf_kernel::erfc3(x); + r1 = select(test2, r1, z); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } + + // estrin + namespace detail + { + + template + struct estrin + { + B x; + + template + inline B operator()(const Ts&... coefs) noexcept + { + return eval(coefs...); + } + + private: + inline B eval(const B& c0) noexcept + { + return c0; + } + + inline B eval(const B& c0, const B& c1) noexcept + { + return fma(x, c1, c0); + } + + template + inline B eval(::xsimd::detail::index_sequence, const Tuple& tuple) + { + return estrin{x * x}(std::get(tuple)...); + } + + template + inline B eval(const std::tuple& tuple) noexcept + { + return eval(::xsimd::detail::make_index_sequence(), tuple); + } + + template + inline B eval(const std::tuple& tuple, const B& c0) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); + } + + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); + } + + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); + } + + template + inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::make_tuple(eval(c0, c1)), coefs...); + } + }; + } + template + batch estrin(const batch& self) { + using batch_type = batch; + return detail::estrin{self}(detail::coef()...); + } + + // exp + /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + enum exp_reduction_tag { exp_tag, exp2_tag, exp10_tag }; + + template + struct exp_reduction_base; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog10(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog10(); + } + }; + + template + struct exp_reduction_base + { + static constexpr B maxlog() noexcept + { + return constants::maxlog2(); + } + + static constexpr B minlog() noexcept + { + return constants::minlog2(); + } + }; + + template + struct exp_reduction; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) + { + batch_type k = nearbyint(constants::invlog_2() * a); + x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + return ++(detail::horner(x) * + x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x -= k * constants::log10_2lo(); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x * constants::log_2()); + } + + static inline batch_type reduce(const batch_type& a, batch_type& x) + { + batch_type k = nearbyint(a); + x = (a - k); + return k; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) + { + batch_type k = nearbyint(constants::invlog_2() * a); + hi = fnma(k, constants::log_2hi(), a); + lo = k * constants::log_2lo(); + x = hi - lo; + return k; + } + + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) + { + return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) 
- c)) - hi)); + } + }; + + template + struct exp_reduction : exp_reduction_base, exp10_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + batch_type xx = x * x; + batch_type px = x * detail::horner(xx); + batch_type x2 = px / (detail::horner1(xx) - + px); + return ++(x2 + x2); + } + + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x = fnma(k, constants::log10_2lo(), x); + return k; + } + + static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) + { + return c; + } + }; + + template + struct exp_reduction : exp_reduction_base, exp2_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } + + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) + { + batch_type k = nearbyint(a); + x = (a - k) * constants::log_2(); + return k; + } + + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) + { + return batch_type(1.) + x + x * c / (batch_type(2.) - c); + } + }; + + template batch exp(batch const& self) { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type x; + batch_type k = reducer_t::reduce(self, x); + x = reducer_t::approx(x); + x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); + x = select(self >= reducer_t::maxlog(), constants::infinity(), x); + return x; + } + template batch exp(batch const& self) { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type hi, lo, x; + batch_type k = reducer_t::reduce(self, hi, lo, x); + batch_type c = reducer_t::approx(x); + c = reducer_t::finalize(x, c, hi, lo); + c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); + c = select(self >= reducer_t::maxlog(), constants::infinity(), c); + return c; + } + } + + template batch exp(batch const& self, requires_arch) { + return detail::exp(self); + } + + template batch, A> exp(batch, A> const& self, requires_arch) { + using batch_type = batch, A>; + auto isincos = sincos(self.imag()); + return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); + } + + // exp10 + template batch exp10(batch const& self, requires_arch) { + return detail::exp(self); + } + + // exp2 + template batch exp2(batch const& self, requires_arch) { + return detail::exp(self); + } + + // expm1 + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch expm1(const batch& a) + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + batch_type hx = x * batch_type(0.5); + batch_type hxs = x * hx; + batch_type r = detail::horner(hxs); + batch_type t = fnma(r, hx, batch_type(3.)); + batch_type e = hxs * ((r - t) / (batch_type(6.) 
- x * t)); + e = fms(x, e, hxs); + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type y = batch_type(1.) - two2mk - (e - x); + return ldexp(y, ik); + } + + template + static inline batch expm1(const batch& a) + { + using batch_type = batch; + batch_type k = nearbyint(constants::invlog_2() * a); + batch_type hi = fnma(k, constants::log_2hi(), a); + batch_type lo = k * constants::log_2lo(); + batch_type x = hi - lo; + batch_type hxs = x * x * batch_type(0.5); + batch_type r = detail::horner(hxs); + batch_type t = batch_type(3.) - r * batch_type(0.5) * x; + batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); + batch_type c = (hi - x) - lo; + e = (x * (e - c) - c) - hxs; + using i_type = as_integer_t; + i_type ik = to_int(k); + batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); + batch_type ct1 = batch_type(1.) - two2mk - (e - x); + batch_type ct2 = ++(x - (e + two2mk)); + batch_type y = select(k < batch_type(20.), ct1, ct2); + return ldexp(y, ik); + } + + } + + template batch expm1(batch const& self, requires_arch) { + using batch_type = batch; + return select(self < constants::logeps(), + batch_type(-1.), + select(self > constants::maxlog(), + constants::infinity(), + detail::expm1(self))); + } + + template + batch, A> expm1(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch isin = sin(z.imag()); + real_batch rem1 = expm1(z.real()); + real_batch re = rem1 + 1.; + real_batch si = sin(z.imag() * 0.5); + return {rem1 - 2. * re * si * si, re * isin}; + } + + // fdim + template batch fdim(batch const& self, batch const& other, requires_arch) { + return fmax(batch(0), self - other); + } + + // fmod + template batch fmod(batch const& self, batch const& other, requires_arch) { + return fnma(trunc(self / other), other, self); + } + + // frexp + /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + batch frexp(const batch& self, batch, A>& exp, requires_arch) { + using batch_type = batch; + using i_type = batch, A>; + i_type m1f = constants::mask1frexp(); + i_type r1 = m1f & ::xsimd::bitwise_cast(self); + batch_type x = self & ::xsimd::bitwise_cast(~m1f); + exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); + exp = select(bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); + return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); + } + + // from bool + template + batch from_bool(batch_bool const& self, requires_arch) { + return batch(self.data) & batch(1); + } + + // hadd + template std::complex hadd(batch, A> const& self, requires_arch) { + return {hadd(self.real()), hadd(self.imag())}; + } + + // horner + template + batch horner(const batch& self) { + return detail::horner, Coefs...>(self); + } + + // hypot + template batch hypot(batch const& self, batch const& other, requires_arch) { + return sqrt(fma(self, self, other * other)); + } + + // ipow + template batch ipow(batch const& self, ITy other, requires_arch) { + return ::xsimd::detail::ipow(self, other); + } + + + // ldexp + /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + batch ldexp(const batch& self, const batch, A>& other, requires_arch) { + using batch_type = batch; + using itype = as_integer_t; + itype ik = other + constants::maxexponent(); + ik = ik << constants::nmb(); + return self * ::xsimd::bitwise_cast(ik); + } + + // lgamma + template batch lgamma(batch const& self, requires_arch); + + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch gammalnB(const batch& x) + { + return horner, + 0x3ed87730, // 4.227843421859038E-001 + 0x3ea51a64, // 3.224669577325661E-001, + 0xbd89f07e, // -6.735323259371034E-002, + 0x3ca89ed8, // 2.058355474821512E-002, + 0xbbf164fd, // -7.366775108654962E-003, + 0x3b3ba883, // 2.863437556468661E-003, + 0xbaabeab1, // -1.311620815545743E-003, + 0x3a1ebb94 // 6.055172732649237E-004 + >(x); + } + + template + static inline batch gammalnC(const batch& x) + { + return horner, + 0xbf13c468, // -5.772156501719101E-001 + 0x3f528d34, // 8.224670749082976E-001, + 0xbecd27a8, // -4.006931650563372E-001, + 0x3e8a898b, // 2.705806208275915E-001, + 0xbe53c04f, // -2.067882815621965E-001, + 0x3e2d4dab, // 1.692415923504637E-001, + 0xbe22d329, // -1.590086327657347E-001, + 0x3e0c3c4f // 1.369488127325832E-001 + >(x); + } + + template + static inline batch gammaln2(const batch& x) + { + return horner, + 0x3daaaa94, // 8.333316229807355E-002f + 0xbb358701, // -2.769887652139868E-003f, + 0x3a31fd69 // 6.789774945028216E-004f + >(x); + } + template + static inline batch gammaln1(const batch& x) + { + return horner, + 0xc12a0c675418055eull, // -8.53555664245765465627E5 + 0xc13a45890219f20bull, // -1.72173700820839662146E6, + 0xc131bc82f994db51ull, // -1.16237097492762307383E6, + 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, + 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, + 0xc09589018ff36761ull // -1.37825152569120859100E3 + >(x) / + horner, + 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 + 0xc1435255892ff34cull, // -2.53252307177582951285E6, + 0xc131628671950043ull, // -1.13933444367982507207E6, + 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, + 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, + 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, + 0x3ff0000000000000ull // 1.00000000000000000000E0 + >(x); + } + + template + static inline batch gammalnA(const batch& x) + { + return horner, + 0x3fb555555555554bull, // 8.33333333333331927722E-2 + 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, + 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, + 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, + 0x3f4a985027336661ull // 8.11614167470508450300E-4 + >(x); + } + /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct lgamma_impl; + + template + struct lgamma_impl> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& a) + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (x == constants::infinity()) || inf_result; +#endif + auto ltza = a < batch_type(0.); + batch_type r; + batch_type r1 = other(q); + if (any(ltza)) + { + r = select(inf_result, constants::infinity(), negative(q, r1)); + if (all(ltza)) + return r; + } + batch_type r2 = select(ltza, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + + static inline batch_type negative(const batch_type& q, const batch_type& w) + { + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = z < batch_type(0.5); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + return -log(constants::invpi() * abs(z)) - w; + } + + static inline batch_type other(const batch_type& x) + { + auto xlt650 = (x < batch_type(6.5)); + batch_type r0x = x; + batch_type r0z = x; + batch_type r0s = batch_type(1.); + batch_type r1 = batch_type(0.); + batch_type p = constants::nan(); + if (any(xlt650)) + { + batch_type z = batch_type(1.); + batch_type tx = select(xlt650, x, batch_type(0.)); + batch_type nx = batch_type(0.); + const batch_type _075 = batch_type(0.75); + const batch_type _150 = batch_type(1.50); + const batch_type _125 = batch_type(1.25); + const batch_type _250 = batch_type(2.50); + auto xge150 = (x >= _150); + auto txgt250 = (tx > _250); + + // x >= 1.5 + while (any(xge150 && txgt250)) + { + nx = select(txgt250, nx - batch_type(1.), nx); + tx = select(txgt250, x + nx, tx); + z = select(txgt250, z * tx, z); + txgt250 = (tx > _250); + } + r0x = select(xge150, x + nx - batch_type(2.), x); + r0z = select(xge150, z, r0z); + r0s = select(xge150, batch_type(1.), r0s); + + // x >= 1.25 && x < 1.5 + auto xge125 = (x >= _125); + auto xge125t = xge125 && !xge150; + if (any(xge125)) + { + r0x = select(xge125t, x - batch_type(1.), r0x); + r0z = select(xge125t, z * x, r0z); + r0s = select(xge125t, batch_type(-1.), r0s); + } + + // x >= 0.75 && x < 1.5 + batch_bool kernelC(false); + auto xge075 = (x >= _075); + auto xge075t = xge075 && !xge125; + if (any(xge075t)) + { + kernelC = xge075t; + r0x = select(xge075t, x - batch_type(1.), x); + r0z = select(xge075t, batch_type(1.), r0z); + r0s = select(xge075t, batch_type(-1.), r0s); + p = gammalnC(r0x); + } + + // tx < 1.5 && x < 0.75 + auto txlt150 = (tx < _150) && !xge075; + if (any(txlt150)) + { + auto orig = txlt150; + while (any(txlt150)) + { + z = select(txlt150, z * tx, z); + nx = select(txlt150, nx + batch_type(1.), nx); + tx = select(txlt150, x + nx, tx); + txlt150 = (tx < _150) && !xge075; + } + r0x = select(orig, r0x + nx - batch_type(2.), r0x); + r0z = select(orig, z, r0z); + r0s = select(orig, batch_type(-1.), r0s); + } + p = select(kernelC, p, gammalnB(r0x)); + if (all(xlt650)) + return fma(r0x, p, r0s * log(abs(r0z))); + } + r0z = select(xlt650, abs(r0z), x); + batch_type m = log(r0z); + r1 = fma(r0x, p, r0s * m); + batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); + r2 += gammaln2(batch_type(1.) 
/ (x * x)) / x; + return select(xlt650, r1, r2); + } + }; + + template + struct lgamma_impl> + { + using batch_type = batch; + + static inline batch_type compute(const batch_type& a) + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); +#ifndef XSIMD_NO_INFINITIES + inf_result = (q == constants::infinity()); +#endif + auto test = (a < batch_type(-34.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = large_negative(q); + if (all(test)) + return select(inf_result, constants::nan(), r); + } + batch_type r1 = other(a); + batch_type r2 = select(test, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); + } + + private: + + static inline batch_type large_negative(const batch_type& q) + { + batch_type w = lgamma(q); + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = (z < batch_type(0.5)); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + z = abs(z); + return constants::logpi() - log(z) - w; + } + + static inline batch_type other(const batch_type& xx) + { + batch_type x = xx; + auto test = (x < batch_type(13.)); + batch_type r1 = batch_type(0.); + if (any(test)) + { + batch_type z = batch_type(1.); + batch_type p = batch_type(0.); + batch_type u = select(test, x, batch_type(0.)); + auto test1 = (u >= batch_type(3.)); + while (any(test1)) + { + p = select(test1, p - batch_type(1.), p); + u = select(test1, x + p, u); + z = select(test1, z * u, z); + test1 = (u >= batch_type(3.)); + } + + auto test2 = (u < batch_type(2.)); + while (any(test2)) + { + z = select(test2, z / u, z); + p = select(test2, p + batch_type(1.), p); + u = select(test2, x + p, u); + test2 = (u < batch_type(2.)); + } + + z = abs(z); + x += p - batch_type(2.); + r1 = x * gammaln1(x) + log(z); + if (all(test)) + return r1; + } + batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); + batch_type p = batch_type(1.) / (xx * xx); + r2 += gammalnA(p) / xx; + return select(test, r1, r2); + } + }; + } + + template batch lgamma(batch const& self, requires_arch) { + return detail::lgamma_impl>::compute(self); + } + + + // log + /* origin: boost/simd/arch/common/simd/function/log.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch log(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(23), k); + x = select(test, x * batch_type(8388608ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template batch log(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + batch_type dk = to_float(k); + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + + batch_type f = --x; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + batch, A> log(const batch, A>& z, requires_arch) + { + return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); + } + + // log2 + template batch log2(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t1 + t2; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + template batch log2(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); + batch_type lo = fma(s, hfsq + R, f - hi - hfsq); + batch_type val_hi = hi * constants::invlog_2hi(); + batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); + batch_type dk = to_float(k); + batch_type w1 = dk + val_hi; + val_lo += (dk - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + namespace detail { + template + inline batch logN_complex_impl(const batch& z, typename batch::value_type base) + { + using batch_type = batch; + using rv_type = typename batch_type::value_type; + return log(z) / batch_type(rv_type(base)); + } + } + + template batch, A> log2(batch, A> const& self, requires_arch) { + return detail::logN_complex_impl(self, std::log(2)); + } + + // log10 + /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ + /* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
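// Illustrative scalar sketch (not part of the patch above), showing the reduction the
// vectorized log/log2 kernels rely on: x is split as 2^k * m with m brought into
// [sqrt(2)/2, sqrt(2)), then log(m) is evaluated from f = m - 1 via a polynomial in
// s = f / (2 + f). The bit patterns 0x3f3504f3 / 0x3fe6a09e used above encode sqrt(2)/2.
// The helper name below is hypothetical.
#include <cmath>

inline double log_reduction_sketch(double x)
{
    int k;
    double m = std::frexp(x, &k);              // x = m * 2^k, m in [0.5, 1)
    if (m < 0.70710678118654752440) {          // renormalize so m lands in [sqrt(2)/2, sqrt(2))
        m *= 2.0;
        --k;
    }
    double f = m - 1.0;
    // the SIMD kernels approximate log1p(f) with the s / s^2 Horner series instead
    return k * 0.69314718055994530942 + std::log1p(f);
}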
+ * ==================================================== + */ + template batch log10(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type + ivln10hi(4.3432617188e-01f), + ivln10lo(-3.1689971365e-05f), + log10_2hi(3.0102920532e-01f), + log10_2lo(7.9034151668e-07f); + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } +#endif + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hibits = f - hfsq; + hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); + batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); + batch_type r = fma(dk, log10_2hi, + fma(hibits, ivln10hi, + fma(lobits, ivln10hi, + fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + template batch log10(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type + ivln10hi(4.34294481878168880939e-01), + ivln10lo(2.50829467116452752298e-11), + log10_2hi(3.01029995663611771306e-01), + log10_2lo(3.69423907715893078616e-13); + using i_type = as_integer_t; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); +#ifndef XSIMD_NO_DENORMALS + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } +#endif + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type dk = to_float(k); + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); + batch_type lo = f - hi - hfsq + s * (hfsq + R); + batch_type val_hi = hi * ivln10hi; + batch_type y = dk * log10_2hi; + batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; + batch_type w1 = y + val_hi; + val_lo += (y - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + batch, A> log10(const batch, A>& z, requires_arch) + { + return detail::logN_complex_impl(z, std::log(10)); + } + + // log1p + /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch log1p(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type iu = ::xsimd::bitwise_cast(uf); + iu += 0x3f800000 - 0x3f3504f3; + i_type k = (iu >> 23) - 0x7f; + iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; + batch_type f = --(::xsimd::bitwise_cast(iu)); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + template batch log1p(batch const& self, requires_arch) { + using batch_type = batch; + using i_type = as_integer_t; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type hu = ::xsimd::bitwise_cast(uf) >> 32; + hu += 0x3ff00000 - 0x3fe6a09e; + i_type k = (hu >> 20) - 0x3ff; + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; + batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); + f = --f; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); +#ifndef XSIMD_NO_INFINITIES + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); +#else + batch_type zz = select(isnez, r, constants::minusinfinity()); +#endif + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template batch, A> log1p(batch, A> const& self, requires_arch) { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type u = 1 + self; + batch_type logu = log(u); + return select(u == batch_type(1.), + self, + select(u.real() <= real_batch(0.), + logu, + logu * self / (u - batch_type(1.)))); + } + + + // mod + template::value, void>::type> + batch mod(batch const& self, batch const& other, requires_arch) { + return detail::apply([](T x, T y) -> T { return x % y;}, self, other); + } + + + // nearbyint + template::value, void>::type> + batch nearbyint(batch const& self, requires_arch) { + return self; + } + namespace detail { + template batch nearbyintf(batch const& self) { + using batch_type = batch; + batch_type s = bitofsign(self); + batch_type v = self ^ s; + batch_type t2n = constants::twotonmb(); + // Under fast-math, reordering is possible and the compiler optimizes d + // to v. That's not what we want, so prevent compiler optimization here. + // FIXME: it may be better to emit a memory barrier here (?). +#ifdef __FAST_MATH__ + volatile batch_type d0 = v + t2n; + batch_type d = *(batch_type*)(void*)(&d0) - t2n; +#else + batch_type d0 = v + t2n; + batch_type d = d0 - t2n; +#endif + return s ^ select(v < t2n, d, v); + } + } + template batch nearbyint(batch const& self, requires_arch) { + return detail::nearbyintf(self); + } + template batch nearbyint(batch const& self, requires_arch) { + return detail::nearbyintf(self); + } + + // nextafter + namespace detail + { + template ::value> + struct nextafter_kernel + { + using batch_type = batch; + + static inline batch_type next(batch_type const& b) noexcept + { + return b; + } + + static inline batch_type prev(batch_type const& b) noexcept + { + return b; + } + }; + + template + struct bitwise_cast_batch; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct bitwise_cast_batch + { + using type = batch; + }; + + template + struct nextafter_kernel + { + using batch_type = batch; + using int_batch = typename bitwise_cast_batch::type; + using int_type = typename int_batch::value_type; + + static inline batch_type next(const batch_type& b) noexcept + { + batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); + return select(b == constants::infinity(), b, n); + } + + static inline batch_type prev(const batch_type& b) noexcept + { + batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); + return select(b == constants::minusinfinity(), b, p); + } + }; + } + template batch nextafter(batch const& from, batch const& to, requires_arch) { + using kernel = detail::nextafter_kernel; + return select(from == to, from, + select(to > from, kernel::next(from), kernel::prev(from))); + } + + + // pow + /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ + /* + * ==================================================== + * copyright 2016 
NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch pow(batch const& self, batch const& other, requires_arch) { + using batch_type = batch; + auto negx = self < batch_type(0.); + batch_type z = exp(other * log(abs(self))); + z = select(is_odd(other) && negx, -z, z); + auto invalid = negx && !(is_flint(other) || isinf(other)); + return select(invalid, constants::nan(), z); + } + + template + inline batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) + { + using cplx_batch = batch, A>; + using real_batch = typename cplx_batch::real_batch; + real_batch absa = abs(a); + real_batch arga = arg(a); + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch r = pow(absa, x); + real_batch theta = x * arga; + real_batch ze(0); + auto cond = (y == ze); + r = select(cond, r, r * exp(-y * arga)); + theta = select(cond, theta, theta + y * log(absa)); + return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); + } + + + // remainder + template + batch remainder(batch const& self, batch const& other, requires_arch) { + return fnma(nearbyint(self / other), other, self); + } + template + batch remainder(batch const& self, batch const& other, requires_arch) { + return fnma(nearbyint(self / other), other, self); + } + template::value, void>::type> + batch remainder(batch const& self, batch const& other, requires_arch) { + auto mod = self % other; + return select(mod <= other / 2, mod, mod - other); + } + + // select + template + batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) { + return {select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag())}; + } + + // sign + template::value, void>::type> batch sign(batch const& self, requires_arch) { + using batch_type = batch; + batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); + return res; + } + + namespace detail { + template batch signf(batch const& self) { + using batch_type = batch; + batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); +#ifdef XSIMD_NO_NANS + return res; +#else + return select(isnan(self), constants::nan(), res); +#endif + } + } + + template batch sign(batch const& self, requires_arch) { + return detail::signf(self); + } + template batch sign(batch const& self, requires_arch) { + return detail::signf(self); + } + template + inline batch, A> sign(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + auto rz = z.real(); + auto iz = z.imag(); + return select(rz != real_batch(0.), + batch_type(sign(rz)), + batch_type(sign(iz))); + } + + // signnz + template::value, void>::type> batch signnz(batch const& self, requires_arch) { + using batch_type = batch; + return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); + } + + namespace detail { + template batch signnzf(batch const& self) { + using batch_type = batch; +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); +#else + return batch_type(1.) 
| (constants::signmask() & self); +#endif + } + } + + template batch signnz(batch const& self, requires_arch) { + return detail::signnzf(self); + } + template batch signnz(batch const& self, requires_arch) { + return detail::signnzf(self); + } + + // sqrt + template batch, A> sqrt(batch, A> const& z, requires_arch) { + + constexpr T csqrt_scale_factor = std::is_same::value?6.7108864e7f:1.8014398509481984e16; + constexpr T csqrt_scale = std::is_same::value?1.220703125e-4f:7.450580596923828125e-9; + using batch_type = batch, A>; + using real_batch = batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch sqrt_x = sqrt(fabs(x)); + real_batch sqrt_hy = sqrt(0.5 * fabs(y)); + auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); + x = select(cond, x * 0.25, x * csqrt_scale_factor); + y = select(cond, y * 0.25, y * csqrt_scale_factor); + real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); + real_batch r = abs(batch_type(x, y)); + + auto condxp = x > real_batch(0.); + real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); + real_batch r0 = scale * fabs((0.5 * y) / t0); + t0 *= scale; + real_batch t = select(condxp, t0, r0); + r = select(condxp, r0, t0); + batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); + real_batch ze(0.); + + return select(y == ze, + select(x == ze, + batch_type(ze, ze), + select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), + select(x == ze, + select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), + resg)); + + } + + // tgamma + + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct stirling_kernel; + + template + struct stirling_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) + { + return horner(x); + } + + static inline batch_type split_limit() + { + return batch_type(bit_cast(uint32_t(0x41d628f6))); + } + + static inline batch_type large_limit() + { + return batch_type(bit_cast(uint32_t(0x420c28f3))); + } + }; + + template + struct stirling_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) + { + return horner(x); + } + + static inline batch_type split_limit() + { + return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); + } + + static inline batch_type large_limit() + { + return batch_type(bit_cast(uint64_t(0x4065800000000000))); + } + }; + + /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch stirling(const batch& a) + { + using batch_type = batch; + const batch_type stirlingsplitlim = stirling_kernel::split_limit(); + const batch_type stirlinglargelim = stirling_kernel::large_limit(); + batch_type x = select(a >= batch_type(0.), a, constants::nan()); + batch_type w = batch_type(1.) 
/ x; + w = fma(w, stirling_kernel::compute(w), batch_type(1.)); + batch_type y = exp(-x); + auto test = (x < stirlingsplitlim); + batch_type z = x - batch_type(0.5); + z = select(test, z, batch_type(0.5) * z); + batch_type v = exp(z * log(abs(x))); + y *= v; + y = select(test, y, y * v); + y *= constants::sqrt_2pi() * w; +#ifndef XSIMD_NO_INFINITIES + y = select(isinf(x), x, y); +#endif + return select(x > stirlinglargelim, constants::infinity(), y); + } + + /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tgamma_kernel; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) + { + return horner(x); + } + }; + + template + struct tgamma_kernel> + { + using batch_type = batch; + static inline batch_type compute(const batch_type& x) + { + return horner(x) / + horner(x); + } + }; + + /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + B tgamma_large_negative(const B& a) + { + B st = stirling(a); + B p = floor(a); + B sgngam = select(is_even(p), -B(1.), B(1.)); + B z = a - p; + auto test2 = z < B(0.5); + z = select(test2, z - B(1.), z); + z = a * sin(z, trigo_pi_tag()); + z = abs(z); + return sgngam * constants::pi() / (z * st); + } + + template + B tgamma_other(const B& a, const BB& test) + { + B x = select(test, B(2.), a); +#ifndef XSIMD_NO_INFINITIES + auto inf_result = (a == constants::infinity()); + x = select(inf_result, B(2.), x); +#endif + B z = B(1.); + auto test1 = (x >= B(3.)); + while (any(test1)) + { + x = select(test1, x - B(1.), x); + z = select(test1, z * x, z); + test1 = (x >= B(3.)); + } + test1 = (x < B(0.)); + while (any(test1)) + { + z = select(test1, z / x, z); + x = select(test1, x + B(1.), x); + test1 = (x < B(0.)); + } + auto test2 = (x < B(2.)); + while (any(test2)) + { + z = select(test2, z / x, z); + x = select(test2, x + B(1.), x); + test2 = (x < B(2.)); + } + x = z * tgamma_kernel::compute(x - B(2.)); +#ifndef XSIMD_NO_INFINITIES + return select(inf_result, a, x); +#else + return x; +#endif + } + } + + template batch tgamma(batch const& self, requires_arch) { + using batch_type = batch; + auto nan_result = (self < batch_type(0.) 
&& is_flint(self)); +#ifndef XSIMD_NO_INVALIDS + nan_result = isnan(self) || nan_result; +#endif + batch_type q = abs(self); + auto test = (self < batch_type(-33.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = detail::tgamma_large_negative(q); + if (all(test)) + return select(nan_result, constants::nan(), r); + } + batch_type r1 = detail::tgamma_other(self, test); + batch_type r2 = select(test, r, r1); + return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); + } + + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp new file mode 100644 index 0000000000..39f4c102c6 --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -0,0 +1,188 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_GENERIC_MEMORY_HPP +#define XSIMD_GENERIC_MEMORY_HPP + +#include +#include +#include + +#include "./xsimd_generic_details.hpp" + + +namespace xsimd { + + namespace kernel { + + using namespace types; + + // extract_pair + template batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) { + constexpr std::size_t size = batch::size; + assert(0<= i && i< size && "index in bounds"); + + alignas(A::alignment()) T self_buffer[size]; + self.store_aligned(self_buffer); + + alignas(A::alignment()) T other_buffer[size]; + other.store_aligned(other_buffer); + + alignas(A::alignment()) T concat_buffer[size]; + + for (std::size_t j = 0 ; j < (size - i); ++j) + { + concat_buffer[j] = other_buffer[i + j]; + if(j < i) + { + concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; + } + } + return batch::load_aligned(concat_buffer); + } + + // load_aligned + namespace detail { + template + batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A{}); + } + template + batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + using batch_type_out = batch; + alignas(A::alignment()) T_out buffer[batch_type_out::size]; + std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); + return batch_type_out::load_aligned(buffer); + } + } + template + batch load_aligned(T_in const* mem, convert cvt, requires_arch) { + return detail::load_aligned(mem, cvt, A{}, detail::conversion_type{}); + } + + // load_unaligned + namespace detail { + template + batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A{}); + } + + template + batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + return load_aligned(mem, cvt, 
generic{}, with_slow_conversion{}); + } + } + template + batch load_unaligned(T_in const* mem, convert cvt, requires_arch) { + return detail::load_unaligned(mem, cvt, generic{}, detail::conversion_type{}); + } + + // store + template + void store(batch_bool const& self, bool* mem, requires_arch) { + using batch_type = batch; + constexpr auto size = batch_bool::size; + alignas(A::alignment()) T buffer[size]; + kernel::store_aligned(&buffer[0], batch_type(self), A{}); + for(std::size_t i = 0; i < size; ++i) + mem[i] = bool(buffer[i]); + } + + + // store_aligned + template void store_aligned(T_out *mem, batch const& self, requires_arch) { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + alignas(A::alignment()) T_in buffer[batch::size]; + store_aligned(&buffer[0], self); + std::copy(std::begin(buffer), std::end(buffer), mem); + } + + // store_unaligned + template void store_unaligned(T_out *mem, batch const& self, requires_arch) { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + return store_aligned(mem, self, generic{}); + } + + namespace detail + { + template + batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) + { + static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); + } + + template + batch complex_high(batch, A> const& /*src*/, requires_arch) + { + static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); + } + + template + batch complex_low(batch, A> const& /*src*/, requires_arch) + { + static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); + } + } + + // load_complex_aligned + template + batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_aligned(buffer), + lo = real_batch::load_aligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A{}); + } + + // load_complex_unaligned + template + batch, A> load_complex_unaligned(std::complex const* mem, convert> ,requires_arch) { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_unaligned(buffer), + lo = real_batch::load_unaligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A{}); + } + + // store_complex_aligned + template + void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A{}); + real_batch lo = detail::complex_low(src, A{}); + T_out* buffer = reinterpret_cast(dst); + lo.store_aligned(buffer); + hi.store_aligned(buffer + real_batch::size); + } + + // store_compelx_unaligned + template + void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A{}); + real_batch lo = detail::complex_low(src, A{}); + T_out* buffer = reinterpret_cast(dst); + lo.store_unaligned(buffer); + hi.store_unaligned(buffer + real_batch::size); + } + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp b/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp new file mode 100644 index 0000000000..b1c988101b --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp @@ -0,0 +1,64 @@ 
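// Illustrative usage sketch (not part of the patch above) for the complex load/store
// kernels in the memory header that ends here: a batch of std::complex is kept as two
// real batches internally (de-interleaved by the architecture-specific load_complex),
// while user code only ever passes std::complex pointers. Assumes the xsimd 8-style
// batch API; the helper name is hypothetical.
#include <complex>
#include <vector>
#include "xsimd/xsimd.hpp"

inline void complex_roundtrip_sketch()
{
    using cbatch = xsimd::batch<std::complex<float>>;
    std::vector<std::complex<float>> in(cbatch::size, std::complex<float>(1.0f, 2.0f));
    std::vector<std::complex<float>> out(cbatch::size);

    cbatch z = cbatch::load_unaligned(in.data());  // forwards to load_complex_unaligned
    z.store_unaligned(out.data());                 // forwards to store_complex_unaligned
}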
+/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_GENERIC_ROUNDING_HPP +#define XSIMD_GENERIC_ROUNDING_HPP + +#include "./xsimd_generic_details.hpp" + + +namespace xsimd { + + namespace kernel { + + + using namespace types; + + // ceil + template batch ceil(batch const& self, requires_arch) { + batch truncated_self = trunc(self); + return select(truncated_self < self, truncated_self + 1, truncated_self); + } + + + // floor + template batch floor(batch const& self, requires_arch) { + batch truncated_self = trunc(self); + return select(truncated_self > self, truncated_self - 1, truncated_self); + } + + // round + template batch round(batch const& self, requires_arch) { + auto v = abs(self); + auto c = ceil(v); + auto cp = select(c - 0.5 > v, c - 1, c); + return select(v > constants::maxflint>(), self, copysign(cp, self)); + } + + // trunc + template::value, void>::type> + batch trunc(batch const& self, requires_arch) { + return self; + } + template batch trunc(batch const& self, requires_arch) { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } + template batch trunc(batch const& self, requires_arch) { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } + + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp new file mode 100644 index 0000000000..f649c6c259 --- /dev/null +++ b/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp @@ -0,0 +1,936 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_GENERIC_TRIGO_HPP +#define XSIMD_GENERIC_TRIGO_HPP + +#include "./xsimd_generic_details.hpp" + +namespace xsimd { + + namespace kernel { + /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
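// Illustrative scalar sketch (not part of the patch above) of the generic round()
// defined in the rounding header that ends here: round half away from zero, built from
// ceil on |x| plus a final copysign, guarded so that large floats that are already
// integers pass through unchanged. The helper name is hypothetical.
#include <cmath>

inline float round_half_away_sketch(float x)
{
    float v  = std::fabs(x);
    float c  = std::ceil(v);
    float cp = (c - 0.5f > v) ? c - 1.0f : c;   // nearest integer; ties on |x| go up, i.e. away from zero
    const float maxflint = 16777216.0f;         // 2^24; constants::maxflint plays this role in the kernel
    return v > maxflint ? x : std::copysign(cp, x);
}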
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + using namespace types; + + // acos + template batch acos(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto x_larger_05 = x > batch_type(0.5); + x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); + x = asin(x); + x = select(x_larger_05, x + x, x); + x = select(self < batch_type(-0.5), constants::pi() - x, x); + return select(x_larger_05, x, constants::pio2() - x); + } + template + batch, A> acos(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type tmp = asin(z); + return {constants::pio2() - tmp.real(), -tmp.imag()}; + } + + // acosh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch acosh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = self - batch_type(1.); + auto test = x > constants::oneotwoeps(); + batch_type z = select(test, self, x + sqrt(x + x + x * x)); + batch_type l1pz = log1p(z); + return select(test, l1pz + constants::log_2(), l1pz); + } + template + inline batch, A> acosh(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + batch_type w = acos(z); + w = batch_type(-w.imag(), w.real()); + return w; + } + + // asin + template batch asin(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + batch_type sign = bitofsign(self); + auto x_larger_05 = x > batch_type(0.5); + batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); + x = select(x_larger_05, sqrt(z), x); + batch_type z1 = detail::horner(z); + z1 = fma(z1, z * x, x); + z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); + return z ^ sign; + } + template batch asin(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto small_cond = x < constants::sqrteps(); + batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); + batch_type zz1 = batch_type(1.) - x; + batch_type vp = zz1 * detail::horner(zz1) / + detail::horner1(zz1); + zz1 = sqrt(zz1 + zz1); + batch_type z = constants::pio4() - zz1; + zz1 = fms(zz1, vp, constants::pio_2lo()); + z = z - zz1; + zz1 = z + constants::pio4(); + batch_type zz2 = self * self; + z = zz2 * detail::horner(zz2) / + detail::horner1(zz2); + zz2 = fma(x, z, x); + return select(x > batch_type(1.), constants::nan(), + select(small_cond, x, + select(x > ct1, zz1, zz2)) ^ + bitofsign(self)); + } + template + batch, A> asin(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + + batch_type ct(-y, x); + batch_type zz(real_batch(1.) 
- (x - y) * (x + y), -2 * x * y); + zz = log(ct + sqrt(zz)); + batch_type resg(zz.imag(), -zz.real()); + + return select(y == real_batch(0.), + select(fabs(x) > real_batch(1.), + batch_type(constants::pio2(), real_batch(0.)), + batch_type(asin(x), real_batch(0.))), + resg); + } + + // asinh + /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail { + template::value, void>::type> + batch + average(const batch& x1, const batch& x2) + { + return (x1 & x2) + ((x1 ^ x2) >> 1); + } + + template + batch + averagef(const batch& x1, const batch& x2) + { + using batch_type = batch; + return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); + } + template + batch average(batch const & x1, batch const & x2) { + return averagef(x1, x2); + } + template + batch average(batch const & x1, batch const & x2) { + return averagef(x1, x2); + } + } + template batch asinh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto lthalf = x < batch_type(0.5); + batch_type x2 = x * x; + batch_type bts = bitofsign(self); + batch_type z(0.); + if (any(lthalf)) + { + z = detail::horner(x2) * + x; + if (all(lthalf)) + return z ^ bts; + } + batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_NANS + return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); +#else + return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; +#endif + } + template batch asinh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto test = x > constants::oneosqrteps(); + batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) 
+ hypot(batch_type(1.), x))); +#ifndef XSIMD_NO_INFINITIES + z = select(x == constants::infinity(), x, z); +#endif + batch_type l1pz = log1p(z); + z = select(test, l1pz + constants::log_2(), l1pz); + return bitofsign(self) ^ z; + } + template + inline batch, A> asinh(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + batch_type w = asin(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan + namespace detail { + template + static inline batch kernel_atan(const batch& x, const batch& recx) + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + const batch_type z = xx * xx; + batch_type z1 = detail::horner(z); + z1 = fma(xx, z1 * z, xx); + z1 = select(flag2, z1 + constants::pio_4lo(), z1); + z1 = select(!flag1, z1 + constants::pio_2lo(), z1); + return yy + z1; + } + template + static inline batch kernel_atan(const batch& x, const batch& recx) + { + using batch_type = batch; + const auto flag1 = x < constants::tan3pio8(); + const auto flag2 = (x >= constants::tanpio8()) && flag1; + batch_type yy = select(flag1, batch_type(0.), constants::pio2()); + yy = select(flag2, constants::pio4(), yy); + batch_type xx = select(flag1, x, -recx); + xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); + batch_type z = xx * xx; + z *= detail::horner(z) / + detail::horner1(z); + z = fma(xx, z, xx); + z = select(flag2, z + constants::pio_4lo(), z); + z = z + select(flag1, batch_type(0.), constants::pio_2lo()); + return yy + z; + } + } + template batch atan(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type absa = abs(self); + const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); + return x ^ bitofsign(self); + } + template + batch, A> atan(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch x2 = x * x; + real_batch one(1.); + real_batch a = one - x2 - (y * y); + real_batch w = 0.5 * atan2(2. * x, a); + real_batch num = y + one; + num = x2 + num * num; + real_batch den = y - one; + den = x2 + den * den; + batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), + batch_type(real_batch(0.), constants::infinity()), + batch_type(w, 0.25 * log(num / den))); + return res; + } + + // atanh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch atanh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + batch_type t = x + x; + batch_type z = batch_type(1.) 
- x; + auto test = x < batch_type(0.5); + batch_type tmp = select(test, x, t) / z; + return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); + } + template + inline batch, A> atanh(const batch, A>& z, requires_arch) + { + using batch_type = batch, A>; + batch_type w = atan(batch_type(-z.imag(), z.real())); + w = batch_type(w.imag(), -w.real()); + return w; + } + + // atan2 + template batch atan2(batch const& self, batch const& other, requires_arch) { + using batch_type = batch; + const batch_type q = abs(self / other); + const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); + return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); + } + + + // cos + namespace detail + { + template + batch quadrant(const batch& x) { + return x & batch(3); + } + + template + batch quadrant(const batch& x) { + return to_float(quadrant(to_int(x))); + } + + template + batch quadrant(const batch& x) { + using batch_type = batch; + batch_type a = x * batch_type(0.25); + return (a - floor(a)) * batch_type(4.); + } + /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + inline batch cos_eval(const batch& z) + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); + } + + template + inline batch sin_eval(const batch& z, const batch& x) + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static inline batch base_tancot_eval(const batch& z) + { + using batch_type = batch; + batch_type zz = z * z; + batch_type y = detail::horner(zz); + return fma(y, zz * z, z); + } + + template + static inline batch tan_eval(const batch& z, const BB& test) + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) / y); + } + + template + static inline batch cot_eval(const batch& z, const BB& test) + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + + /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch cos_eval(const batch& z) + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) - y * z; + } + + template + static inline batch sin_eval(const batch& z, const batch& x) + { + using batch_type = batch; + batch_type y = detail::horner(z); + return fma(y * z, x, x); + } + + template + static inline batch base_tancot_eval(const batch& z) + { + using batch_type = batch; + batch_type zz = z * z; + batch_type num = detail::horner(zz); + batch_type den = detail::horner1(zz); + return fma(z, (zz * (num / den)), z); + } + + template + static inline batch tan_eval(const batch& z, const BB& test) + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, y, -batch_type(1.) 
/ y); + } + + template + static inline batch cot_eval(const batch& z, const BB& test) + { + using batch_type = batch; + batch_type y = base_tancot_eval(z); + return select(test, batch_type(1.) / y, -y); + } + /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + struct trigo_radian_tag + { + }; + struct trigo_pi_tag + { + }; + + template + struct trigo_reducer + { + static inline B reduce(const B& x, B& xr) + { + if (all(x <= constants::pio4())) + { + xr = x; + return B(0.); + } + else if (all(x <= constants::pio2())) + { + auto test = x > constants::pio4(); + xr = x - constants::pio2_1(); + xr -= constants::pio2_2(); + xr -= constants::pio2_3(); + xr = select(test, xr, x); + return select(test, B(1.), B(0.)); + } + else if (all(x <= constants::twentypi())) + { + B xi = nearbyint(x * constants::twoopi()); + xr = fnma(xi, constants::pio2_1(), x); + xr -= xi * constants::pio2_2(); + xr -= xi * constants::pio2_3(); + return quadrant(xi); + } + else if (all(x <= constants::mediumpi())) + { + B fn = nearbyint(x * constants::twoopi()); + B r = x - fn * constants::pio2_1(); + B w = fn * constants::pio2_1t(); + B t = r; + w = fn * constants::pio2_2(); + r = t - w; + w = fn * constants::pio2_2t() - ((t - r) - w); + t = r; + w = fn * constants::pio2_3(); + r = t - w; + w = fn * constants::pio2_3t() - ((t - r) - w); + xr = r - w; + return quadrant(fn); + } + else + { + static constexpr std::size_t size = B::size; + using value_type = typename B::value_type; + alignas(B) std::array tmp; + alignas(B) std::array txr; + alignas(B) std::array args; + x.store_aligned(args.data()); + + for (std::size_t i = 0; i < size; ++i) + { + double arg = args[i]; + if (arg == std::numeric_limits::infinity()) + { + tmp[i] = 0.; + txr[i] = std::numeric_limits::quiet_NaN(); + } + else + { + double y[2]; + std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); + tmp[i] = value_type(n & 3); + txr[i] = value_type(y[0]); + } + } + xr = B::load_aligned(&txr[0]); + B res = B::load_aligned(&tmp[0]); + return res; + } + } + }; + + template + struct trigo_reducer + { + static inline B reduce(const B& x, B& xr) + { + B xi = nearbyint(x * B(2.)); + B x2 = x - xi * B(0.5); + xr = x2 * constants::pi(); + return quadrant(xi); + } + }; + + } + template batch cos(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + + template batch, A> cos(batch, A> const& z, requires_arch) { + return {cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag())}; + } + + // cosh + + /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software 
License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template batch cosh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type x = abs(self); + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = batch_type(0.5) * tmp; + return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); + } + template + inline batch, A> cosh(const batch, A>& z, requires_arch) + { + auto x = z.real(); + auto y = z.imag(); + return {cosh(x) * cos(y), sinh(x) * sin(y)}; + } + + + // sin + namespace detail { + template batch sin(batch const& self, Tag = Tag()) { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); + return z1 ^ sign_bit; + } + } + + template batch sin(batch const& self, requires_arch) { + return detail::sin(self); + } + + template batch, A> sin(batch, A> const& z, requires_arch) { + return {sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag())}; + } + + // sincos + template std::pair, batch> sincos(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); + auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); + return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); + } + + template + std::pair, A>, batch, A>> + sincos(batch, A> const& z, requires_arch) { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch rcos = cos(z.real()); + real_batch rsin = sin(z.real()); + real_batch icosh = cosh(z.imag()); + real_batch isinh = sinh(z.imag()); + return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); + } + + // sinh + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
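// Illustrative usage sketch (not part of the patch above) for the sincos kernel defined
// just before this point: computing sine and cosine together shares the expensive
// argument reduction, so it is cheaper than two separate calls. Assumes the public
// xsimd::sincos wrapper (returning a pair of batches) forwards to this kernel; the
// helper name is hypothetical.
#include "xsimd/xsimd.hpp"

inline void sincos_sketch()
{
    xsimd::batch<float> x(0.5f);      // broadcast 0.5f into every lane
    auto sc = xsimd::sincos(x);       // sc.first == sin(x), sc.second == cos(x), per lane
    xsimd::batch<float> s = sc.first;
    xsimd::batch<float> c = sc.second;
    (void)s; (void)c;
}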
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch sinh_kernel(batch const& self) { + using batch_type = batch; + batch_type sqr_self = self * self; + return detail::horner(sqr_self) * + self; + } + + template batch sinh_kernel(batch const& self) { + using batch_type = batch; + batch_type sqrself = self * self; + return fma(self, (detail::horner(sqrself) / + detail::horner1(sqrself)) * + sqrself, + self); + } + } + /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch sinh(batch const& a, requires_arch) { + using batch_type = batch; + batch_type half(0.5); + batch_type x = abs(a); + auto lt1 = x < batch_type(1.); + batch_type bts = bitofsign(a); + batch_type z(0.); + if (any(lt1)) + { + z = detail::sinh_kernel(x); + if (all(lt1)) + return z ^ bts; + } + auto test1 = x >( constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, half, batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = half * tmp; + batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); + return select(lt1, z, r) ^ bts; + } + template + inline batch, A> sinh(const batch, A>& z, requires_arch) + { + auto x = z.real(); + auto y = z.imag(); + return {sinh(x) * cos(y), cosh(x) * sin(y)}; + } + + // tan + template batch tan(batch const& self, requires_arch) { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto test = (swap_bit == batch_type(0.)); + const batch_type y = detail::tan_eval(xr, test); + return y ^ bitofsign(self); + } + template batch, A> tan(batch, A> const& z, requires_arch) { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); + batch_type winf(constants::infinity(), constants::infinity()); + real_batch wreal = sin(2 * z.real()) / d; + real_batch wimag = sinh(2 * z.imag()); + batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); + return select(d == real_batch(0.), winf, wres); + + } + + // tanh + namespace detail { + /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tanh_kernel; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) + { + batch_type sqrx = x * x; + return fma(detail::horner(sqrx) * + sqrx, + x, x); + } + + static inline batch_type cotanh(const batch_type& x) + { + return batch_type(1.) 
/ tanh(x); + } + }; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) + { + batch_type sqrx = x * x; + return fma(sqrx * p(sqrx) / q(sqrx), x, x); + } + + static inline batch_type cotanh(const batch_type& x) + { + batch_type sqrx = x * x; + batch_type qval = q(sqrx); + return qval / (x * fma(p(sqrx), sqrx, qval)); + } + + static inline batch_type p(const batch_type& x) + { + return detail::horner(x); + } + + static inline batch_type q(const batch_type& x) + { + return detail::horner1(x); + } + }; + + } + /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template batch tanh(batch const& self, requires_arch) { + using batch_type = batch; + batch_type one(1.); + batch_type x = abs(self); + auto test = x < (batch_type(5.) / batch_type(8.)); + batch_type bts = bitofsign(self); + batch_type z = one; + if (any(test)) + { + z = detail::tanh_kernel::tanh(x); + if (all(test)) + return z ^ bts; + } + batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); + return select(test, z, r) ^ bts; + } + template + inline batch, A> tanh(const batch, A>& z, requires_arch) + { + using real_batch = typename batch, A>::real_batch; + auto x = z.real(); + auto y = z.imag(); + real_batch two(2); + auto d = cosh(two * x) + cos(two * y); + return {sinh(two * x) / d, sin(two * y) / d}; + } + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/arch/xsimd_avx.hpp new file mode 100644 index 0000000000..7ac817f559 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_avx.hpp @@ -0,0 +1,961 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
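// Illustrative scalar sketch (not part of the patch above) of the large-|x| branch of
// the tanh kernel at the end of the generic trigo file above: tanh(x) is rewritten as
// 1 - 2 / (1 + exp(2x)), evaluated on |x| with the sign restored at the end. When
// exp(2x) overflows to infinity this form degrades gracefully to 1, unlike the naive
// (exp(2x) - 1) / (exp(2x) + 1). The helper name is hypothetical.
#include <cmath>

inline double tanh_large_sketch(double a)
{
    double x = std::fabs(a);
    double r = 1.0 - 2.0 / (1.0 + std::exp(x + x));   // matches fma(-2, 1/(1+exp(2x)), 1) in the kernel
    return std::copysign(r, a);
}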
* +****************************************************************************/ + +#ifndef XSIMD_AVX_HPP +#define XSIMD_AVX_HPP + +#include +#include +#include + +#include "../types/xsimd_avx_register.hpp" + +namespace xsimd { + + namespace kernel { + using namespace types; + + namespace detail { + inline void split_avx(__m256i val, __m128i& low, __m128i& high) { + low =_mm256_castsi256_si128(val); + high =_mm256_extractf128_si256(val, 1); + } + inline __m256i merge_sse(__m128i low, __m128i high) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); + } + template + __m256i fwd_to_sse(F f, __m256i self, __m256i other) { + __m128i self_low, self_high, other_low, other_high; + split_avx(self, self_low, self_high); + split_avx(other, other_low, other_high); + __m128i res_low = f(self_low, other_low); + __m128i res_high = f(self_high, other_high); + return merge_sse(res_low, res_high); + } + template + __m256i fwd_to_sse(F f, __m256i self, int32_t other) { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low, other); + __m128i res_high = f(self_high, other); + return merge_sse(res_low, res_high); + } + } + + // abs + template batch abs(batch const& self, requires_arch) { + __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_ps(sign_mask, self); + } + template batch abs(batch const& self, requires_arch) { + __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_pd(sign_mask, self); + } + + // add + template::value, void>::type> + batch add(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_add_epi8(self, other); + case 2: return _mm256_add_epi16(self, other); + case 4: return detail::fwd_to_sse([](__m128i s, __m128i o) { return add(batch(s), batch(o)); }, self, other); + case 8: return detail::fwd_to_sse([](__m128i s, __m128i o) { return add(batch(s), batch(o)); }, self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch add(batch const& self, batch const& other, requires_arch) { + return _mm256_add_ps(self, other); + } + template batch add(batch const& self, batch const& other, requires_arch) { + return _mm256_add_pd(self, other); + } + + // all + template bool all(batch_bool const& self, requires_arch) { + return _mm256_testc_ps(self, batch_bool(true)) != 0; + } + template bool all(batch_bool const& self, requires_arch) { + return _mm256_testc_pd(self, batch_bool(true)) != 0; + } + template::value, void>::type> + bool all(batch_bool const& self, requires_arch) { + return _mm256_testc_si256(self, batch_bool(true)) != 0; + } + + // any + template bool any(batch_bool const& self, requires_arch) { + return !_mm256_testz_ps(self, self); + } + template bool any(batch_bool const& self, requires_arch) { + return !_mm256_testz_pd(self, self); + } + template::value, void>::type> + bool any(batch_bool const& self, requires_arch) { + return !_mm256_testz_si256(self, self); + } + + // bitwise_and + template batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm256_and_ps(self, other); + } + template batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm256_and_pd(self, other); + } + + template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_and_ps(self, other); + } + template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, 
requires_arch) { + return _mm256_and_pd(self, other); + } + + template::value, void>::type> + batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, self, other); + } + template::value, void>::type> + batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, self, other); + } + + // bitwise_andnot + template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return _mm256_andnot_ps(self, other); + } + template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return _mm256_andnot_pd(self, other); + } + + template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_andnot_ps(self, other); + } + template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_andnot_pd(self, other); + } + + template::value, void>::type> + batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_andnot(batch(s), batch(o)); }, self, other); + } + template::value, void>::type> + batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_andnot(batch(s), batch(o)); }, self, other); + } + + // bitwise_lshift + template::value, void>::type> + batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { + switch(sizeof(T)) { + case 1: return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, _mm256_set1_epi8(0xFF << other), _mm256_slli_epi32(self, other)); + + case 2: return _mm256_slli_epi16(self, other); + case 4: return detail::fwd_to_sse([](__m128i s, int32_t o) { return bitwise_lshift(batch(s), o, sse4_2{}); },self, other); + case 8: return _mm256_slli_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + + // bitwise_not + template::value, void>::type> + batch bitwise_not(batch const& self, requires_arch) { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + template::value, void>::type> + batch_bool bitwise_not(batch_bool const& self, requires_arch) { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + + // bitwise_or + template batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm256_or_ps(self, other); + } + template batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm256_or_pd(self, other); + } + template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_or_ps(self, other); + } + template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_or_pd(self, other); + } + template::value, void>::type> + batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm256_or_si256(self, other); + } + template::value, void>::type> + batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_or_si256(self, other); + } + + // bitwise_rshift + template::value, void>::type> + batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { + if(std::is_signed::value) { + 
switch(sizeof(T)) { + case 1: { + __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); + __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); + __m256i res = _mm256_srai_epi16(self, other); + return _mm256_or_si256( + detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, sign_mask, cmp_is_negative), + _mm256_andnot_si256(sign_mask, res)); + } + case 2: return _mm256_srai_epi16(self, other); + case 4: return detail::fwd_to_sse([](__m128i s, int32_t o) { return bitwise_rshift(batch(s), o, sse4_2{}); }, self, other); + case 8: { + // from https://github.com/samyvilar/vect/blob/master/vect_128.h + return _mm256_or_si256( + _mm256_srli_epi64(self, other), + _mm256_slli_epi64( + _mm256_srai_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), + 64 - other)); + } + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + else { + switch(sizeof(T)) { + case 1: return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, _mm256_set1_epi8(0xFF >> other), _mm256_srli_epi32(self, other)); + case 2: return _mm256_srli_epi16(self, other); + case 4: return _mm256_srli_epi32(self, other); + case 8: return _mm256_srli_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + } + + // bitwise_xor + template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm256_xor_ps(self, other); + } + template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm256_xor_pd(self, other); + } + template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_xor_ps(self, other); + } + template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_xor_pd(self, other); + } + template::value, void>::type> + batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm256_xor_si256(self, other); + } + // bitwise_cast + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castsi256_ps(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castsi256_pd(self); + } + template::type>::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return batch(self.data); + } + template + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castps_pd(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castps_si256(self); + } + template + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castpd_ps(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm256_castpd_si256(self); + } + + // bitwise_not + template batch bitwise_not(batch const& self, requires_arch) { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + batch bitwise_not(batch const &self, requires_arch) { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + template batch_bool bitwise_not(batch_bool const& self, requires_arch) { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + batch_bool bitwise_not(batch_bool 
const &self, requires_arch) { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + + // bool_cast + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return _mm256_castps_si256(self); + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return _mm256_castsi256_ps(self); + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return _mm256_castpd_si256(self); + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return _mm256_castsi256_pd(self); + } + + // broadcast + template::value, void>::type> + batch broadcast(T val, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_set1_epi8(val); + case 2: return _mm256_set1_epi16(val); + case 4: return _mm256_set1_epi32(val); + case 8: return _mm256_set1_epi64x(val); + default: assert(false && "unsupported"); return {}; + } + } + template batch broadcast(float val, requires_arch) { + return _mm256_set1_ps(val); + } + template batch broadcast(double val, requires_arch) { + return _mm256_set1_pd(val); + } + + // ceil + template batch ceil(batch const& self, requires_arch) { + return _mm256_ceil_ps(self); + } + template batch ceil(batch const& self, requires_arch) { + return _mm256_ceil_pd(self); + } + + + namespace detail { + // On clang, _mm256_extractf128_ps is built upon build_shufflevector + // which require index parameter to be a constant + template + inline B get_half_complex_f(const B& real, const B& imag) + { + __m128 tmp0 = _mm256_extractf128_ps(real, index); + __m128 tmp1 = _mm256_extractf128_ps(imag, index); + __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); + tmp0 = _mm_unpacklo_ps(tmp0, tmp1); + __m256 res = real; + res = _mm256_insertf128_ps(res, tmp0, 0); + res = _mm256_insertf128_ps(res, tmp2, 1); + return res; + } + template + inline B get_half_complex_d(const B& real, const B& imag) + { + __m128d tmp0 = _mm256_extractf128_pd(real, index); + __m128d tmp1 = _mm256_extractf128_pd(imag, index); + __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); + tmp0 = _mm_unpacklo_pd(tmp0, tmp1); + __m256d res = real; + res = _mm256_insertf128_pd(res, tmp0, 0); + res = _mm256_insertf128_pd(res, tmp2, 1); + return res; + } + + // complex_low + template batch complex_low(batch, A> const& self, requires_arch) { + return get_half_complex_f<0>(self.real(), self.imag()); + } + template batch complex_low(batch, A> const& self, requires_arch) { + return get_half_complex_d<0>(self.real(), self.imag()); + } + + // complex_high + template batch complex_high(batch, A> const& self, requires_arch) { + return get_half_complex_f<1>(self.real(), self.imag()); + } + template batch complex_high(batch, A> const& self, requires_arch) { + return get_half_complex_d<1>(self.real(), self.imag()); + } + } + // convert + namespace detail { + template batch fast_cast(batch const& self, batch const&, requires_arch) { + return _mm256_cvtepi32_ps(self); + } + template batch fast_cast(batch const& self, batch const&, requires_arch) { + return _mm256_cvttps_epi32(self); + } + } + + // div + template batch div(batch const& self, batch const& other, requires_arch) { + return _mm256_div_ps(self, other); + } + template batch div(batch const& self, batch const& other, requires_arch) { + return _mm256_div_pd(self, other); + } + + // eq + template batch_bool eq(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); + } + template batch_bool eq(batch const& self, batch const& other, requires_arch) { + return 
_mm256_cmp_pd(self, other, _CMP_EQ_OQ); + } + template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_castsi256_ps(detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch_bool(s), batch_bool(o), sse4_2{}); }, + _mm256_castps_si256(self), _mm256_castps_si256(other))); + } + template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_castsi256_pd(detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch_bool(s), batch_bool(o), sse4_2{}); }, _mm256_castpd_si256(self), _mm256_castpd_si256(other))); + } + template::value, void>::type> + batch_bool eq(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_cmpeq_epi8(self, other); + case 2: return _mm256_cmpeq_epi16(self, other); + case 4: return detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch(s), batch(o), sse4_2{}); },self, other); + case 8: { + __m256i tmp1 = detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch(s), batch(o)); },self, other); + __m256i tmp2 = _mm256_shuffle_epi32(tmp1, 0xB1); + __m256i tmp3 = detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, tmp1, tmp2); + __m256i tmp4 = _mm256_srai_epi32(tmp3, 31); + return _mm256_shuffle_epi32(tmp4, 0xF5); + } + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template::value, void>::type> + batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { + return eq(batch(self.data), batch(other.data)); + } + + // floor + template batch floor(batch const& self, requires_arch) { + return _mm256_floor_ps(self); + } + template batch floor(batch const& self, requires_arch) { + return _mm256_floor_pd(self); + } + + // ge + template batch_bool ge(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_GE_OQ); + } + template batch_bool ge(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_pd(self, other, _CMP_GE_OQ); + } + template::value, void>::type> + batch_bool ge(batch const& self, batch const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return ge(batch(s), batch(o)); }, self, other); + } + + // gt + template batch_bool gt(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_GT_OQ); + } + template batch_bool gt(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_pd(self, other, _CMP_GT_OQ); + } + template::value, void>::type> + batch_bool gt(batch const& self, batch const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return gt(batch(s), batch(o)); }, self, other); + } + + + // hadd + template float hadd(batch const& rhs, requires_arch) { + // Warning about _mm256_hadd_ps: + // _mm256_hadd_ps(a,b) gives + // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). 
Hence we can't + // rely on a naive use of this method + // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) + // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) + __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); + // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) + tmp = _mm256_add_ps(rhs, tmp); + // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); + + } + template + double hadd(batch const &rhs, requires_arch) { + // rhs = (x0, x1, x2, x3) + // tmp = (x2, x3, x0, x1) + __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); + // tmp = (x2+x0, x3+x1, -, -) + tmp = _mm256_add_pd(rhs, tmp); + // tmp = (x2+x0+x3+x1, -, -, -) + tmp = _mm256_hadd_pd(tmp, tmp); + return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); + } + template::value, void>::type> + T hadd(batch const& self, requires_arch) { + __m128i low, high; + detail::split_avx(self, low, high); + batch blow(low), bhigh(high); + return hadd(blow) + hadd(bhigh); + } + + // haddp + template batch haddp(batch const* row, requires_arch) { + // row = (a,b,c,d,e,f,g,h) + // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) + __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); + // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) + __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); + // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) + tmp1 = _mm256_hadd_ps(tmp0, tmp1); + // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) + tmp0 = _mm256_hadd_ps(row[4], row[5]); + // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) + __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); + // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp2 = _mm256_hadd_ps(tmp0, tmp2); + // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); + // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, + // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) + tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); + return _mm256_add_ps(tmp0, tmp1); + } + template + batch haddp(batch const *row, requires_arch) { + // row = (a,b,c,d) + // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) + __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); + // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) + __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); + // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) + __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); + // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) + tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); + return _mm256_add_pd(tmp1, tmp2); + } + + // isnan + template batch_bool isnan(batch const& self, requires_arch) { + return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); + } + template batch_bool isnan(batch const& self, requires_arch) { + return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); + } + + // le + template batch_bool le(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_LE_OQ); + } + template batch_bool le(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_pd(self, other, _CMP_LE_OQ); + } + + // load_aligned + template::value, void>::type> + batch load_aligned(T const* mem, convert, requires_arch) { + return _mm256_load_si256((__m256i const*)mem); + } + 
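// --- illustrative sketch (not part of the patch) --------------------------
// The float hadd kernel above explains that _mm256_hadd_ps only adds
// adjacent pairs inside each 128-bit lane, so the reduction first swaps the
// two lanes with _mm256_permute2f128_ps and then applies hadd twice. The
// standalone helper below (hsum_avx is a hypothetical name, assuming an
// AVX-enabled build) reproduces that scheme so it can be checked in
// isolation against a scalar sum.
#include <immintrin.h>
#include <cassert>

static float hsum_avx(__m256 v)
{
    // v   = (x0, x1, x2, x3, x4, x5, x6, x7)
    // tmp = (x4, x5, x6, x7, x0, x1, x2, x3): swap the 128-bit halves
    __m256 tmp = _mm256_permute2f128_ps(v, v, 1);
    // tmp = (x0+x4, ..., x3+x7, x4+x0, ..., x7+x3)
    tmp = _mm256_add_ps(v, tmp);
    // two in-lane horizontal adds collapse the four partial sums of lane 0
    tmp = _mm256_hadd_ps(tmp, tmp);
    tmp = _mm256_hadd_ps(tmp, tmp);
    return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
}

int main()
{
    __m256 v = _mm256_setr_ps(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
    assert(hsum_avx(v) == 36.f);
    return 0;
}
// ---------------------------------------------------------------------------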
template batch load_aligned(float const* mem, convert, requires_arch) { + return _mm256_load_ps(mem); + } + template batch load_aligned(double const* mem, convert, requires_arch) { + return _mm256_load_pd(mem); + } + + namespace detail + { + // load_complex + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + using batch_type = batch; + __m128 tmp0 = _mm256_extractf128_ps(hi, 0); + __m128 tmp1 = _mm256_extractf128_ps(hi, 1); + batch_type real, imag; + __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 0); + imag = _mm256_insertf128_ps(imag, tmp_imag, 0); + + tmp0 = _mm256_extractf128_ps(lo, 0); + tmp1 = _mm256_extractf128_ps(lo, 1); + tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 1); + imag = _mm256_insertf128_ps(imag, tmp_imag, 1); + return {real, imag}; + } + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + using batch_type = batch; + __m128d tmp0 = _mm256_extractf128_pd(hi, 0); + __m128d tmp1 = _mm256_extractf128_pd(hi, 1); + batch_type real, imag; + __m256d re_tmp0 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 0); + __m256d im_tmp0 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 0); + tmp0 = _mm256_extractf128_pd(lo, 0); + tmp1 = _mm256_extractf128_pd(lo, 1); + __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); + __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); + real = _mm256_blend_pd(re_tmp0, re_tmp1, 12); + imag = _mm256_blend_pd(im_tmp0, im_tmp1, 12); + return {real, imag}; + } + } + + // load_unaligned + template::value, void>::type> + batch load_unaligned(T const* mem, convert, requires_arch) { + return _mm256_loadu_si256((__m256i const*)mem); + } + template batch load_unaligned(float const* mem, convert, requires_arch){ + return _mm256_loadu_ps(mem); + } + template batch load_unaligned(double const* mem, convert, requires_arch){ + return _mm256_loadu_pd(mem); + } + + // lt + template batch_bool lt(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_LT_OQ); + } + template batch_bool lt(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_pd(self, other, _CMP_LT_OQ); + } + + template::value, void>::type> + batch_bool lt(batch const& self, batch const& other, requires_arch) { + return detail::fwd_to_sse([](__m128i s, __m128i o) { return lt(batch(s), batch(o)); }, self, other); + } + + // max + template batch max(batch const& self, batch const& other, requires_arch) { + return _mm256_max_ps(self, other); + } + template batch max(batch const& self, batch const& other, requires_arch) { + return _mm256_max_pd(self, other); + } + template::value, void>::type> + batch max(batch const& self, batch const& other, requires_arch) { + return select(self > other, self, other); + } + + // min + template batch min(batch const& self, batch const& other, requires_arch) { + return _mm256_min_ps(self, other); + } + template batch min(batch const& self, batch const& other, requires_arch) { + return _mm256_min_pd(self, other); + } + template::value, void>::type> + batch min(batch const& self, batch const& other, requires_arch) { + return select(self <= other, self, other); + } + + // mul + template batch mul(batch const& self, 
batch const& other, requires_arch) { + return _mm256_mul_ps(self, other); + } + template batch mul(batch const& self, batch const& other, requires_arch) { + return _mm256_mul_pd(self, other); + } + + // nearbyint + template batch nearbyint(batch const& self, requires_arch) { + return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); + } + template batch nearbyint(batch const& self, requires_arch) { + return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); + } + + // neg + template::value, void>::type> + batch neg(batch const& self, requires_arch) { + return 0 - self; + } + template batch neg(batch const& self, requires_arch) { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); + } + template + batch neg(batch const &self, requires_arch) { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); + } + + // neq + template batch_bool neq(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_ps(self, other, _CMP_NEQ_OQ); + } + template batch_bool neq(batch const& self, batch const& other, requires_arch) { + return _mm256_cmp_pd(self, other, _CMP_NEQ_OQ); + } + template::value, void>::type> + batch_bool neq(batch const& self, batch const& other, requires_arch) { + return ~(self == other); + } + + + template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_xor_ps(self, other); + } + template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_xor_pd(self, other); + } + template::value, void>::type> + batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { + return ~(self == other); + } + + // sadd + template batch sadd(batch const& self, batch const& other, requires_arch) { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template batch sadd(batch const& self, batch const& other, requires_arch) { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template::value, void>::type> + batch sadd(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // select + template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + return _mm256_blendv_ps(false_br, true_br, cond); + } + template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + return _mm256_blendv_pd(false_br, true_br, cond); + } + template::value, void>::type> + batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + __m128i cond_low, cond_hi; + detail::split_avx(cond, cond_low, cond_hi); + + __m128i true_low, true_hi; + detail::split_avx(true_br, true_low, true_hi); + + __m128i false_low, false_hi; + detail::split_avx(false_br, false_low, false_hi); + + __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2{}); + __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2{}); + return detail::merge_sse(res_low, res_hi); + } 
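// --- illustrative sketch (not part of the patch) --------------------------
// The signed branch of the integer sadd fallback above avoids overflow by
// clamping `self` against numeric_limits<T> before the addition, picking the
// clamp direction from the sign of `other` with a vector select. The scalar
// rendition below (sadd_scalar is a hypothetical helper, not an xsimd API)
// spells out the same reasoning so the saturation bounds are easy to verify.
#include <cassert>
#include <cstdint>
#include <limits>

template <class T>
T sadd_scalar(T self, T other)
{
    using limits = std::numeric_limits<T>;
    if (other >= 0) {
        // the sum would exceed max() once self > max() - other
        T bound = T(limits::max() - other);
        self = self < bound ? self : bound;
    } else {
        // symmetric clamp against min() when other is negative
        T bound = T(limits::min() - other);
        self = self > bound ? self : bound;
    }
    return T(self + other);
}

int main()
{
    assert(sadd_scalar<int8_t>(100, 100) == 127);    // saturates at +127
    assert(sadd_scalar<int8_t>(-100, -100) == -128); // saturates at -128
    assert(sadd_scalar<int8_t>(5, -3) == 2);         // in range: plain add
    return 0;
}
// ---------------------------------------------------------------------------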
+ template::value, void>::type> + batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { + return select(batch_bool{Values...}, true_br, false_br, avx2{}); + } + + + // set + template + batch set(batch const&, requires_arch, Values... values) { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_ps(values...); + } + + template + batch set(batch const&, requires_arch, Values... values) { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_pd(values...); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) { + return _mm256_set_epi64x(v3, v2, v1, v0); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31 ) { + return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + + template::value, void>::type> + batch_bool set(batch_bool const&, requires_arch, Values... values) { + return set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data; + } + + template + batch_bool set(batch_bool const&, requires_arch, Values... values) { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_ps(set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data); + } + + template + batch_bool set(batch_bool const&, requires_arch, Values... values) { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_pd(set(batch(), A{}, static_cast(values ? 
-1LL : 0LL )...).data); + } + + // sqrt + template batch sqrt(batch const& val, requires_arch) { + return _mm256_sqrt_ps(val); + } + template batch sqrt(batch const& val, requires_arch) { + return _mm256_sqrt_pd(val); + } + + // ssub + template batch ssub(batch const& self, batch const& other, requires_arch) { + return _mm256_sub_ps(self, other); // no saturated arithmetic on floating point numbers + } + template batch ssub(batch const& self, batch const& other, requires_arch) { + return _mm256_sub_pd(self, other); // no saturated arithmetic on floating point numbers + } + template::value, void>::type> + batch ssub(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + return sadd(self, -other); + } + else { + const auto diff = min(self, other); + return self - diff; + } + } + + // store_aligned + template::value, void>::type> + void store_aligned(T *mem, batch const& self, requires_arch) { + return _mm256_store_si256((__m256i *)mem, self); + } + template::value, void>::type> + void store_aligned(T *mem, batch_bool const& self, requires_arch) { + return _mm256_store_si256((__m256i *)mem, self); + } + template void store_aligned(float *mem, batch const& self, requires_arch) { + return _mm256_store_ps(mem, self); + } + template void store_aligned(double *mem, batch const& self, requires_arch) { + return _mm256_store_pd(mem, self); + } + + // store_unaligned + template::value, void>::type> + void store_unaligned(T *mem, batch const& self, requires_arch) { + return _mm256_storeu_si256((__m256i *)mem, self); + } + template::value, void>::type> + void store_unaligned(T *mem, batch_bool const& self, requires_arch) { + return _mm256_storeu_si256((__m256i *)mem, self); + } + template void store_unaligned(float *mem, batch const& self, requires_arch) { + return _mm256_storeu_ps(mem, self); + } + template void store_unaligned(double *mem, batch const& self, requires_arch) { + return _mm256_storeu_pd(mem, self); + } + + // sub + template::value, void>::type> + batch sub(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_sub_epi8(self, other); + case 2: return _mm256_sub_epi16(self, other); + case 4: return detail::fwd_to_sse([](__m128i s, __m128i o) { return sub(batch(s), batch(o)); }, self, other); + case 8: return _mm256_sub_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch sub(batch const& self, batch const& other, requires_arch) { + return _mm256_sub_ps(self, other); + } + template batch sub(batch const& self, batch const& other, requires_arch) { + return _mm256_sub_pd(self, other); + } + + // to_float + template + batch to_float(batch const& self, requires_arch) { + return _mm256_cvtepi32_ps(self); + } + template + batch to_float(batch const& self, requires_arch) { + // FIXME: call _mm_cvtepi64_pd + alignas(A::alignment()) int64_t buffer[batch::size]; + self.store_aligned(&buffer[0]); + return {(double)buffer[0], (double)buffer[1], (double)buffer[2], (double)buffer[3],}; + } + + // to_int + template + batch to_int(batch const& self, requires_arch) { + return _mm256_cvttps_epi32(self); + } + + template + batch to_int(batch const& self, requires_arch) { + // FIXME: call _mm_cvttpd_epi64 + alignas(A::alignment()) double buffer[batch::size]; + self.store_aligned(&buffer[0]); + return {(int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3]}; + } + + // trunc + template batch trunc(batch const& self, requires_arch) { + return 
_mm256_round_ps(self, _MM_FROUND_TO_ZERO); + } + template batch trunc(batch const& self, requires_arch) { + return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); + } + + // zip_hi + template::value, void>::type> + batch zip_hi(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_unpackhi_epi8(self, other); + case 2: return _mm256_unpackhi_epi16(self, other); + case 4: return _mm256_unpackhi_epi32(self, other); + case 8: return _mm256_unpackhi_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch zip_hi(batch const& self, batch const& other, requires_arch) { + return _mm256_unpackhi_ps(self, other); + } + template batch zip_hi(batch const& self, batch const& other, requires_arch) { + return _mm256_unpackhi_pd(self, other); + } + + // zip_lo + template::value, void>::type> + batch zip_lo(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_unpacklo_epi8(self, other); + case 2: return _mm256_unpacklo_epi16(self, other); + case 4: return _mm256_unpacklo_epi32(self, other); + case 8: return _mm256_unpacklo_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch zip_lo(batch const& self, batch const& other, requires_arch) { + return _mm256_unpacklo_ps(self, other); + } + template batch zip_lo(batch const& self, batch const& other, requires_arch) { + return _mm256_unpacklo_pd(self, other); + } + + } + +} + +#endif diff --git a/third_party/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/arch/xsimd_avx2.hpp new file mode 100644 index 0000000000..9e42f6a2f0 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_avx2.hpp @@ -0,0 +1,358 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_AVX2_HPP +#define XSIMD_AVX2_HPP + +#include +#include + +#include "../types/xsimd_avx2_register.hpp" + + +namespace xsimd { + + namespace kernel { + using namespace types; + + // abs + template::value, void>::type> + batch abs(batch const& self, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_abs_epi8(self); + case 2: return _mm256_abs_epi16(self); + case 4: return _mm256_abs_epi32(self); + default: return abs(self, avx{}); + } + } + return self; + } + + // add + template::value, void>::type> + batch add(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_add_epi8(self, other); + case 2: return _mm256_add_epi16(self, other); + case 4: return _mm256_add_epi32(self, other); + case 8: return _mm256_add_epi64(self, other); + default: return add(self, other, avx{}); + } + } + + // bitwise_and + template::value, void>::type> + batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm256_and_si256(self, other); + } + template::value, void>::type> + batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_and_si256(self, other); + } + + // bitwise_andnot + template::value, void>::type> + batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return _mm256_andnot_si256(self, other); + } + template::value, void>::type> + batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { + return _mm256_andnot_si256(self, other); + } + + // bitwise_lshift + template::value, void>::type> + batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { + switch(sizeof(T)) { + case 2: return _mm256_slli_epi16(self, other); + case 4: return _mm256_slli_epi32(self, other); + case 8: return _mm256_slli_epi64(self, other); + default: return bitwise_lshift(self, other, avx{}); + } + } + + template::value, void>::type> + batch bitwise_lshift(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 4: return _mm256_sllv_epi32(self, other); + case 8: return _mm256_sllv_epi64(self, other); + default: return bitwise_lshift(self, other, avx{}); + } + } + + // bitwise_rshift + template::value, void>::type> + batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 2: return _mm256_srai_epi16(self, other); + case 4: return _mm256_srai_epi32(self, other); + default: return bitwise_rshift(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 2: return _mm256_srli_epi16(self, other); + case 4: return _mm256_srli_epi32(self, other); + case 8: return _mm256_srli_epi64(self, other); + default: return bitwise_rshift(self, other, avx{}); + } + } + } + + template::value, void>::type> + batch bitwise_rshift(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 4: return _mm256_srav_epi32(self, other); + default: return bitwise_rshift(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 4: return _mm256_srlv_epi32(self, other); + case 8: return _mm256_srlv_epi64(self, other); + default: return bitwise_rshift(self, other, avx{}); + } + } + } + + // complex_low + template batch complex_low(batch, A> const& self, requires_arch) { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); + __m256d tmp1 
= _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // complex_high + template batch complex_high(batch, A> const& self, requires_arch) { + __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); + __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); + return _mm256_blend_pd(tmp0, tmp1, 10); + } + + // eq + template::value, void>::type> + batch_bool eq(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_cmpeq_epi8(self, other); + case 2: return _mm256_cmpeq_epi16(self, other); + case 4: return _mm256_cmpeq_epi32(self, other); + case 8: return _mm256_cmpeq_epi64(self, other); + default: return eq(self, other, avx{}); + } + } + + // gt + template::value, void>::type> + batch_bool gt(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_cmpgt_epi8(self, other); + case 2: return _mm256_cmpgt_epi16(self, other); + case 4: return _mm256_cmpgt_epi32(self, other); + case 8: return _mm256_cmpgt_epi64(self, other); + default: return gt(self, other, avx{}); + } + } + else { + return gt(self, other, avx{}); + } + } + + // hadd + template::value, void>::type> + T hadd(batch const& self, requires_arch) { + switch(sizeof(T)) { + case 4: + { + __m256i tmp1 = _mm256_hadd_epi32(self, self); + __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); + return _mm_cvtsi128_si32(tmp4); + } + case 8: + { + __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); + __m256i tmp2 = _mm256_add_epi64(self, tmp1); + __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); + __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(res); +#else + __m128i m; + _mm_storel_epi64(&m, res); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + default: return hadd(self, avx{}); + } + } + // load_complex + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + using batch_type = batch; + batch_type real = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0))); + batch_type imag = _mm256_castpd_ps( + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0))); + return {real, imag}; + } + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + using batch_type = batch; + batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); + return {real, imag}; + } + + // max + template::value, void>::type> + batch max(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_max_epi8(self, other); + case 2: return _mm256_max_epi16(self, other); + case 4: return _mm256_max_epi32(self, other); + default: return max(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm256_max_epu8(self, other); + case 2: return _mm256_max_epu16(self, other); + case 4: return _mm256_max_epu32(self, other); + default: return max(self, other, avx{}); + } + } + } + + // min + template::value, void>::type> + batch min(batch 
const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_min_epi8(self, other); + case 2: return _mm256_min_epi16(self, other); + case 4: return _mm256_min_epi32(self, other); + default: return min(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm256_min_epu8(self, other); + case 2: return _mm256_min_epu16(self, other); + case 4: return _mm256_min_epu32(self, other); + default: return min(self, other, avx{}); + } + } + } + + // mul + template::value, void>::type> + batch mul(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 2: return _mm256_mullo_epi16(self, other); + case 4: return _mm256_mullo_epi32(self, other); + default: return mul(self, other, avx{}); + } + } + + // sadd + template::value, void>::type> + batch sadd(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_adds_epi8(self, other); + case 2: return _mm256_adds_epi16(self, other); + default: return sadd(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm256_adds_epu8(self, other); + case 2: return _mm256_adds_epu16(self, other); + default: return sadd(self, other, avx{}); + } + } + } + + // select + template::value, void>::type> + batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_blendv_epi8(false_br, true_br, cond); + case 2: return _mm256_blendv_epi8(false_br, true_br, cond); + case 4: return _mm256_blendv_epi8(false_br, true_br, cond); + case 8: return _mm256_blendv_epi8(false_br, true_br, cond); + default: return select(cond, true_br, false_br, avx{}); + } + } + template::value, void>::type> + batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { + constexpr int mask = batch_bool_constant, Values...>::mask(); + switch(sizeof(T)) { + // FIXME: for some reason mask here is not considered as an immediate, + // but it's okay for _mm256_blend_epi32 + //case 2: return _mm256_blend_epi16(false_br, true_br, mask); + case 4: return _mm256_blend_epi32(false_br, true_br, mask); + case 8: { + constexpr int imask = detail::interleave(mask); + return _mm256_blend_epi32(false_br, true_br, imask); + } + default: return select(batch_bool{Values...}, true_br, false_br, avx2{}); + } + } + + // ssub + template::value, void>::type> + batch ssub(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm256_subs_epi8(self, other); + case 2: return _mm256_subs_epi16(self, other); + default: return ssub(self, other, avx{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm256_subs_epu8(self, other); + case 2: return _mm256_subs_epu16(self, other); + default: return ssub(self, other, avx{}); + } + } + } + + // sub + template::value, void>::type> + batch sub(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm256_sub_epi8(self, other); + case 2: return _mm256_sub_epi16(self, other); + case 4: return _mm256_sub_epi32(self, other); + case 8: return _mm256_sub_epi64(self, other); + default: return sub(self, other, avx{}); + } + } + + + + } + +} + +#endif diff --git a/third_party/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/arch/xsimd_avx512bw.hpp new file mode 100644 index 0000000000..e33b426666 --- /dev/null +++ 
b/third_party/xsimd/arch/xsimd_avx512bw.hpp @@ -0,0 +1,276 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_AVX512BW_HPP +#define XSIMD_AVX512BW_HPP + +#include + +#include "../types/xsimd_avx512bw_register.hpp" + +namespace xsimd { + + namespace kernel { + using namespace types; + + namespace detail { + template + batch_bool compare_int_avx512bw(batch const& self, batch const& other) { + using register_type = typename batch_bool::register_type; + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); + case 2: return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); + case 4: return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + case 8: return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else { + switch(sizeof(T)) { + case 1: return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); + case 2: return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); + case 4: return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + case 8: return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template::value, void>::type> + batch abs(batch const& self, requires_arch) { + if(std::is_unsigned::value) + return self; + + switch(sizeof(T)) { + case 1: return _mm512_abs_epi8(self); + case 2: return _mm512_abs_epi16(self); + default: return abs(self, avx512dq{}); + } + } + + // add + template::value, void>::type> + batch add(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_add_epi8(self, other); + case 2: return _mm512_add_epi16(self, other); + default: return add(self, other, avx512dq{}); + } + } + + // bitwise_lshift + template::value, void>::type> + batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { + switch(sizeof(T)) { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 2: return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); +#else + case 2: return _mm512_slli_epi16(self, other); +#endif + default: return bitwise_lshift(self, other, avx512dq{}); + } + } + + // bitwise_rshift + template::value, void>::type> + batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: + { + __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); + __m512i zeros = _mm512_setzero_si512(); + __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); + __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + __m512i res = _mm512_srai_epi16(self, other); +#endif + return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); + } +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 2: return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); +#else + case 2: return _mm512_srai_epi16(self, other); +#endif + default: return bitwise_rshift(self, other, avx512dq{}); + } + } + else { + 
switch(sizeof(T)) { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 2: return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); +#else + case 2: return _mm512_srli_epi16(self, other); +#endif + default: return bitwise_rshift(self, other, avx512dq{}); + } + } + } + + // eq + template::value, void>::type> + batch_bool eq(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + // ge + template::value, void>::type> + batch_bool ge(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + // gt + template::value, void>::type> + batch_bool gt(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + + // le + template::value, void>::type> + batch_bool le(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + // lt + template::value, void>::type> + batch_bool lt(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + // max + template::value, void>::type> + batch max(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm512_max_epi8(self, other); + case 2: return _mm512_max_epi16(self, other); + default: return max(self, other, avx512dq{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm512_max_epu8(self, other); + case 2: return _mm512_max_epu16(self, other); + default: return max(self, other, avx512dq{}); + } + } + } + + // min + template::value, void>::type> + batch min(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm512_min_epi8(self, other); + case 2: return _mm512_min_epi16(self, other); + default: return min(self, other, avx512dq{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm512_min_epu8(self, other); + case 2: return _mm512_min_epu16(self, other); + default: return min(self, other, avx512dq{}); + } + } + } + + + // mul + template::value, void>::type> + batch mul(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: { + __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); + __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); + return _mm512_or_si512(upper, lower); + } + case 2: return _mm512_mullo_epi16(self, other); + default: return mul(self, other, avx512dq{}); + } + } + + + // neq + template::value, void>::type> + batch_bool neq(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512bw(self, other); + } + + // sadd + template::value, void>::type> + batch sadd(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm512_adds_epi8(self, other); + case 2: return _mm512_adds_epi16(self, other); + default: return sadd(self, other, avx512dq{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm512_adds_epu8(self, other); + case 2: return _mm512_adds_epu16(self, other); + default: return sadd(self, other, avx512dq{}); + } + } + } + + // select + template::value, void>::type> + batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_mask_blend_epi8(cond, 
false_br, true_br); + case 2: return _mm512_mask_blend_epi16(cond, false_br, true_br); + default: return select(cond, true_br, false_br, avx512dq{}); + }; + } + + + // ssub + template::value, void>::type> + batch ssub(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: return _mm512_subs_epi8(self, other); + case 2: return _mm512_subs_epi16(self, other); + default: return ssub(self, other, avx512dq{}); + } + } + else { + switch(sizeof(T)) { + case 1: return _mm512_subs_epu8(self, other); + case 2: return _mm512_subs_epu16(self, other); + default: return ssub(self, other, avx512dq{}); + } + } + } + + + // sub + template::value, void>::type> + batch sub(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_sub_epi8(self, other); + case 2: return _mm512_sub_epi16(self, other); + default: return sub(self, other, avx512dq{}); + } + } + + } + +} + +#endif diff --git a/third_party/xsimd/math/xsimd_math.hpp b/third_party/xsimd/arch/xsimd_avx512cd.hpp similarity index 63% rename from third_party/xsimd/math/xsimd_math.hpp rename to third_party/xsimd/arch/xsimd_avx512cd.hpp index cedb00e018..b426adbb2e 100644 --- a/third_party/xsimd/math/xsimd_math.hpp +++ b/third_party/xsimd/arch/xsimd_avx512cd.hpp @@ -2,25 +2,25 @@ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#ifndef XSIMD_MATH_HPP -#define XSIMD_MATH_HPP +#ifndef XSIMD_AVX512CD_HPP +#define XSIMD_AVX512CD_HPP -#include "xsimd_basic_math.hpp" -#include "xsimd_error.hpp" -#include "xsimd_exponential.hpp" -#include "xsimd_fp_manipulation.hpp" -#include "xsimd_gamma.hpp" -#include "xsimd_hyperbolic.hpp" -#include "xsimd_logarithm.hpp" -#include "xsimd_power.hpp" -#include "xsimd_rounding.hpp" -#include "xsimd_trigonometric.hpp" -#include "xsimd/types/xsimd_scalar.hpp" +#include "../types/xsimd_avx512cd_register.hpp" + +namespace xsimd { + + namespace kernel { + // Nothing there yet. + + } + +} #endif diff --git a/third_party/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/arch/xsimd_avx512dq.hpp new file mode 100644 index 0000000000..00fe1f5494 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_avx512dq.hpp @@ -0,0 +1,75 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
*
+****************************************************************************/
+
+#ifndef XSIMD_AVX512_DQHPP
+#define XSIMD_AVX512_D_HPP
+
+#include "../types/xsimd_avx512dq_register.hpp"
+
+namespace xsimd {
+
+  namespace kernel {
+    using namespace types;
+
+    // bitwise_and
+    template <class A> batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_and_ps(self, other);
+    }
+    template <class A> batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_and_pd(self, other);
+    }
+
+    // bitwise_andnot
+    template <class A> batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_andnot_ps(self, other);
+    }
+    template <class A> batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_andnot_pd(self, other);
+    }
+
+    // bitwise_or
+    template <class A> batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_or_ps(self, other);
+    }
+    template <class A> batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_or_pd(self, other);
+    }
+
+    template <class A, class T> batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) {
+      using register_type = typename batch_bool<T, A>::register_type;
+      return register_type(self.data | other.data);
+    }
+
+    // bitwise_xor
+    template <class A> batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_xor_ps(self, other);
+    }
+    template <class A> batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) {
+      return _mm512_xor_pd(self, other);
+    }
+
+    // to_float
+    template <class A>
+    batch<double, A> to_float(batch<int64_t, A> const& self, requires_arch<avx512dq>) {
+      return _mm512_cvtepi64_pd(self);
+    }
+
+    // to_int
+    template <class A>
+    batch<int64_t, A> to_int(batch<double, A> const& self, requires_arch<avx512dq>) {
+      return _mm512_cvttpd_epi64(self);
+    }
+
+  }
+
+}
+
+#endif
diff --git a/third_party/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/arch/xsimd_avx512f.hpp
new file mode 100644
index 0000000000..13cc0da714
--- /dev/null
+++ b/third_party/xsimd/arch/xsimd_avx512f.hpp
@@ -0,0 +1,1230 @@
+/***************************************************************************
+* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+* Martin Renou                                                             *
+* Copyright (c) QuantStack                                                 *
+* Copyright (c) Serge Guelton                                              *
+*                                                                          *
+* Distributed under the terms of the BSD 3-Clause License.                 *
+*                                                                          *
+* The full license is in the file LICENSE, distributed with this software.
* +****************************************************************************/ + +#ifndef XSIMD_AVX512F_HPP +#define XSIMD_AVX512F_HPP + +#include +#include +#include + +#include "../types/xsimd_avx512f_register.hpp" + +namespace xsimd { + + namespace kernel { + using namespace types; + + namespace detail { + inline void split_avx512(__m512 val, __m256& low, __m256& high) { + low =_mm512_castps512_ps256(val); + high =_mm512_extractf32x8_ps(val, 1); + } + inline void split_avx512(__m512d val, __m256d& low, __m256d& high) { + low =_mm512_castpd512_pd256(val); + high =_mm512_extractf64x4_pd(val, 1); + } + inline void split_avx512(__m512i val, __m256i& low, __m256i& high) { + low =_mm512_castsi512_si256(val); + high =_mm512_extracti64x4_epi64(val, 1); + } + inline __m512i merge_avx(__m256i low, __m256i high) { + return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); + } + inline __m512 merge_avx(__m256 low, __m256 high) { + return _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1); + } + inline __m512d merge_avx(__m256d low, __m256d high) { + return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); + } + template + __m512i fwd_to_avx(F f, __m512i self) { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low); + __m256i res_high = f(self_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, __m512i other) { + __m256i self_low, self_high, other_low, other_high; + split_avx512(self, self_low, self_high); + split_avx512(other, other_low, other_high); + __m256i res_low = f(self_low, other_low); + __m256i res_high = f(self_high, other_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, int32_t other) { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low, other); + __m256i res_high = f(self_high, other); + return merge_avx(res_low, res_high); + } + } + namespace detail { + + inline uint32_t morton(uint16_t x, uint16_t y) { + + static const unsigned short MortonTable256[256] = + { + 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, + 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, + 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, + 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, + 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, + 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, + 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, + 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, + 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, + 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, + 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, + 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, + 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, + 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, + 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, + 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, + 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, + 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, + 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, + 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, + 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, + 0x4440, 0x4441, 0x4444, 0x4445, 
0x4450, 0x4451, 0x4454, 0x4455, + 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, + 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, + 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, + 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, + 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, + 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, + 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, + 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, + 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, + 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 + }; + + uint32_t z = MortonTable256[y >> 8] << 17 | + MortonTable256[x >> 8] << 16 | + MortonTable256[y & 0xFF] << 1 | + MortonTable256[x & 0xFF]; + return z; + } + + template + batch_bool compare_int_avx512f(batch const& self, batch const& other) { + using register_type = typename batch_bool::register_type; + if(std::is_signed::value) { + switch(sizeof(T)) { + case 1: { + // shifting to take sign into account + uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for(unsigned i = 0; i < 16; ++i) { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + case 2: { + // shifting to take sign into account + uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + case 4: return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + case 8: return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else { + switch(sizeof(T)) { + case 1: { + uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for(unsigned i = 0; i < 16; ++i) { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + case 2: { + uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & 
batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + case 4: return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + case 8: return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // abs + template batch abs(batch const& self, requires_arch) { + __m512 self_asf = (__m512)self; + __m512i self_asi = *reinterpret_cast<__m512i *>(&self_asf); + __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); + return *reinterpret_cast<__m512*>(&res_asi); + } + template batch abs(batch const& self, requires_arch) { + __m512d self_asd = (__m512d)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); + __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + self_asi); + return *reinterpret_cast<__m512d*>(&res_asi); + } + template::value, void>::type> + batch abs(batch const& self, requires_arch) { + if(std::is_unsigned::value) + return self; + + switch(sizeof(T)) { + case 1: return detail::fwd_to_avx([](__m256i s) { return abs(batch(s)); }, self); + case 2: return detail::fwd_to_avx([](__m256i s) { return abs(batch(s)); }, self); + case 4: return _mm512_abs_epi32(self); + case 8: return _mm512_abs_epi64(self); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + + // add + template::value, void>::type> + batch add(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return detail::fwd_to_avx([](__m256i s, __m256i o) { return add(batch(s), batch(o)); }, self, other); + case 2: return detail::fwd_to_avx([](__m256i s, __m256i o) { return add(batch(s), batch(o)); }, self, other); + case 4: return _mm512_add_epi32(self, other); + case 8: return _mm512_add_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch add(batch const& self, batch const& other, requires_arch) { + return _mm512_add_ps(self, other); + } + template batch add(batch const& self, batch const& other, requires_arch) { + return _mm512_add_pd(self, other); + } + + // all + template + bool all(batch_bool const& self, requires_arch) { + using register_type = typename batch_bool::register_type; + return self.data == register_type(-1); + } + + // any + template + bool any(batch_bool const& self, requires_arch) { + using register_type = typename batch_bool::register_type; + return self.data != register_type(0); + } + + // bitwise_and + template batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template::value, void>::type> + batch bitwise_and(batch const& self, batch const& other, requires_arch) { + return _mm512_and_si512(self, other); + } + + template + batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return 
_mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template::value, void>::type> + batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { + return _mm512_andnot_si512(self, other); + } + + template + batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_lshift + template::value, void>::type> + batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { + switch(sizeof(T)) { + case 1: { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_slli_epi32(self, other); +#endif + return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); + } + case 2: return detail::fwd_to_avx([](__m256i s, int32_t o) { return bitwise_lshift(batch(s), o, avx2{}); }, self, other); +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 4: return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); + case 8: return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); +#else + case 4: return _mm512_slli_epi32(self, other); + case 8: return _mm512_slli_epi64(self, other); +#endif + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + + // bitwise_not + template::value, void>::type> + batch bitwise_not(batch const& self, requires_arch) { + return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); + } + template + batch_bool bitwise_not(batch_bool const& self, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + template batch bitwise_not(batch const& self, requires_arch) { + return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); + } + template + batch bitwise_not(batch const &self, requires_arch) { + return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); + } + + // bitwise_or + template batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template::value, void>::type> + batch bitwise_or(batch const& self, batch const& other, requires_arch) { + return _mm512_or_si512(self, other); + } + + // bitwise_rshift + template::value, void>::type> + batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 4: return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); + case 8: return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); +#else + case 4: return _mm512_srai_epi32(self, other); + case 8: return _mm512_srai_epi64(self, other); +#endif + default: return detail::fwd_to_avx([](__m256i s, int32_t o) { return 
bitwise_rshift(batch(s), o, avx2{}); }, self, other); + } + } + else { + switch(sizeof(T)) { + case 1: + { +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); +#else + __m512i tmp = _mm512_srli_epi32(self, other); +#endif + return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); + } +#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) + case 4: return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); + case 8: return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); +#else + case 4: return _mm512_srli_epi32(self, other); + case 8: return _mm512_srli_epi64(self, other); +#endif + default: return detail::fwd_to_avx([](__m256i s, int32_t o) { return bitwise_rshift(batch(s), o, avx2{}); }, self, other); + } + } + } + + // bitwise_xor + template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } + + template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + template::value, void>::type> + batch bitwise_xor(batch const& self, batch const& other, requires_arch) { + return _mm512_xor_si512(self, other); + } + + // bitwise_cast + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castsi512_ps(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castsi512_pd(self); + } + template::type>::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return batch(self.data); + } + template + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castps_pd(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castps_si512(self); + } + template + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castpd_ps(self); + } + template::value, void>::type> + batch bitwise_cast(batch const& self, batch const &, requires_arch) { + return _mm512_castpd_si512(self); + } + + // bool_cast + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return self.data; + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return self.data; + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return self.data; + } + template batch_bool bool_cast(batch_bool const& self, requires_arch) { + return self.data; + } + + + // broadcast + template::value, void>::type> + batch broadcast(T val, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_set1_epi8(val); + case 2: return _mm512_set1_epi16(val); + case 4: return _mm512_set1_epi32(val); + case 8: return _mm512_set1_epi64(val); + default: assert(false && "unsupported"); return {}; + } + } + template batch broadcast(float val, requires_arch) { + return _mm512_set1_ps(val); + } + template batch broadcast(double val, requires_arch) { + return _mm512_set1_pd(val); + } + + // ceil + template batch ceil(batch const& self, requires_arch) { + return 
_mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); + } + template batch ceil(batch const& self, requires_arch) { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); + } + + + namespace detail + { + // complex_low + template batch complex_low(batch, A> const& self, requires_arch) { + __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template batch complex_low(batch, A> const& self, requires_arch) { + __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + + // complex_high + template batch complex_high(batch, A> const& self, requires_arch) { + __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template batch complex_high(batch, A> const& self, requires_arch) { + __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + } + + // convert + namespace detail { + template batch fast_cast(batch const& self, batch const&, requires_arch) { + return _mm512_cvtepi32_ps(self); + } + template batch fast_cast(batch const& self, batch const&, requires_arch) { + return _mm512_cvttps_epi32(self); + } + } + + // div + template batch div(batch const& self, batch const& other, requires_arch) { + return _mm512_div_ps(self, other); + } + template batch div(batch const& self, batch const& other, requires_arch) { + return _mm512_div_pd(self, other); + } + + + // eq + template batch_bool eq(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template batch_bool eq(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template::value, void>::type> + batch_bool eq(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512f(self, other); + } + template + batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // floor + template batch floor(batch const& self, requires_arch) { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); + } + template batch floor(batch const& self, requires_arch) { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); + } + + // from bool + template + batch from_bool(batch_bool const& self, requires_arch) { + return select(self, batch(1), batch(0)); + } + + // ge + template batch_bool ge(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template batch_bool ge(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + template::value, void>::type> + batch_bool ge(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512f(self, other); + } + + // gt + template batch_bool gt(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template batch_bool gt(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template::value, void>::type> + batch_bool gt(batch const& self, batch const& other, requires_arch) { + return 
detail::compare_int_avx512f(self, other); + } + + + // hadd + template float hadd(batch const& rhs, requires_arch) { + __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); + __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); + __m256 res1 = _mm256_add_ps(tmp1, tmp2); + return hadd(batch(res1), avx2{}); + + } + template + double hadd(batch const &rhs, requires_arch) { + __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); + __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); + __m256d res1 = _mm256_add_pd(tmp1, tmp2); + return hadd(batch(res1), avx2{}); + } + template::value, void>::type> + T hadd(batch const& self, requires_arch) { + __m256i low, high; + detail::split_avx512(self, low, high); + batch blow(low), bhigh(high); + return hadd(blow, avx2{}) + hadd(bhigh, avx2{}); + } + + // haddp + template batch haddp(batch const* row, requires_arch) { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res ## I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res ## I = _mm512_add_ps(tmp1, tmp2); \ + } \ + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + +#undef XSIMD_AVX512_HADDP_STEP1 + + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... 
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx ## I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ + _mm512_extractf32x8_ps(resx3, 1)); \ + } \ + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + +#undef XSIMD_AVX512_HADDP_STEP2 + + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_insertf32x8(concat, halfx1, 1); + return concat; + } + template + batch haddp(batch const *row, requires_arch) { +#define step1(I, a, b) \ + batch res ## I; \ + { \ + auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res ## I = _mm512_add_pd(tmp1, tmp2); \ + } \ + + step1(1, row[0], row[2]); + step1(2, row[4], row[6]); + step1(3, row[1], row[3]); + step1(4, row[5], row[7]); + +#undef step1 + + auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx1 = _mm512_add_pd(tmp5, tmp6); + + auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); + + auto resx2 = _mm512_add_pd(tmp7, tmp8); + + auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); + auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); + + return _mm512_add_pd(tmpx, tmpy); + } + + // isnan + template batch_bool isnan(batch const& self, requires_arch) { + return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template batch_bool isnan(batch const& self, requires_arch) { + return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // le + template batch_bool le(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template batch_bool le(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + template::value, void>::type> + batch_bool le(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512f(self, other); + } + + + // load_aligned + template::value, void>::type> + batch load_aligned(T const* mem, convert, requires_arch) { + return _mm512_load_si512((__m512i const*)mem); + } + template batch load_aligned(float const* mem, convert, requires_arch) { + return _mm512_load_ps(mem); + } + template batch load_aligned(double const* mem, convert, requires_arch) { + return _mm512_load_pd(mem); + } + + // load_complex + namespace detail + { + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + auto real = _mm512_permutex2var_ps(hi, real_idx, lo); + auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); + return 
{real, imag}; + } + template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); + __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); + auto real = _mm512_permutex2var_pd(hi, real_idx, lo); + auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); + return {real, imag}; + } + } + + // load_unaligned + template::value, void>::type> + batch load_unaligned(T const* mem, convert, requires_arch) { + return _mm512_loadu_si512((__m512i const*)mem); + } + template batch load_unaligned(float const* mem, convert, requires_arch){ + return _mm512_loadu_ps(mem); + } + template batch load_unaligned(double const* mem, convert, requires_arch){ + return _mm512_loadu_pd(mem); + } + + // lt + template batch_bool lt(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template batch_bool lt(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + + template::value, void>::type> + batch_bool lt(batch const& self, batch const& other, requires_arch) { + return detail::compare_int_avx512f(self, other); + } + + // max + template batch max(batch const& self, batch const& other, requires_arch) { + return _mm512_max_ps(self, other); + } + template batch max(batch const& self, batch const& other, requires_arch) { + return _mm512_max_pd(self, other); + } + template::value, void>::type> + batch max(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 4: return _mm512_max_epi32(self, other); + case 8: return _mm512_max_epi64(self, other); + default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return max(batch(s), batch(o)); }, self, other); + } + } + else { + switch(sizeof(T)) { + case 4: return _mm512_max_epu32(self, other); + case 8: return _mm512_max_epu64(self, other); + default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return max(batch(s), batch(o)); }, self, other); + } + } + } + + // min + template batch min(batch const& self, batch const& other, requires_arch) { + return _mm512_min_ps(self, other); + } + template batch min(batch const& self, batch const& other, requires_arch) { + return _mm512_min_pd(self, other); + } + template::value, void>::type> + batch min(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + switch(sizeof(T)) { + case 4: return _mm512_min_epi32(self, other); + case 8: return _mm512_min_epi64(self, other); + default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return min(batch(s), batch(o)); }, self, other); + } + } + else { + switch(sizeof(T)) { + case 4: return _mm512_min_epu32(self, other); + case 8: return _mm512_min_epu64(self, other); + default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return min(batch(s), batch(o)); }, self, other); + } + } + } + + // mul + template batch mul(batch const& self, batch const& other, requires_arch) { + return _mm512_mul_ps(self, other); + } + template batch mul(batch const& self, batch const& other, requires_arch) { + return _mm512_mul_pd(self, other); + } + template::value, void>::type> + batch mul(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 4: return _mm512_mullo_epi32(self, other); + case 8: return _mm512_mullo_epi64(self, other); + default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return mul(batch(s), batch(o)); }, self, other); + } + } 
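AVX512F itself has no 8- or 16-bit lane arithmetic (that needs AVX512BW), so the integer kernels above keep falling back to detail::fwd_to_avx: split the 512-bit register into two 256-bit halves, run the AVX2 kernel on each half, then merge. A standalone illustration of that pattern for a 16-bit add, with a hypothetical function name and assuming AVX512F plus AVX2:

    #include <immintrin.h>

    // Sketch of the split/apply/merge fallback used by fwd_to_avx above.
    inline __m512i add_epi16_via_avx2(__m512i a, __m512i b) {
        __m256i a_lo = _mm512_castsi512_si256(a);        // lower 256 bits
        __m256i a_hi = _mm512_extracti64x4_epi64(a, 1);  // upper 256 bits
        __m256i b_lo = _mm512_castsi512_si256(b);
        __m256i b_hi = _mm512_extracti64x4_epi64(b, 1);
        __m256i r_lo = _mm256_add_epi16(a_lo, b_lo);     // AVX2 provides 16-bit adds
        __m256i r_hi = _mm256_add_epi16(a_hi, b_hi);
        return _mm512_inserti64x4(_mm512_castsi256_si512(r_lo), r_hi, 1);
    }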
+ + // nearbyint + template batch nearbyint(batch const& self, requires_arch) { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + template batch nearbyint(batch const& self, requires_arch) { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + + // neg + template + batch neg(batch const& self, requires_arch) { + return 0 - self; + } + + // neq + template batch_bool neq(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template batch_bool neq(batch const& self, batch const& other, requires_arch) { + return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + template::value, void>::type> + batch_bool neq(batch const& self, batch const& other, requires_arch) { + return ~(self == other); + } + + template + batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // sadd + template batch sadd(batch const& self, batch const& other, requires_arch) { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template batch sadd(batch const& self, batch const& other, requires_arch) { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template::value, void>::type> + batch sadd(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + // select + template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + return _mm512_mask_blend_ps(cond, false_br, true_br); + } + template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + return _mm512_mask_blend_pd(cond, false_br, true_br); + } + + template::value, void>::type> + batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { + switch(sizeof(T)) { + case 1: { + alignas(avx2::alignment()) uint8_t buffer[64]; + // FIXME: ultra inefficient + for(int i =0; i < 64; ++i) + buffer[i] = cond.data & ((uint64_t)1 << i) ? 
0xFF : 0; + __m256i cond_low = batch::load_aligned(&buffer[0]); + __m256i cond_hi = batch::load_aligned(&buffer[32]); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2{}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2{}); + return detail::merge_avx(res_low, res_hi); + } + case 2: { + __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); + __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2{}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2{}); + return detail::merge_avx(res_low, res_hi); + } + case 4: return _mm512_mask_blend_epi32(cond, false_br, true_br); + case 8: return _mm512_mask_blend_epi64(cond, false_br, true_br); + default: assert(false && "unsupported arch/type combination"); return {}; + }; + } + template::value, void>::type> + batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { + return select(batch_bool{Values...}, true_br, false_br, avx512f{}); + } + + + namespace detail + { + template + using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, + int>::type; + + template + using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, + int>::type; + } + + // set + template + batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) { + return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) { + return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); + } + template::value, void>::type> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template = 0> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) { +#if defined(__clang__) || __GNUC__ + return __extension__ (__m512i)(__v32hi) + { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, 
v31); +#endif + } + template = 0> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) { +#if defined(__clang__) || __GNUC__ + return __extension__ (__m512i)(__v32hu) + { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; +#else + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); +#endif + } + template = 0> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63 + ) { + +#if defined(__clang__) || __GNUC__ + return __extension__ (__m512i)(__v64qi) + { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + template = 0> + batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63 + ) { + +#if defined(__clang__) || __GNUC__ + return __extension__ (__m512i)(__v64qu) + { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; +#else + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); +#endif + } + template + batch_bool set(batch_bool const&, requires_arch, Values... 
values) { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list{(r|=register_type(values?1:0) << (shift++))...}; + return r; + } + + // sqrt + template batch sqrt(batch const& val, requires_arch) { + return _mm512_sqrt_ps(val); + } + template batch sqrt(batch const& val, requires_arch) { + return _mm512_sqrt_pd(val); + } + + // ssub + template batch ssub(batch const& self, batch const& other, requires_arch) { + return _mm512_sub_ps(self, other); // no saturated arithmetic on floating point numbers + } + template batch ssub(batch const& self, batch const& other, requires_arch) { + return _mm512_sub_pd(self, other); // no saturated arithmetic on floating point numbers + } + template::value, void>::type> + batch ssub(batch const& self, batch const& other, requires_arch) { + if(std::is_signed::value) { + return sadd(self, -other); + } + else { + const auto diff = min(self, other); + return self - diff; + } + } + + // store + template + void store(batch_bool const& self, bool* mem, requires_arch) { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + for(std::size_t i = 0; i < size; ++i) + mem[i] = self.data & (register_type(1) << i); + } + + // store_aligned + template::value, void>::type> + void store_aligned(T *mem, batch const& self, requires_arch) { + return _mm512_store_si512((__m512i *)mem, self); + } + template::value, void>::type> + void store_aligned(T *mem, batch_bool const& self, requires_arch) { + return _mm512_store_si512((__m512i *)mem, self); + } + template void store_aligned(float *mem, batch const& self, requires_arch) { + return _mm512_store_ps(mem, self); + } + template void store_aligned(double *mem, batch const& self, requires_arch) { + return _mm512_store_pd(mem, self); + } + + // store_unaligned + template::value, void>::type> + void store_unaligned(T *mem, batch const& self, requires_arch) { + return _mm512_storeu_si512((__m512i *)mem, self); + } + template::value, void>::type> + void store_unaligned(T *mem, batch_bool const& self, requires_arch) { + return _mm512_storeu_si512((__m512i *)mem, self); + } + template void store_unaligned(float *mem, batch const& self, requires_arch) { + return _mm512_storeu_ps(mem, self); + } + template void store_unaligned(double *mem, batch const& self, requires_arch) { + return _mm512_storeu_pd(mem, self); + } + + // sub + template::value, void>::type> + batch sub(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return detail::fwd_to_avx([](__m256i s, __m256i o) { return sub(batch(s), batch(o)); }, self, other); + case 2: return detail::fwd_to_avx([](__m256i s, __m256i o) { return sub(batch(s), batch(o)); }, self, other); + case 4: return _mm512_sub_epi32(self, other); + case 8: return _mm512_sub_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch sub(batch const& self, batch const& other, requires_arch) { + return _mm512_sub_ps(self, other); + } + template batch sub(batch const& self, batch const& other, requires_arch) { + return _mm512_sub_pd(self, other); + } + + // to_float + template + batch to_float(batch const& self, requires_arch) { + return _mm512_cvtepi32_ps(self); + } + template + batch to_float(batch const& self, requires_arch) { + // FIXME: call _mm_cvtepi64_pd + alignas(A::alignment()) int64_t buffer[batch::size]; + 
self.store_aligned(&buffer[0]); + return {(double)buffer[0], (double)buffer[1], (double)buffer[2], (double)buffer[3], + (double)buffer[4], (double)buffer[5], (double)buffer[6], (double)buffer[7]}; + } + + // to_int + template + batch to_int(batch const& self, requires_arch) { + return _mm512_cvttps_epi32(self); + } + + template + batch to_int(batch const& self, requires_arch) { + // FIXME: call _mm_cvttpd_epi64 + alignas(A::alignment()) double buffer[batch::size]; + self.store_aligned(&buffer[0]); + return {(int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3], + (int64_t)buffer[4], (int64_t)buffer[5], (int64_t)buffer[6], (int64_t)buffer[7]}; + } + + // trunc + template batch trunc(batch const& self, requires_arch) { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + template batch trunc(batch const& self, requires_arch) { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + + // zip_hi + template::value, void>::type> + batch zip_hi(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_unpackhi_epi8(self, other); + case 2: return _mm512_unpackhi_epi16(self, other); + case 4: return _mm512_unpackhi_epi32(self, other); + case 8: return _mm512_unpackhi_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch zip_hi(batch const& self, batch const& other, requires_arch) { + return _mm512_unpackhi_ps(self, other); + } + template batch zip_hi(batch const& self, batch const& other, requires_arch) { + return _mm512_unpackhi_pd(self, other); + } + + // zip_lo + template::value, void>::type> + batch zip_lo(batch const& self, batch const& other, requires_arch) { + switch(sizeof(T)) { + case 1: return _mm512_unpacklo_epi8(self, other); + case 2: return _mm512_unpacklo_epi16(self, other); + case 4: return _mm512_unpacklo_epi32(self, other); + case 8: return _mm512_unpacklo_epi64(self, other); + default: assert(false && "unsupported arch/op combination"); return {}; + } + } + template batch zip_lo(batch const& self, batch const& other, requires_arch) { + return _mm512_unpacklo_ps(self, other); + } + template batch zip_lo(batch const& self, batch const& other, requires_arch) { + return _mm512_unpacklo_pd(self, other); + } + + } + +} + +#endif diff --git a/third_party/xsimd/math/xsimd_numerical_constant.hpp b/third_party/xsimd/arch/xsimd_constants.hpp similarity index 92% rename from third_party/xsimd/math/xsimd_numerical_constant.hpp rename to third_party/xsimd/arch/xsimd_constants.hpp index 981945b9be..a9ff97feb2 100644 --- a/third_party/xsimd/math/xsimd_numerical_constant.hpp +++ b/third_party/xsimd/arch/xsimd_constants.hpp @@ -2,6 +2,7 @@ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * @@ -13,43 +14,45 @@ #include -#include "../types/xsimd_types_include.hpp" +#include "../types/xsimd_utils.hpp" namespace xsimd { +namespace constants { + #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ - constexpr T NAME() noexcept \ + inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ - constexpr float NAME() noexcept \ + inline float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ - constexpr double NAME() noexcept \ + inline double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ - constexpr T NAME() noexcept \ + inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ - constexpr float NAME() noexcept \ + inline float NAME() noexcept \ { \ - return detail::caster32_t(uint32_t(SINGLE)).f; \ + return bit_cast((uint32_t)SINGLE); \ } \ template <> \ - constexpr double NAME() noexcept \ + inline double NAME() noexcept \ { \ - return detail::caster64_t(uint64_t(DOUBLE)).f; \ + return bit_cast((uint64_t)DOUBLE); \ } XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) @@ -91,7 +94,7 @@ namespace xsimd XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) - XSIMD_DEFINE_CONSTANT(smallestposval, 1.1754944e-38f, 2.225073858507201e-308) + XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) @@ -326,18 +329,18 @@ namespace xsimd template <> struct minvalue_impl { - static constexpr float get_value() noexcept + static float get_value() noexcept { - return detail::caster32_t(uint32_t(0xff7fffff)).f; + return bit_cast((uint32_t)0xff7fffff); } }; template <> struct minvalue_impl { - static constexpr double get_value() noexcept + static double get_value() noexcept { - return detail::caster64_t(uint64_t(0xffefffffffffffff)).f; + return bit_cast((uint64_t)0xffefffffffffffff); } }; } @@ -357,7 +360,9 @@ namespace xsimd { return T(std::numeric_limits::max()); } - +} + } #endif + diff --git a/third_party/xsimd/arch/xsimd_fma3.hpp b/third_party/xsimd/arch/xsimd_fma3.hpp new file mode 100644 index 0000000000..e4646e90d1 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_fma3.hpp @@ -0,0 +1,63 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
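The constants rework above replaces the old caster32_t/caster64_t unions with bit_cast, which is presumably also why the generators drop constexpr in favour of inline: a memcpy-based bit cast is not a constant expression in C++11. A sketch of what such a helper typically looks like (hypothetical; the real one is expected to come from the xsimd_utils.hpp header now included):

    #include <cstdint>
    #include <cstring>

    // Reinterpret the bits of From as To without violating strict aliasing.
    template <class To, class From>
    inline To bit_cast(From val) {
        static_assert(sizeof(To) == sizeof(From), "bit_cast requires equal sizes");
        To result;
        std::memcpy(&result, &val, sizeof(To));
        return result;
    }

    // e.g. bit_cast<float>(uint32_t(0x3f490fdb)) yields the pio4 constant above.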
* +****************************************************************************/ + +#ifndef XSIMD_FMA3_HPP +#define XSIMD_FMA3_HPP + +#include "../types/xsimd_sse_register.hpp" + + +namespace xsimd { + + namespace kernel { + using namespace types; + // fnma + template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fnmadd_ps(x, y, z); + } + + template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fnmadd_pd(x, y, z); + } + + // fnms + template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fnmsub_ps(x, y, z); + } + + template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fnmsub_pd(x, y, z); + } + + // fma + template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fmadd_ps(x, y, z); + } + + template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fmadd_pd(x, y, z); + } + + // fms + template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fmsub_ps(x, y, z); + } + + template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm_fmsub_pd(x, y, z); + } + + } + +} + +#endif + diff --git a/third_party/xsimd/arch/xsimd_fma4.hpp b/third_party/xsimd/arch/xsimd_fma4.hpp new file mode 100644 index 0000000000..814bcd1453 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_fma4.hpp @@ -0,0 +1,63 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
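The fused kernels here and in the FMA4/FMA5 files that follow all share the sign conventions of the underlying intrinsics; only the spelling changes (_mm_fmadd_ps for FMA3, AMD's _mm_macc_ps for FMA4, _mm256_fmadd_ps for the 256-bit FMA5 variant). Scalar reference semantics as an illustrative sketch (hypothetical *_ref names; the SIMD versions perform a single rounding):

    inline float fma_ref (float x, float y, float z) { return  (x * y) + z; }  // fma
    inline float fms_ref (float x, float y, float z) { return  (x * y) - z; }  // fms
    inline float fnma_ref(float x, float y, float z) { return -(x * y) + z; }  // fnma
    inline float fnms_ref(float x, float y, float z) { return -(x * y) - z; }  // fnms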
* +****************************************************************************/ + +#ifndef XSIMD_FMA3_HPP +#define XSIMD_FMA3_HPP + +#include "../types/xsimd_sse_register.hpp" + + +namespace xsimd { + + namespace kernel { + using namespace types; + + // fnma + template batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_nmacc_ps(x, y, z); + } + + template batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_nmacc_pd(x, y, z); + } + + // fnms + template batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_nmsub_ps(x, y, z); + } + + template batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_nmsub_pd(x, y, z); + } + + // fma + template batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_macc_ps(x, y, z); + } + + template batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_macc_pd(x, y, z); + } + + // fms + template batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_msub_ps(x, y, z); + } + + template batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { + return _mm_msub_pd(x, y, z); + } + } + +} + +#endif + diff --git a/third_party/xsimd/arch/xsimd_fma5.hpp b/third_party/xsimd/arch/xsimd_fma5.hpp new file mode 100644 index 0000000000..786949d25e --- /dev/null +++ b/third_party/xsimd/arch/xsimd_fma5.hpp @@ -0,0 +1,64 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_FMA5_HPP +#define XSIMD_FMA5_HPP + +#include "../types/xsimd_fma5_register.hpp" + + +namespace xsimd { + + namespace kernel { + using namespace types; + + // fnma + template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fnmadd_ps(x, y, z); + } + + template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fnmadd_pd(x, y, z); + } + + // fnms + template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fnmsub_ps(x, y, z); + } + + template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fnmsub_pd(x, y, z); + } + + // fma + template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fmadd_ps(x, y, z); + } + + template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fmadd_pd(x, y, z); + } + + // fms + template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fmsub_ps(x, y, z); + } + + template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { + return _mm256_fmsub_pd(x, y, z); + } + + + } + +} + +#endif diff --git a/third_party/xsimd/types/xsimd_neon_utils.hpp b/third_party/xsimd/arch/xsimd_generic.hpp similarity index 58% rename from third_party/xsimd/types/xsimd_neon_utils.hpp rename to third_party/xsimd/arch/xsimd_generic.hpp index 92cb6a85e5..338413d36c 100644 --- a/third_party/xsimd/types/xsimd_neon_utils.hpp +++ b/third_party/xsimd/arch/xsimd_generic.hpp @@ -2,31 +2,23 @@ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#ifndef XSIMD_NEON_UTILS_HPP -#define XSIMD_NEON_UTILS_HPP +#ifndef XSIMD_GENERIC_HPP +#define XSIMD_GENERIC_HPP -namespace xsimd -{ - namespace neon_detail - { - template - inline R unroll_op_impl(F&& f, detail::index_sequence) - { - return R{static_cast(f(I))...}; - } - - template - inline R unroll_op(F&& f) - { - return unroll_op_impl(f, detail::make_index_sequence{}); - } - } -} +#include "./generic/xsimd_generic_arithmetic.hpp" +#include "./generic/xsimd_generic_complex.hpp" +#include "./generic/xsimd_generic_logical.hpp" +#include "./generic/xsimd_generic_math.hpp" +#include "./generic/xsimd_generic_memory.hpp" +#include "./generic/xsimd_generic_rounding.hpp" +#include "./generic/xsimd_generic_trigo.hpp" #endif + diff --git a/third_party/xsimd/arch/xsimd_generic_fwd.hpp b/third_party/xsimd/arch/xsimd_generic_fwd.hpp new file mode 100644 index 0000000000..8326066e11 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_generic_fwd.hpp @@ -0,0 +1,35 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_GENERIC_FWD_HPP +#define XSIMD_GENERIC_FWD_HPP + +#include + +namespace xsimd { + + namespace kernel { + // forward declaration + template::value, void>::type> + batch abs(batch const& self, requires_arch); + template::value, void>::type> + batch bitwise_lshift(batch const& self, batch const& other, requires_arch); + template::value, void>::type> + batch bitwise_rshift(batch const& self, batch const& other, requires_arch); + template batch_bool gt(batch const& self, batch const& other, requires_arch); + template::value, void>::type> + batch mul(batch const& self, batch const& other, requires_arch); + + } +} + +#endif + diff --git a/third_party/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/arch/xsimd_isa.hpp new file mode 100644 index 0000000000..c3f8687409 --- /dev/null +++ b/third_party/xsimd/arch/xsimd_isa.hpp @@ -0,0 +1,75 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +#ifndef XSIMD_ISA_HPP +#define XSIMD_ISA_HPP + +#include "../config/xsimd_arch.hpp" + +#include "./xsimd_generic_fwd.hpp" + +#if XSIMD_WITH_SSE2 +#include "./xsimd_sse2.hpp" +#endif + +#if XSIMD_WITH_SSE3 +#include "./xsimd_sse3.hpp" +#endif + +#if XSIMD_WITH_SSSE3 +#include "./xsimd_ssse3.hpp" +#endif + +#if XSIMD_WITH_SSE4_1 +#include "./xsimd_sse4_1.hpp" +#endif + +#if XSIMD_WITH_SSE4_2 +#include "./xsimd_sse4_2.hpp" +#endif + +#if XSIMD_WITH_FMA3 +#include "./xsimd_fma3.hpp" +#endif + +#if XSIMD_WITH_AVX +#include "./xsimd_avx.hpp" +#endif + +#if XSIMD_WITH_AVX2 +#include "./xsimd_avx2.hpp" +#endif + +#if XSIMD_WITH_FMA5 +#include "./xsimd_fma5.hpp" +#endif + +#if XSIMD_WITH_AVX512F +#include "./xsimd_avx512f.hpp" +#endif + +#if XSIMD_WITH_AVX512BW +#include "./xsimd_avx512bw.hpp" +#endif + +#if XSIMD_WITH_NEON +#include "./xsimd_neon.hpp" +#endif + +#if XSIMD_WITH_NEON64 +#include "./xsimd_neon64.hpp" +#endif + +// Must come last to have access to all conversion specializations. +#include "./xsimd_generic.hpp" + +#endif + diff --git a/third_party/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/arch/xsimd_neon.hpp new file mode 100644 index 0000000000..e3b02018ad --- /dev/null +++ b/third_party/xsimd/arch/xsimd_neon.hpp @@ -0,0 +1,2318 @@ +/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. 
* +****************************************************************************/ + +#ifndef XSIMD_NEON_HPP +#define XSIMD_NEON_HPP + +#include +#include +#include +#include + +#include "../types/xsimd_neon_register.hpp" +#include "../types/xsimd_utils.hpp" + +// Wrap intrinsics so we can pass them as function pointers +// - OP: intrinsics name prefix, e.g., vorrq +// - RT: type traits to deduce intrinsics return types +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ + inline RT OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ + inline RT OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ + inline RT OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ + inline RT OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ + inline RT OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ + } + +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ + inline RT OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ + } + +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap { \ + inline RT OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ + } + +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint8x16_t OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ + inline int8x16_t OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ + inline uint16x8_t OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ + inline int16x8_t OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ + inline uint32x4_t OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ + inline int32x4_t OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ + } + +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint64x2_t OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ + inline int64x2_t OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ + } + +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap { \ + inline float32x4_t OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ + } + +// Dummy identity caster to ease coding +inline uint8x16_t vreinterpretq_u8_u8 (uint8x16_t arg) { return arg; } +inline int8x16_t vreinterpretq_s8_s8 (int8x16_t arg) { return arg; } +inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) { return arg; } +inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) { return arg; } +inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) { return arg; } +inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) { return arg; } +inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) { return arg; } +inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) { return arg; } +inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) { return arg; } + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + namespace detail + { + template
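The WRAP_* macros above generate ordinary inline functions around each NEON intrinsic because the intrinsics may be compiler builtins or macros whose address cannot portably be taken; a real function can then be stored and passed as a function pointer by the generic dispatch code. A minimal illustration of the idea, with hypothetical wrap_demo/apply_u8 names and assuming a NEON target:

    #include <arm_neon.h>

    namespace wrap_demo {
        // An ordinary function: its address can always be taken.
        inline uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) { return ::vaddq_u8(a, b); }
    }

    // Generic code can now receive the operation as a plain function pointer.
    inline uint8x16_t apply_u8(uint8x16_t (*op)(uint8x16_t, uint8x16_t),
                               uint8x16_t a, uint8x16_t b) {
        return op(a, b);
    }

    // usage: apply_u8(wrap_demo::vaddq_u8, a, b);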