diff --git a/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp index a927ba975..7fc09e561 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_arithmetic.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_ARITHMETIC_HPP #define XSIMD_GENERIC_ARITHMETIC_HPP @@ -17,85 +17,112 @@ #include "./xsimd_generic_details.hpp" -namespace xsimd { +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // bitwise_lshift + template ::value, void>::type*/> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x << y; }, + self, other); + } + + // bitwise_rshift + template ::value, void>::type*/> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x >> y; }, + self, other); + } + + // div + template ::value, void>::type> + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x / y; }, + self, other); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y + z; + } + + template + inline batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return x * y - z; + } + + template + inline batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -x * y + z; + } + + template + inline batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + 
{ + return -x * y - z; + } + + template + inline batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept + { + auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); + auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); + return { res_r, res_i }; + } + + // mul + template ::value, void>::type*/> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x * y; }, + self, other); + } - namespace kernel { - - using namespace types; - - // bitwise_lshift - template::value, void>::type*/> - batch bitwise_lshift(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) { return x << y;}, self, other); - } - - // bitwise_rshift - template::value, void>::type*/> - batch bitwise_rshift(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) { return x >> y;}, self, other); - } - - // div - template::value, void>::type> - batch div(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) -> T { return x / y;}, self, other); - } - - // fma - template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { - return x * y + z; - } - - template batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { - auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); - auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); - return {res_r, res_i}; - } - - // fms - template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { - return x * y - z; - } - - template batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { - auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); - auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); - return {res_r, res_i}; } - // fnma - template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { - return -x * y + z; - } - - template batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { - auto res_r = - fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); - auto res_i = - fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); - return {res_r, res_i}; - } - - // fnms - template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { - return -x * y - z; - } - - template batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) { - auto res_r = - fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); - auto res_i = - fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); - return {res_r, res_i}; - } - - - - // mul - template::value, void>::type*/> - batch mul(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) -> T { return x * y;}, self, other); - } - - } - } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp b/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp index 79fd0f1e7..ede95ee93 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_complex.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * 
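The complex fma/fms/fnma/fnms kernels above assemble the result from real-valued fma/fms calls on the component batches. A minimal scalar sketch of the same identity for fma, checked against std::complex (the helper names here are illustrative, not xsimd API):

#include <cassert>
#include <cmath>
#include <complex>

// Scalar stand-ins for the real-valued fma/fms kernels used above.
static double fma_s(double x, double y, double z) { return x * y + z; }
static double fms_s(double x, double y, double z) { return x * y - z; }

// x * y + z for complex operands, composed exactly like the generic kernel:
//   real: fms(xr, yr, fms(xi, yi, zr)) = xr*yr - xi*yi + zr
//   imag: fma(xr, yi, fma(xi, yr, zi)) = xr*yi + xi*yr + zi
static std::complex<double> cfma(std::complex<double> x,
                                 std::complex<double> y,
                                 std::complex<double> z)
{
    double r = fms_s(x.real(), y.real(), fms_s(x.imag(), y.imag(), z.real()));
    double i = fma_s(x.real(), y.imag(), fma_s(x.imag(), y.real(), z.imag()));
    return { r, i };
}

int main()
{
    std::complex<double> x(1.5, -2.0), y(0.25, 3.0), z(-1.0, 0.5);
    assert(std::abs(cfma(x, y, z) - (x * y + z)) < 1e-12);
}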
-* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_COMPLEX_HPP #define XSIMD_GENERIC_COMPLEX_HPP @@ -16,71 +16,81 @@ #include "./xsimd_generic_details.hpp" -namespace xsimd { - - namespace kernel { - - using namespace types; - - // real - template - batch real(batch const& self, requires_arch) { - return self; - } - - template - batch real(batch, A> const& self, requires_arch) { - return self.real(); - } - - // imag - template - batch imag(batch const& /*self*/, requires_arch) { - return batch(T(0)); - } - - template - batch imag(batch, A> const& self, requires_arch) { - return self.imag(); - } - - // arg - template - real_batch_type_t> arg(batch const& self, requires_arch) { - return atan2(imag(self), real(self)); +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // real + template + inline batch real(batch const& self, requires_arch) noexcept + { + return self; + } + + template + inline batch real(batch, A> const& self, requires_arch) noexcept + { + return self.real(); + } + + // imag + template + inline batch imag(batch const& /*self*/, requires_arch) noexcept + { + return batch(T(0)); + } + + template + inline batch imag(batch, A> const& self, requires_arch) noexcept + { + return self.imag(); + } + + // arg + template + inline real_batch_type_t> arg(batch const& self, requires_arch) noexcept + { + return atan2(imag(self), real(self)); + } + + // conj + template + inline complex_batch_type_t> conj(batch const& self, requires_arch) noexcept + { + return { real(self), -imag(self) }; + } + + // norm + template + inline real_batch_type_t> norm(batch const& self, requires_arch) noexcept + { + return { fma(real(self), real(self), imag(self) * imag(self)) }; + } + + // proj + template + inline complex_batch_type_t> proj(batch const& self, requires_arch) noexcept + { + using batch_type = complex_batch_type_t>; + using real_batch = typename batch_type::real_batch; + using real_value_type = typename real_batch::value_type; + auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); + return select(cond, + batch_type(constants::infinity(), + copysign(real_batch(real_value_type(0)), imag(self))), + batch_type(self)); + } + + template + inline batch_bool isnan(batch, A> const& self, requires_arch) noexcept + { + return batch_bool(isnan(self.real()) || isnan(self.imag())); + } } - - // conj - template - complex_batch_type_t> conj(batch const& self, requires_arch) { - return {real(self), - imag(self)}; - } - - // norm - template - real_batch_type_t> norm(batch const& self, requires_arch) { - return {fma(real(self), real(self), imag(self) * imag(self))}; - } - - // proj - template - complex_batch_type_t> proj(batch const& self, requires_arch) { - using batch_type = complex_batch_type_t>; - using real_batch = typename batch_type::real_batch; - using real_value_type = typename real_batch::value_type; - auto cond = xsimd::isinf(real(self)) || 
xsimd::isinf(imag(self)); - return select(cond, - batch_type(constants::infinity(), - copysign(real_batch(real_value_type(0)), imag(self))), - batch_type(self)); - } - - template - batch_bool isnan(batch, A> const& self, requires_arch) { - return batch_bool(isnan(self.real()) || isnan(self.imag())); - } - } } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/arch/generic/xsimd_generic_details.hpp index c14ba3d85..153ca0b4e 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_details.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_details.hpp @@ -1,217 +1,226 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_DETAILS_HPP #define XSIMD_GENERIC_DETAILS_HPP #include +#include "../../math/xsimd_rem_pio2.hpp" #include "../../types/xsimd_generic_arch.hpp" #include "../../types/xsimd_utils.hpp" -#include "../../math/xsimd_rem_pio2.hpp" #include "../xsimd_constants.hpp" -namespace xsimd { - // Forward declaration. Should we put them in a separate file? 
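The proj kernel just above follows std::proj: any operand with an infinite component is mapped to (+inf, ±0), keeping only the sign of the imaginary part. A scalar sketch of the same selection (illustrative, not part of the patch):

#include <cassert>
#include <cmath>
#include <complex>
#include <limits>

// Scalar version of the branch the generic proj kernel vectorizes with select().
static std::complex<double> proj_scalar(std::complex<double> z)
{
    if (std::isinf(z.real()) || std::isinf(z.imag()))
        return { std::numeric_limits<double>::infinity(),
                 std::copysign(0.0, z.imag()) };
    return z;
}

int main()
{
    const double inf = std::numeric_limits<double>::infinity();
    assert(proj_scalar({ 1.0, 2.0 }) == std::proj(std::complex<double>(1.0, 2.0)));
    assert(proj_scalar({ -inf, -4.0 }) == std::proj(std::complex<double>(-inf, -4.0)));
}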
- template - batch abs(batch const& self); - template - batch abs(batch, A> const& self); - template - bool any(batch_bool const& self); - template - batch atan2(batch const& self, batch const& other); - template - batch bitofsign(batch const& self); - template - B bitwise_cast(batch const& self); - template - batch_bool bool_cast(batch_bool const& self); - template - batch_bool bool_cast(batch_bool const& self); - template - batch_bool bool_cast(batch_bool const& self); - template - batch_bool bool_cast(batch_bool const& self); - template - batch cos(batch const& self); - template - batch cosh(batch const& self); - template - batch exp(batch const& self); - template - batch fma(batch const& x, batch const& y, batch const& z); - template - batch fms(batch const& x, batch const& y, batch const& z); - template - batch frexp(const batch& x, const batch, A>& e); - template - T hadd(batch const&); - template - batch horner(const batch& self); - template - batch hypot(const batch& self); - template - batch_bool is_even(batch const& self); - template - batch_bool is_flint(batch const& self); - template - batch_bool is_odd(batch const& self); - template - batch_bool isinf(batch const& self); - template - typename batch::batch_bool_type isnan(batch const& self); - template - batch ldexp(const batch& x, const batch, A>& e); - template - batch log(batch const& self); - template - batch nearbyint(batch const& self); - template - batch select(batch_bool const&, batch const& , batch const& ); - template - batch, A> select(batch_bool const&, batch, A> const& , batch, A> const& ); - template - batch sign(batch const& self); - template - batch signnz(batch const& self); - template - batch sin(batch const& self); - template - batch sinh(batch const& self); - template - std::pair, batch> sincos(batch const& self); - template - batch sqrt(batch const& self); - template - batch tan(batch const& self); - template - batch, A> to_float(batch const& self); - template - batch, A> to_int(batch const& self); - template - batch trunc(batch const& self); - - - namespace kernel { - - namespace detail { - template - batch apply(F&& func, batch const& self, batch const& other) { - constexpr std::size_t size = batch::size; - alignas(A::alignment()) T self_buffer[size]; - alignas(A::alignment()) T other_buffer[size]; - self.store_aligned(&self_buffer[0]); - other.store_aligned(&other_buffer[0]); - for(std::size_t i = 0; i < size; ++i) { - self_buffer[i] = func(self_buffer[i], other_buffer[i]); - } - return batch::load_aligned(self_buffer); - } - } - - namespace detail { - // Generic conversion handling machinery. Each architecture must define - // conversion function when such conversions exits in the form of - // intrinsic. Then we use that information to automatically decide whether - // to use scalar or vector conversion when doing load / store / batch_cast - struct with_fast_conversion{}; - struct with_slow_conversion{}; - - template - struct conversion_type_impl - { - using type = with_slow_conversion; - }; - - using xsimd::detail::void_t; - - template - struct conversion_type_impl(), std::declval(), std::declval()))>> - { - using type = with_fast_conversion; - }; - - template - using conversion_type = typename conversion_type_impl::type; - } - - namespace detail { - /* origin: boost/simdfunction/horn.hpp*/ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - inline B coef() noexcept - { - using value_type = typename B::value_type; - return B(bit_cast(as_unsigned_integer_t(c))); - } - template - inline B horner(const B&) noexcept +namespace xsimd +{ + // Forward declaration. Should we put them in a separate file? + template + inline batch abs(batch const& self) noexcept; + template + inline batch abs(batch, A> const& self) noexcept; + template + inline bool any(batch_bool const& self) noexcept; + template + inline batch atan2(batch const& self, batch const& other) noexcept; + template + inline batch bitofsign(batch const& self) noexcept; + template + inline B bitwise_cast(batch const& self) noexcept; + template + inline batch_bool bool_cast(batch_bool const& self) noexcept; + template + inline batch_bool bool_cast(batch_bool const& self) noexcept; + template + inline batch_bool bool_cast(batch_bool const& self) noexcept; + template + inline batch_bool bool_cast(batch_bool const& self) noexcept; + template + inline batch cos(batch const& self) noexcept; + template + inline batch cosh(batch const& self) noexcept; + template + inline batch exp(batch const& self) noexcept; + template + inline batch fma(batch const& x, batch const& y, batch const& z) noexcept; + template + inline batch fms(batch const& x, batch const& y, batch const& z) noexcept; + template + inline batch frexp(const batch& x, const batch, A>& e) noexcept; + template + inline T hadd(batch const&) noexcept; + template + inline batch horner(const batch& self) noexcept; + template + inline batch hypot(const batch& self) noexcept; + template + inline batch_bool is_even(batch const& self) noexcept; + template + inline batch_bool is_flint(batch const& self) noexcept; + template + inline batch_bool is_odd(batch const& self) noexcept; + template + inline batch_bool isinf(batch const& self) noexcept; + template + inline typename batch::batch_bool_type isnan(batch const& self) noexcept; + template + inline batch ldexp(const batch& x, const batch, A>& e) noexcept; + template + inline batch log(batch const& self) noexcept; + template + inline batch nearbyint(batch const& self) noexcept; + template + inline batch select(batch_bool const&, batch const&, batch const&) noexcept; + template + inline batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; + template + inline batch sign(batch const& self) noexcept; + template + inline batch signnz(batch const& self) noexcept; + template + inline batch sin(batch const& self) noexcept; + template + inline batch sinh(batch const& self) noexcept; + template + inline std::pair, batch> sincos(batch const& self) noexcept; + template + inline batch sqrt(batch const& self) noexcept; + template + inline batch tan(batch const& self) noexcept; + template + inline batch, A> to_float(batch const& self) noexcept; + template + inline batch, A> to_int(batch const& self) noexcept; + template + inline batch trunc(batch const& self) noexcept; + + namespace kernel + { + + namespace detail { - return B(typename B::value_type(0.)); + template + inline batch apply(F&& func, batch const& self, batch const& other) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) T self_buffer[size]; + alignas(A::alignment()) T other_buffer[size]; + self.store_aligned(&self_buffer[0]); + other.store_aligned(&other_buffer[0]); + for (std::size_t i = 0; i < size; ++i) + { + self_buffer[i] = func(self_buffer[i], 
other_buffer[i]); + } + return batch::load_aligned(self_buffer); + } } - template - inline B horner(const B&) noexcept + namespace detail { - return coef(); + // Generic conversion handling machinery. Each architecture must define + // conversion function when such conversions exits in the form of + // intrinsic. Then we use that information to automatically decide whether + // to use scalar or vector conversion when doing load / store / batch_cast + struct with_fast_conversion + { + }; + struct with_slow_conversion + { + }; + + template + struct conversion_type_impl + { + using type = with_slow_conversion; + }; + + using xsimd::detail::void_t; + + template + struct conversion_type_impl&>(), + std::declval&>(), + std::declval()))>> + { + using type = with_fast_conversion; + }; + + template + using conversion_type = typename conversion_type_impl::type; } - template - inline B horner(const B& self) noexcept + namespace detail { - return fma(self, horner(self), coef()); + /* origin: boost/simdfunction/horn.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B coef() noexcept + { + using value_type = typename B::value_type; + return B(bit_cast(as_unsigned_integer_t(c))); + } + template + inline B horner(const B&) noexcept + { + return B(typename B::value_type(0.)); + } + + template + inline B horner(const B&) noexcept + { + return coef(); + } + + template + inline B horner(const B& self) noexcept + { + return fma(self, horner(self), coef()); + } + + /* origin: boost/simdfunction/horn1.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B horner1(const B&) noexcept + { + return B(1.); + } + + template + inline B horner1(const B& x) noexcept + { + return x + detail::coef(); + } + + template + inline B horner1(const B& x) noexcept + { + return fma(x, horner1(x), detail::coef()); + } } - /* origin: boost/simdfunction/horn1.hpp*/ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - inline B horner1(const B&) noexcept - { - return B(1.); - } - - template - inline B horner1(const B& x) noexcept - { - return x + detail::coef(); - } - - template - inline B horner1(const B& x) noexcept - { - return fma(x, horner1(x), detail::coef()); - } } - - - } - } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp b/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp index fb8110b97..d535e36c5 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_logical.hpp @@ -1,107 +1,136 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. 
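detail::apply in xsimd_generic_details.hpp above is the generic scalar fallback: both operands are spilled to aligned buffers, the scalar functor runs lane by lane, and the result is reloaded as a batch. A freestanding sketch of that round-trip, with a plain std::array standing in for batch<T, A> (names are hypothetical):

#include <array>
#include <cstddef>
#include <iostream>

// Stand-in for batch<T, A>: a fixed-size pack of N lanes.
template <class T, std::size_t N>
using pack = std::array<T, N>;

// Same shape as kernel::detail::apply: store, run the scalar op per lane, reload.
template <class T, std::size_t N, class F>
pack<T, N> apply(F&& func, pack<T, N> const& self, pack<T, N> const& other)
{
    alignas(64) T self_buffer[N];
    alignas(64) T other_buffer[N];
    for (std::size_t i = 0; i < N; ++i)
    {
        self_buffer[i] = self[i];   // store_aligned
        other_buffer[i] = other[i];
    }
    pack<T, N> result;
    for (std::size_t i = 0; i < N; ++i)
        result[i] = func(self_buffer[i], other_buffer[i]); // lane-wise scalar op
    return result;                  // load_aligned
}

int main()
{
    pack<int, 4> a { 1, 2, 3, 4 }, b { 1, 1, 2, 2 };
    // Emulates the generic bitwise_lshift kernel: x << y per lane.
    auto r = apply([](int x, int y) { return x << y; }, a, b);
    for (int v : r)
        std::cout << v << ' ';      // 2 4 12 16
}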
* -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_LOGICAL_HPP #define XSIMD_GENERIC_LOGICAL_HPP #include "./xsimd_generic_details.hpp" - -namespace xsimd { - - namespace kernel { - - using namespace types; - - // ge - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return other <= self; - } - - // gt - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return other < self; - } - - // is_even - template batch_bool is_even(batch const& self, requires_arch) { - return is_flint(self * T(0.5)); - } - - // is_flint - template batch_bool is_flint(batch const& self, requires_arch) { - auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); - return frac == T(0.); - } - - // is_odd - template batch_bool is_odd(batch const& self, requires_arch) { - return is_even(self - T(1.)); - } - - // isinf - template::value, void>::type> - batch_bool isinf(batch const& , requires_arch) { - return batch_bool(false); - } - template batch_bool isinf(batch const& self, requires_arch) { - return abs(self) == std::numeric_limits::infinity(); - } - template batch_bool isinf(batch const& self, requires_arch) { - return abs(self) == std::numeric_limits::infinity(); +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return other <= self; + } + + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return other < self; + } + + // is_even + template + inline batch_bool is_even(batch const& self, requires_arch) noexcept + { + return is_flint(self * T(0.5)); + } + + // is_flint + template + inline batch_bool is_flint(batch const& self, requires_arch) noexcept + { + auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); + return frac == T(0.); + } + + // is_odd + template + inline batch_bool is_odd(batch const& self, requires_arch) noexcept + { + return is_even(self - T(1.)); + } + + // isinf + template ::value, void>::type> + inline batch_bool isinf(batch const&, requires_arch) noexcept + { + return batch_bool(false); + } + template + inline batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + template + inline batch_bool isinf(batch const& self, requires_arch) noexcept + { + return abs(self) == std::numeric_limits::infinity(); + } + + // isfinite + template ::value, void>::type> + inline batch_bool isfinite(batch const&, requires_arch) noexcept + { + return batch_bool(true); + } + template + inline batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0; + } + template + inline batch_bool isfinite(batch const& self, requires_arch) noexcept + { + return (self - self) == 0; + } + + // isnan + template ::value, void>::type> + inline batch_bool isnan(batch const&, requires_arch) noexcept + { + return 
batch_bool(false); + } + + // le + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (self < other) || (self == other); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return !(other == self); + } + + // logical_and + template + inline batch logical_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x && y; }, + self, other); + } + + // logical_or + template + inline batch logical_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept + { return x || y; }, + self, other); + } } - - // isfinite - template::value, void>::type> - batch_bool isfinite(batch const& , requires_arch) { - return batch_bool(true); - } - template batch_bool isfinite(batch const& self, requires_arch) { - return (self - self) == 0; - } - template batch_bool isfinite(batch const& self, requires_arch) { - return (self - self) == 0; - } - - // isnan - template::value, void>::type> - batch_bool isnan(batch const& , requires_arch) { - return batch_bool(false); - } - - // le - template::value, void>::type> - batch_bool le(batch const& self, batch const& other, requires_arch) { - return (self < other) || (self == other); - } - - - // neq - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return !(other == self); - } - - // logical_and - template - batch logical_and(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) { return x && y;}, self, other); - } - - // logical_or - template - batch logical_or(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) { return x || y;}, self, other); - } - } } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/arch/generic/xsimd_generic_math.hpp index 2b8562b73..d236b410d 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_math.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_math.hpp @@ -1,95 +1,109 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
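The is_flint kernel above classifies a float as an integer when x - trunc(x) == 0, using self - self as a NaN/infinity guard; is_even and is_odd are then derived as is_flint(x * 0.5) and is_even(x - 1). A scalar sketch of the same predicates (illustrative only):

#include <cassert>
#include <cmath>

// "Floating-point integer": subtracting the truncation leaves exactly zero.
// x - x is NaN for NaN and infinite inputs, which keeps them out of the set.
static bool is_flint(double x)
{
    double frac = std::isnan(x - x) ? std::nan("") : x - std::trunc(x);
    return frac == 0.0;
}

static bool is_even(double x) { return is_flint(x * 0.5); }
static bool is_odd(double x) { return is_even(x - 1.0); }

int main()
{
    assert(is_flint(6.0) && !is_flint(6.5));
    assert(is_even(6.0) && !is_even(7.0));
    assert(is_odd(7.0) && !is_odd(6.0));
    assert(!is_flint(std::nan("")) && !is_flint(HUGE_VAL));
}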
* + ****************************************************************************/ #ifndef XSIMD_GENERIC_MATH_HPP #define XSIMD_GENERIC_MATH_HPP -#include - +#include "../xsimd_scalar.hpp" #include "./xsimd_generic_details.hpp" #include "./xsimd_generic_trigo.hpp" -#include "../xsimd_scalar.hpp" +#include -namespace xsimd { - - namespace kernel { +namespace xsimd +{ - using namespace types; - // abs - template::value, void>::type*/> - batch abs(batch const& self, requires_arch) + namespace kernel { - if(std::is_unsigned::value) - return self; - else { - auto sign = bitofsign(self); - auto inv = self ^ sign; - return inv - sign; - } - } - template - batch abs(batch, A> const& z, requires_arch) { - return hypot(z.real(), z.imag()); - } + using namespace types; + // abs + template ::value, void>::type*/> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + return self; + else + { + auto sign = bitofsign(self); + auto inv = self ^ sign; + return inv - sign; + } + } - // batch_cast - template batch batch_cast(batch const& self, batch const&, requires_arch) { - return self; - } + template + inline batch abs(batch, A> const& z, requires_arch) noexcept + { + return hypot(z.real(), z.imag()); + } - namespace detail { - template - batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) { - return fast_cast(self, out, A{}); - } - template - batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) { - static_assert(!std::is_same::value, "there should be no conversion for this type combination"); - using batch_type_in = batch; - using batch_type_out = batch; - static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); - alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; - alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; - self.store_aligned(&buffer_in[0]); - std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); - return batch_type_out::load_aligned(buffer_out); - } + // batch_cast + template + inline batch batch_cast(batch const& self, batch const&, requires_arch) noexcept + { + return self; + } - } + namespace detail + { + template + inline batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept + { + return fast_cast(self, out, A {}); + } + template + inline batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be no conversion for this type combination"); + using batch_type_in = batch; + using batch_type_out = batch; + static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); + alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; + alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; + self.store_aligned(&buffer_in[0]); + std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); + return batch_type_out::load_aligned(buffer_out); + } - template - batch batch_cast(batch const& self, batch const& out, requires_arch) { - return detail::batch_cast(self, out, A{}, detail::conversion_type{}); - } + } - // bitofsign - template batch bitofsign(batch const& self, requires_arch) { - static_assert(std::is_integral::value, "int type implementation"); - if(std::is_unsigned::value) - return batch(0); - else - return self >> (T)(8 * sizeof(T) - 1); - } + template + inline batch batch_cast(batch const& self, batch const& out, 
requires_arch) noexcept + { + return detail::batch_cast(self, out, A {}, detail::conversion_type {}); + } - template batch bitofsign(batch const& self, requires_arch) { - return self & constants::minuszero>(); - } - template batch bitofsign(batch const& self, requires_arch) { - return self & constants::minuszero>(); - } + // bitofsign + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + static_assert(std::is_integral::value, "int type implementation"); + if (std::is_unsigned::value) + return batch(0); + else + return self >> (T)(8 * sizeof(T) - 1); + } + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::minuszero>(); + } + template + inline batch bitofsign(batch const& self, requires_arch) noexcept + { + return self & constants::minuszero>(); + } - // cbrt + // cbrt /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ /* * ==================================================== @@ -99,112 +113,280 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - template batch cbrt(batch const& self, requires_arch) { - using batch_type = batch; - batch_type z = abs(self); + template + inline batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS - auto denormal = z < constants::smallestposval(); - z = select(denormal, z * constants::twotonmb(), z); - batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif - const batch_type CBRT2 (bit_cast(0x3fa14518)); - const batch_type CBRT4 (bit_cast(0x3fcb2ff5)); - const batch_type CBRT2I (bit_cast(0x3f4b2ff5)); - const batch_type CBRT4I (bit_cast(0x3f214518)); - using i_type = as_integer_t; - i_type e; - batch_type x = frexp(z, e); - x = detail::horner(x); - auto flag = e >= i_type(0); - i_type e1 = abs(e); - i_type rem = e1; - e1 /= i_type(3); - rem -= e1 * i_type(3); - e = e1 * sign(e); - const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); - const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); - batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); - fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); - x = ldexp(x * fact, e); - x -= (x - z / (x * x)) * batch_type(1.f / 3.f); + const batch_type CBRT2(bit_cast(0x3fa14518)); + const batch_type CBRT4(bit_cast(0x3fcb2ff5)); + const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); + const batch_type CBRT4I(bit_cast(0x3f214518)); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= i_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1.f / 3.f); #ifndef XSIMD_NO_DENORMALS - x = (x | bitofsign(self)) * f; + x = (x | bitofsign(self)) * f; #else - x = x | bitofsign(self); + x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES - return select(self == 
batch_type(0.) || isinf(self), self, x); + return select(self == batch_type(0.) || isinf(self), self, x); #else - return select(self == batch_type(0.), self, x); + return select(self == batch_type(0.), self, x); #endif - } - template batch cbrt(batch const& self, requires_arch) { - using batch_type = batch; - batch_type z = abs(self); + } + + template + inline batch cbrt(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS - auto denormal = z < constants::smallestposval(); - z = select(denormal, z * constants::twotonmb(), z); - batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); + auto denormal = z < constants::smallestposval(); + z = select(denormal, z * constants::twotonmb(), z); + batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif - const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); - const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); - const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); - const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); - using i_type = as_integer_t; - i_type e; - batch_type x = frexp(z, e); - x = detail::horner(x); - auto flag = e >= typename i_type::value_type(0); - i_type e1 = abs(e); - i_type rem = e1; - e1 /= i_type(3); - rem -= e1 * i_type(3); - e = e1 * sign(e); - const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); - const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); - batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); - fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); - x = ldexp(x * fact, e); - x -= (x - z / (x * x)) * batch_type(1. / 3.); - x -= (x - z / (x * x)) * batch_type(1. / 3.); + const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); + const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); + const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); + const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); + using i_type = as_integer_t; + i_type e; + batch_type x = frexp(z, e); + x = detail::horner(x); + auto flag = e >= typename i_type::value_type(0); + i_type e1 = abs(e); + i_type rem = e1; + e1 /= i_type(3); + rem -= e1 * i_type(3); + e = e1 * sign(e); + const batch_type cbrt2 = select(bool_cast(flag), CBRT2, CBRT2I); + const batch_type cbrt4 = select(bool_cast(flag), CBRT4, CBRT4I); + batch_type fact = select(bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); + fact = select(bool_cast(rem == i_type(2)), cbrt4, fact); + x = ldexp(x * fact, e); + x -= (x - z / (x * x)) * batch_type(1. / 3.); + x -= (x - z / (x * x)) * batch_type(1. / 3.); #ifndef XSIMD_NO_DENORMALS - x = (x | bitofsign(self)) * f; + x = (x | bitofsign(self)) * f; #else - x = x | bitofsign(self); + x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES - return select(self == batch_type(0.) || isinf(self), self, x); + return select(self == batch_type(0.) 
|| isinf(self), self, x); #else - return select(self == batch_type(0.), self, x); + return select(self == batch_type(0.), self, x); #endif - } + } - // clip - template batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) { - return min(hi, max(self, lo)); - } + // clip + template + inline batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept + { + return min(hi, max(self, lo)); + } + // copysign + template + inline batch copysign(batch const& self, batch const& other, requires_arch) noexcept + { + return abs(self) | bitofsign(other); + } - // copysign - template batch copysign(batch const& self, batch const& other, requires_arch) { - return abs(self) | bitofsign(other); - } + // erf + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct erf_kernel; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 2/3 + static inline batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x); + } + + // computes erfc(x)*exp(sqr(x)) + // x >= 2/3 + static inline batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x); + } - // erf + static inline batch_type erfc3(const batch_type& x) noexcept + { + return (batch_type(1.) - x) * detail::horner(x); + } + }; + + template + struct erf_kernel> + { + using batch_type = batch; + // computes erf(a0)/a0 + // x is sqr(a0) and 0 <= abs(a0) <= 0.65 + static inline batch_type erf1(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 0.65 <= abs(x) <= 2.2 + static inline batch_type erfc2(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } + + // computes erfc(x)*exp(x*x) + // 2.2 <= abs(x) <= 6 + static inline batch_type erfc3(const batch_type& x) noexcept + { + return detail::horner(x) + / detail::horner(x); + } - namespace detail { - /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ + // computes erfc(rx)*exp(rx*rx) + // x >= 6 rx = 1/x + static inline batch_type erfc4(const batch_type& x) noexcept + { + return detail::horner(x); + } + }; + } + /* origin: boost/simd/arch/common/simd/function/erf.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS @@ -213,624 +395,486 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - template - struct erf_kernel; template - struct erf_kernel> + inline batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; - // computes erf(a0)/a0 - // x is sqr(a0) and 0 <= abs(a0) <= 2/3 - static inline batch_type erf1(const batch_type& x) - { - return detail::horner(x); - } + batch_type x = abs(self); + batch_type r1(0.); + auto test1 = x < batch_type(2.f / 3.f); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(x * x); + if (all(test1)) + return r1; + } + batch_type z = x / (batch_type(1.) + x); + z -= batch_type(0.4f); + batch_type r2 = batch_type(1.) 
- exp(-x * x) * detail::erf_kernel::erfc2(z); + r2 = select(self < batch_type(0.), -r2, r2); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(xsimd::isinf(self), sign(self), r1); +#endif + return r1; + } - // computes erfc(x)*exp(sqr(x)) - // x >= 2/3 - static inline batch_type erfc2(const batch_type& x) - { - return detail::horner(x); + template + inline batch erf(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = self * detail::erf_kernel::erf1(xx); + if (all(test1)) + return r1; + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); + batch_type r2 = select(self < batch_type(0.), -z, z); + r1 = select(test1, r1, r2); + if (all(test1 || test3)) + return r1; } + batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); + z = select(self < batch_type(0.), -z, z); +#ifndef XSIMD_NO_INFINITIES + z = select(xsimd::isinf(self), sign(self), z); +#endif + return select(test2, r1, z); + } - static inline batch_type erfc3(const batch_type& x) - { - return (batch_type(1.) - x) * detail::horner(x); - } - }; + // erfc + template + inline batch erfc(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test0 = self < batch_type(0.); + batch_type r1(0.); + auto test1 = x < batch_type(2.f / 3.f); + batch_type z = x / (batch_type(1.) + x); + if (any(test1)) + { + r1 = detail::erf_kernel::erfc3(z); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + z -= batch_type(0.4f); + batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); + r1 = select(test1, r1, r2); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) - r1, r1); + } template - struct erf_kernel> + inline batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; - // computes erf(a0)/a0 - // x is sqr(a0) and 0 <= abs(a0) <= 0.65 - static inline batch_type erf1(const batch_type& x) - { - return detail::horner(x) / - detail::horner(x); - } + batch_type x = abs(self); + batch_type xx = x * x; + batch_type lim1(0.65); + batch_type lim2(2.2); + auto test0 = self < batch_type(0.); + auto test1 = x < lim1; + batch_type r1(0.); + if (any(test1)) + { + r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); + if (all(test1)) + return select(test0, batch_type(2.) - r1, r1); + } + auto test2 = x < lim2; + auto test3 = test2 && !test1; + batch_type ex = exp(-xx); + if (any(test3)) + { + batch_type z = ex * detail::erf_kernel::erfc2(x); + r1 = select(test1, r1, z); + if (all(test1 || test3)) + return select(test0, batch_type(2.) - r1, r1); + } + batch_type z = ex * detail::erf_kernel::erfc3(x); + r1 = select(test2, r1, z); +#ifndef XSIMD_NO_INFINITIES + r1 = select(x == constants::infinity(), batch_type(0.), r1); +#endif + return select(test0, batch_type(2.) 
- r1, r1); + } - // computes erfc(x)*exp(x*x) - // 0.65 <= abs(x) <= 2.2 - static inline batch_type erfc2(const batch_type& x) - { - return detail::horner(x) / - detail::horner(x); - } + // estrin + namespace detail + { - // computes erfc(x)*exp(x*x) - // 2.2 <= abs(x) <= 6 - static inline batch_type erfc3(const batch_type& x) + template + struct estrin { - return detail::horner(x) / - detail::horner(x); - } + B x; - // computes erfc(rx)*exp(rx*rx) - // x >= 6 rx = 1/x - static inline batch_type erfc4(const batch_type& x) - { - return detail::horner(x); - } - }; - } - /* origin: boost/simd/arch/common/simd/function/erf.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ + template + inline B operator()(const Ts&... coefs) noexcept + { + return eval(coefs...); + } - template - batch erf(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - batch_type r1(0.); - auto test1 = x < batch_type(2.f / 3.f); - if (any(test1)) + private: + inline B eval(const B& c0) noexcept { - r1 = self * detail::erf_kernel::erf1(x * x); - if (all(test1)) - return r1; + return c0; } - batch_type z = x / (batch_type(1.) + x); - z -= batch_type(0.4f); - batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel::erfc2(z); - r2 = select(self < batch_type(0.), -r2, r2); - r1 = select(test1, r1, r2); -#ifndef XSIMD_NO_INFINITIES - r1 = select(xsimd::isinf(self), sign(self), r1); -#endif - return r1; - } - template batch erf(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - batch_type xx = x * x; - batch_type lim1 (0.65); - batch_type lim2 (2.2); - auto test1 = x < lim1; - batch_type r1 (0.); - if (any(test1)) + + inline B eval(const B& c0, const B& c1) noexcept { - r1 = self * detail::erf_kernel::erf1(xx); - if (all(test1)) - return r1; + return fma(x, c1, c0); } - auto test2 = x < lim2; - auto test3 = test2 && !test1; - batch_type ex = exp(-xx); - if (any(test3)) + + template + inline B eval(::xsimd::detail::index_sequence, const Tuple& tuple) { - batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); - batch_type r2 = select(self < batch_type(0.), -z, z); - r1 = select(test1, r1, r2); - if (all(test1 || test3)) - return r1; + return estrin { x * x }(std::get(tuple)...); } - batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); - z = select(self < batch_type(0.), -z, z); -#ifndef XSIMD_NO_INFINITIES - z = select(xsimd::isinf(self), sign(self), z); -#endif - return select(test2, r1, z); - } - // erfc - template batch erfc(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto test0 = self < batch_type(0.); - batch_type r1 (0.); - auto test1 = x < batch_type(2.f / 3.f); - batch_type z = x / (batch_type(1.) + x); - if (any(test1)) + template + inline B eval(const std::tuple& tuple) noexcept { - r1 = detail::erf_kernel::erfc3(z); - if (all(test1)) - return select(test0, batch_type(2.) - r1, r1); + return eval(::xsimd::detail::make_index_sequence(), tuple); } - z -= batch_type(0.4f); - batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); - r1 = select(test1, r1, r2); -#ifndef XSIMD_NO_INFINITIES - r1 = select(x == constants::infinity(), batch_type(0.), r1); -#endif - return select(test0, batch_type(2.) 
- r1, r1); - } - template batch erfc(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - batch_type xx = x * x; - batch_type lim1 (0.65); - batch_type lim2 (2.2); - auto test0 = self < batch_type(0.); - auto test1 = x < lim1; - batch_type r1 (0.); - if (any(test1)) + + template + inline B eval(const std::tuple& tuple, const B& c0) noexcept { - r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); - if (all(test1)) - return select(test0, batch_type(2.) - r1, r1); + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); } - auto test2 = x < lim2; - auto test3 = test2 && !test1; - batch_type ex = exp(-xx); - if (any(test3)) + + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept { - batch_type z = ex * detail::erf_kernel::erfc2(x); - r1 = select(test1, r1, z); - if (all(test1 || test3)) - return select(test0, batch_type(2.) - r1, r1); + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); } - batch_type z = ex * detail::erf_kernel::erfc3(x); - r1 = select(test2, r1, z); -#ifndef XSIMD_NO_INFINITIES - r1 = select(x == constants::infinity(), batch_type(0.), r1); -#endif - return select(test0, batch_type(2.) - r1, r1); - } - // estrin - namespace detail - { + template + inline B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); + } + + template + inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept + { + return eval(std::make_tuple(eval(c0, c1)), coefs...); + } + }; + } - template - struct estrin + template + inline batch estrin(const batch& self) noexcept { - B x; + using batch_type = batch; + return detail::estrin { self }(detail::coef()...); + } - template - inline B operator()(const Ts&... coefs) noexcept + // exp + /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + namespace detail + { + enum exp_reduction_tag { - return eval(coefs...); - } + exp_tag, + exp2_tag, + exp10_tag + }; - private: - inline B eval(const B& c0) noexcept - { - return c0; - } + template + struct exp_reduction_base; - inline B eval(const B& c0, const B& c1) noexcept + template + struct exp_reduction_base { - return fma(x, c1, c0); - } + static constexpr B maxlog() noexcept + { + return constants::maxlog(); + } - template - inline B eval(::xsimd::detail::index_sequence, const Tuple& tuple) - { - return estrin{x * x}(std::get(tuple)...); - } + static constexpr B minlog() noexcept + { + return constants::minlog(); + } + }; - template - inline B eval(const std::tuple& tuple) noexcept + template + struct exp_reduction_base { - return eval(::xsimd::detail::make_index_sequence(), tuple); - } + static constexpr B maxlog() noexcept + { + return constants::maxlog10(); + } - template - inline B eval(const std::tuple& tuple, const B& c0) noexcept - { - return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); - } + static constexpr B minlog() noexcept + { + return constants::minlog10(); + } + }; - template - inline B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept + template + struct exp_reduction_base { - return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); - } + static constexpr B maxlog() noexcept + { + return constants::maxlog2(); + } - template - inline B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept - { - return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); - } + static constexpr B minlog() noexcept + { + return constants::minlog2(); + } + }; - template - inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept - { - return eval(std::make_tuple(eval(c0, c1)), coefs...); - } - }; - } - template - batch estrin(const batch& self) { - using batch_type = batch; - return detail::estrin{self}(detail::coef()...); - } + template + struct exp_reduction; - // exp - /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - namespace detail - { - enum exp_reduction_tag { exp_tag, exp2_tag, exp10_tag }; + template + struct exp_reduction : exp_reduction_base, exp_tag> + { + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x); + } - template - struct exp_reduction_base; + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + x = fnma(k, constants::log_2hi(), a); + x = fnma(k, constants::log_2lo(), x); + return k; + } + }; - template - struct exp_reduction_base - { - static constexpr B maxlog() noexcept + template + struct exp_reduction : exp_reduction_base, exp10_tag> { - return constants::maxlog(); - } + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + return ++(detail::horner(x) + * x); + } - static constexpr B minlog() noexcept - { - return constants::minlog(); - } - }; + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x -= k * constants::log10_2lo(); + return k; + } + }; - template - struct exp_reduction_base - { - static constexpr B maxlog() noexcept + template + struct exp_reduction : exp_reduction_base, exp2_tag> { - return constants::maxlog10(); - } + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type y = detail::horner(x); + return ++fma(y, x * x, x * constants::log_2()); + } - static constexpr B minlog() noexcept - { - return constants::minlog10(); - } - }; + static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k); + return k; + } + }; - template - struct exp_reduction_base - { - static constexpr B maxlog() noexcept + template + struct exp_reduction : exp_reduction_base, exp_tag> { - return constants::maxlog2(); - } + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } - static constexpr B minlog() noexcept - { - return constants::minlog2(); - } - }; + static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog_2() * a); + hi = fnma(k, constants::log_2hi(), a); + lo = k * constants::log_2lo(); + x = hi - lo; + return k; + } - template - struct exp_reduction; + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept + { + return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) 
- c)) - hi)); + } + }; - template - struct exp_reduction : exp_reduction_base, exp_tag> - { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) + template + struct exp_reduction : exp_reduction_base, exp10_tag> { - batch_type y = detail::horner(x); - return ++fma(y, x * x, x); - } + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type xx = x * x; + batch_type px = x * detail::horner(xx); + batch_type x2 = px / (detail::horner1(xx) - px); + return ++(x2 + x2); + } - static inline batch_type reduce(const batch_type& a, batch_type& x) - { - batch_type k = nearbyint(constants::invlog_2() * a); - x = fnma(k, constants::log_2hi(), a); - x = fnma(k, constants::log_2lo(), x); - return k; - } - }; + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(constants::invlog10_2() * a); + x = fnma(k, constants::log10_2hi(), a); + x = fnma(k, constants::log10_2lo(), x); + return k; + } - template - struct exp_reduction : exp_reduction_base, exp10_tag> - { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) - { - return ++(detail::horner(x) * - x); - } + static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return c; + } + }; - static inline batch_type reduce(const batch_type& a, batch_type& x) + template + struct exp_reduction : exp_reduction_base, exp2_tag> { - batch_type k = nearbyint(constants::invlog10_2() * a); - x = fnma(k, constants::log10_2hi(), a); - x -= k * constants::log10_2lo(); - return k; - } - }; + using batch_type = batch; + static inline batch_type approx(const batch_type& x) noexcept + { + batch_type t = x * x; + return fnma(t, + detail::horner(t), + x); + } - template - struct exp_reduction : exp_reduction_base, exp2_tag> - { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) - { - batch_type y = detail::horner(x); - return ++fma(y, x * x, x * constants::log_2()); - } + static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept + { + batch_type k = nearbyint(a); + x = (a - k) * constants::log_2(); + return k; + } - static inline batch_type reduce(const batch_type& a, batch_type& x) - { - batch_type k = nearbyint(a); - x = (a - k); - return k; + static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept + { + return batch_type(1.) + x + x * c / (batch_type(2.) 
- c); + } + }; + + template + inline batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type x; + batch_type k = reducer_t::reduce(self, x); + x = reducer_t::approx(x); + x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); + x = select(self >= reducer_t::maxlog(), constants::infinity(), x); + return x; + } + + template + inline batch exp(batch const& self) noexcept + { + using batch_type = batch; + using reducer_t = exp_reduction; + batch_type hi, lo, x; + batch_type k = reducer_t::reduce(self, hi, lo, x); + batch_type c = reducer_t::approx(x); + c = reducer_t::finalize(x, c, hi, lo); + c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); + c = select(self >= reducer_t::maxlog(), constants::infinity(), c); + return c; } - }; + } - template - struct exp_reduction : exp_reduction_base, exp_tag> + template + inline batch exp(batch const& self, requires_arch) noexcept { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) - { - batch_type t = x * x; - return fnma(t, - detail::horner(t), - x); - } - - static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) - { - batch_type k = nearbyint(constants::invlog_2() * a); - hi = fnma(k, constants::log_2hi(), a); - lo = k * constants::log_2lo(); - x = hi - lo; - return k; - } - - static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) - { - return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi)); - } - }; + return detail::exp(self); + } - template - struct exp_reduction : exp_reduction_base, exp10_tag> + template + inline batch, A> exp(batch, A> const& self, requires_arch) noexcept { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) - { - batch_type xx = x * x; - batch_type px = x * detail::horner(xx); - batch_type x2 = px / (detail::horner1(xx) - - px); - return ++(x2 + x2); - } - - static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) - { - batch_type k = nearbyint(constants::invlog10_2() * a); - x = fnma(k, constants::log10_2hi(), a); - x = fnma(k, constants::log10_2lo(), x); - return k; - } - - static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) - { - return c; - } - }; + using batch_type = batch, A>; + auto isincos = sincos(self.imag()); + return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); + } - template - struct exp_reduction : exp_reduction_base, exp2_tag> + // exp10 + template + inline batch exp10(batch const& self, requires_arch) noexcept { - using batch_type = batch; - static inline batch_type approx(const batch_type& x) - { - batch_type t = x * x; - return fnma(t, - detail::horner(t), - x); - } - - static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) - { - batch_type k = nearbyint(a); - x = (a - k) * constants::log_2(); - return k; - } - - static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) - { - return batch_type(1.) + x + x * c / (batch_type(2.) 
- c); - } - }; - - template batch exp(batch const& self) { - using batch_type = batch; - using reducer_t = exp_reduction; - batch_type x; - batch_type k = reducer_t::reduce(self, x); - x = reducer_t::approx(x); - x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); - x = select(self >= reducer_t::maxlog(), constants::infinity(), x); - return x; - } - template batch exp(batch const& self) { - using batch_type = batch; - using reducer_t = exp_reduction; - batch_type hi, lo, x; - batch_type k = reducer_t::reduce(self, hi, lo, x); - batch_type c = reducer_t::approx(x); - c = reducer_t::finalize(x, c, hi, lo); - c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); - c = select(self >= reducer_t::maxlog(), constants::infinity(), c); - return c; - } - } - - template batch exp(batch const& self, requires_arch) { - return detail::exp(self); - } - - template batch, A> exp(batch, A> const& self, requires_arch) { - using batch_type = batch, A>; - auto isincos = sincos(self.imag()); - return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); - } - - // exp10 - template batch exp10(batch const& self, requires_arch) { - return detail::exp(self); - } + return detail::exp(self); + } - // exp2 - template batch exp2(batch const& self, requires_arch) { - return detail::exp(self); - } + // exp2 + template + inline batch exp2(batch const& self, requires_arch) noexcept + { + return detail::exp(self); + } - // expm1 - namespace detail { + // expm1 + namespace detail + { /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ /* * ==================================================== @@ -840,20 +884,20 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - template - static inline batch expm1(const batch& a) + template + static inline batch expm1(const batch& a) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); batch_type hx = x * batch_type(0.5); batch_type hxs = x * hx; batch_type r = detail::horner(hxs); + 0X3F800000UL, // 1 + 0XBD08887FUL, // -3.3333298E-02 + 0X3ACF6DB4UL // 1.582554 + >(hxs); batch_type t = fnma(r, hx, batch_type(3.)); batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t)); e = fms(x, e, hxs); @@ -864,22 +908,22 @@ namespace xsimd { return ldexp(y, ik); } - template - static inline batch expm1(const batch& a) + template + static inline batch expm1(const batch& a) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type hi = fnma(k, constants::log_2hi(), a); batch_type lo = k * constants::log_2lo(); batch_type x = hi - lo; batch_type hxs = x * x * batch_type(0.5); batch_type r = detail::horner(hxs); + 0X3FF0000000000000ULL, + 0XBFA11111111110F4ULL, + 0X3F5A01A019FE5585ULL, + 0XBF14CE199EAADBB7ULL, + 0X3ED0CFCA86E65239ULL, + 0XBE8AFDB76E09C32DULL>(hxs); batch_type t = batch_type(3.) 
- r * batch_type(0.5) * x; batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); batch_type c = (hi - x) - lo; @@ -893,946 +937,973 @@ namespace xsimd { return ldexp(y, ik); } - } + } - template batch expm1(batch const& self, requires_arch) { - using batch_type = batch; - return select(self < constants::logeps(), - batch_type(-1.), - select(self > constants::maxlog(), - constants::infinity(), - detail::expm1(self))); - } + template + inline batch expm1(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return select(self < constants::logeps(), + batch_type(-1.), + select(self > constants::maxlog(), + constants::infinity(), + detail::expm1(self))); + } - template - batch, A> expm1(const batch, A>& z, requires_arch) - { - using batch_type = batch, A>; - using real_batch = typename batch_type::real_batch; - real_batch isin = sin(z.imag()); - real_batch rem1 = expm1(z.real()); - real_batch re = rem1 + 1.; - real_batch si = sin(z.imag() * 0.5); - return {rem1 - 2. * re * si * si, re * isin}; - } + template + inline batch, A> expm1(const batch, A>& z, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + real_batch isin = sin(z.imag()); + real_batch rem1 = expm1(z.real()); + real_batch re = rem1 + 1.; + real_batch si = sin(z.imag() * 0.5); + return { rem1 - 2. * re * si * si, re * isin }; + } - // fdim - template batch fdim(batch const& self, batch const& other, requires_arch) { - return fmax(batch(0), self - other); - } + // fdim + template + inline batch fdim(batch const& self, batch const& other, requires_arch) noexcept + { + return fmax(batch(0), self - other); + } - // fmod - template batch fmod(batch const& self, batch const& other, requires_arch) { - return fnma(trunc(self / other), other, self); - } + // fmod + template + inline batch fmod(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(trunc(self / other), other, self); + } - // frexp - /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - batch frexp(const batch& self, batch, A>& exp, requires_arch) { - using batch_type = batch; - using i_type = batch, A>; - i_type m1f = constants::mask1frexp(); - i_type r1 = m1f & ::xsimd::bitwise_cast(self); - batch_type x = self & ::xsimd::bitwise_cast(~m1f); - exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); - exp = select(bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); - return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); - } + // frexp + /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
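fdim() and fmod() above are one-liners over the definitions max(0, x − y) and x − trunc(x/y)·y; a plain scalar rendering of the latter, as a hypothetical helper:

#include <cmath>

// Scalar rendering of the fmod() kernel above: fnma(trunc(x / y), y, x).
// Illustrative only: because x / y is rounded before trunc(), this is not
// bit-identical to std::fmod in every corner case, but it is the same
// truncated-division definition the vector kernel uses.
double fmod_sketch(double x, double y)
{
    return std::fma(-std::trunc(x / y), y, x);
}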
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept + { + using batch_type = batch; + using i_type = batch, A>; + i_type m1f = constants::mask1frexp(); + i_type r1 = m1f & ::xsimd::bitwise_cast(self); + batch_type x = self & ::xsimd::bitwise_cast(~m1f); + exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); + exp = select(bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); + return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); + } - // from bool - template - batch from_bool(batch_bool const& self, requires_arch) { - return batch(self.data) & batch(1); - } + // from bool + template + inline batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return batch(self.data) & batch(1); + } - // hadd - template std::complex hadd(batch, A> const& self, requires_arch) { - return {hadd(self.real()), hadd(self.imag())}; - } + // hadd + template + inline std::complex hadd(batch, A> const& self, requires_arch) noexcept + { + return { hadd(self.real()), hadd(self.imag()) }; + } - // horner - template - batch horner(const batch& self) { - return detail::horner, Coefs...>(self); - } + // horner + template + inline batch horner(const batch& self) noexcept + { + return detail::horner, Coefs...>(self); + } - // hypot - template batch hypot(batch const& self, batch const& other, requires_arch) { - return sqrt(fma(self, self, other * other)); - } + // hypot + template + inline batch hypot(batch const& self, batch const& other, requires_arch) noexcept + { + return sqrt(fma(self, self, other * other)); + } - // ipow - template batch ipow(batch const& self, ITy other, requires_arch) { - return ::xsimd::detail::ipow(self, other); - } + // ipow + template + inline batch ipow(batch const& self, ITy other, requires_arch) noexcept + { + return ::xsimd::detail::ipow(self, other); + } + // ldexp + /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept + { + using batch_type = batch; + using itype = as_integer_t; + itype ik = other + constants::maxexponent(); + ik = ik << constants::nmb(); + return self * ::xsimd::bitwise_cast(ik); + } - // ldexp - /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
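The frexp()/ldexp() pair above manipulates the IEEE exponent field directly (mask1frexp/mask2frexp, shift by nmb(), bias by maxexponent()); the observable contract is the usual <cmath> round trip, sketched below with a hypothetical checker:

#include <cassert>
#include <cmath>

// Contract implemented by the frexp()/ldexp() pair above, stated with the
// scalar <cmath> functions: frexp splits a finite x into m * 2^e with
// |m| in [0.5, 1), and ldexp reassembles it exactly.
void frexp_ldexp_roundtrip(double x) // precondition: x is finite
{
    int e = 0;
    double m = std::frexp(x, &e);
    assert(x == 0.0 || (std::fabs(m) >= 0.5 && std::fabs(m) < 1.0));
    assert(std::ldexp(m, e) == x); // exact: only the exponent field changes
}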
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - batch ldexp(const batch& self, const batch, A>& other, requires_arch) { - using batch_type = batch; - using itype = as_integer_t; - itype ik = other + constants::maxexponent(); - ik = ik << constants::nmb(); - return self * ::xsimd::bitwise_cast(ik); - } + // lgamma + template + inline batch lgamma(batch const& self, requires_arch) noexcept; - // lgamma - template batch lgamma(batch const& self, requires_arch); - - namespace detail { - /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - static inline batch gammalnB(const batch& x) + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch gammalnB(const batch& x) noexcept { return horner, - 0x3ed87730, // 4.227843421859038E-001 - 0x3ea51a64, // 3.224669577325661E-001, - 0xbd89f07e, // -6.735323259371034E-002, - 0x3ca89ed8, // 2.058355474821512E-002, - 0xbbf164fd, // -7.366775108654962E-003, - 0x3b3ba883, // 2.863437556468661E-003, - 0xbaabeab1, // -1.311620815545743E-003, - 0x3a1ebb94 // 6.055172732649237E-004 + 0x3ed87730, // 4.227843421859038E-001 + 0x3ea51a64, // 3.224669577325661E-001, + 0xbd89f07e, // -6.735323259371034E-002, + 0x3ca89ed8, // 2.058355474821512E-002, + 0xbbf164fd, // -7.366775108654962E-003, + 0x3b3ba883, // 2.863437556468661E-003, + 0xbaabeab1, // -1.311620815545743E-003, + 0x3a1ebb94 // 6.055172732649237E-004 >(x); } - template - static inline batch gammalnC(const batch& x) + template + static inline batch gammalnC(const batch& x) noexcept { return horner, - 0xbf13c468, // -5.772156501719101E-001 - 0x3f528d34, // 8.224670749082976E-001, - 0xbecd27a8, // -4.006931650563372E-001, - 0x3e8a898b, // 2.705806208275915E-001, - 0xbe53c04f, // -2.067882815621965E-001, - 0x3e2d4dab, // 1.692415923504637E-001, - 0xbe22d329, // -1.590086327657347E-001, - 0x3e0c3c4f // 1.369488127325832E-001 + 0xbf13c468, // -5.772156501719101E-001 + 0x3f528d34, // 8.224670749082976E-001, + 0xbecd27a8, // -4.006931650563372E-001, + 0x3e8a898b, // 2.705806208275915E-001, + 0xbe53c04f, // -2.067882815621965E-001, + 0x3e2d4dab, // 1.692415923504637E-001, + 0xbe22d329, // -1.590086327657347E-001, + 0x3e0c3c4f // 1.369488127325832E-001 >(x); } - template - static inline batch gammaln2(const batch& x) + template + static inline batch gammaln2(const batch& x) noexcept { return horner, - 0x3daaaa94, // 8.333316229807355E-002f - 0xbb358701, // -2.769887652139868E-003f, - 0x3a31fd69 // 6.789774945028216E-004f + 0x3daaaa94, // 8.333316229807355E-002f + 0xbb358701, // -2.769887652139868E-003f, + 0x3a31fd69 // 6.789774945028216E-004f >(x); } - template - static inline batch gammaln1(const batch& x) + + template + static inline batch gammaln1(const batch& x) noexcept { return horner, - 0xc12a0c675418055eull, // -8.53555664245765465627E5 - 0xc13a45890219f20bull, // -1.72173700820839662146E6, - 
0xc131bc82f994db51ull, // -1.16237097492762307383E6, - 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, - 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, - 0xc09589018ff36761ull // -1.37825152569120859100E3 - >(x) / - horner, - 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 - 0xc1435255892ff34cull, // -2.53252307177582951285E6, - 0xc131628671950043ull, // -1.13933444367982507207E6, - 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, - 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, - 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, - 0x3ff0000000000000ull // 1.00000000000000000000E0 - >(x); + 0xc12a0c675418055eull, // -8.53555664245765465627E5 + 0xc13a45890219f20bull, // -1.72173700820839662146E6, + 0xc131bc82f994db51ull, // -1.16237097492762307383E6, + 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, + 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, + 0xc09589018ff36761ull // -1.37825152569120859100E3 + >(x) + / horner, + 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 + 0xc1435255892ff34cull, // -2.53252307177582951285E6, + 0xc131628671950043ull, // -1.13933444367982507207E6, + 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, + 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, + 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, + 0x3ff0000000000000ull // 1.00000000000000000000E0 + >(x); } - template - static inline batch gammalnA(const batch& x) + template + static inline batch gammalnA(const batch& x) noexcept { return horner, - 0x3fb555555555554bull, // 8.33333333333331927722E-2 - 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, - 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, - 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, - 0x3f4a985027336661ull // 8.11614167470508450300E-4 + 0x3fb555555555554bull, // 8.33333333333331927722E-2 + 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, + 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, + 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, + 0x3f4a985027336661ull // 8.11614167470508450300E-4 >(x); } - /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - struct lgamma_impl; - template - struct lgamma_impl> - { - using batch_type = batch; - static inline batch_type compute(const batch_type& a) + /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
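The lgamma_impl specializations below handle negative arguments through the reflection formula Γ(x)·Γ(1−x) = π / sin(πx); an equivalent scalar cross-check (hypothetical helper; the kernel folds the factors differently but relies on the same identity):

#include <cmath>

// Scalar cross-check of the reflection formula behind the negative() branch
// below: Gamma(x) * Gamma(1 - x) = pi / sin(pi * x), so for non-integer x < 0
//   lgamma(x) = log(pi) - log(|sin(pi * x)|) - lgamma(1 - x).
// The vector kernel folds the |x| factor in via q * sin(pi * z) and
// lgamma(|x|) instead, but the identity is the same.
double lgamma_reflection_sketch(double x) // precondition: x < 0, x not integral
{
    const double pi = 3.14159265358979323846;
    return std::log(pi) - std::log(std::fabs(std::sin(pi * x))) - std::lgamma(1.0 - x);
}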
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct lgamma_impl; + + template + struct lgamma_impl> { - auto inf_result = (a <= batch_type(0.)) && is_flint(a); - batch_type x = select(inf_result, constants::nan(), a); - batch_type q = abs(x); + using batch_type = batch; + static inline batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES - inf_result = (x == constants::infinity()) || inf_result; + inf_result = (x == constants::infinity()) || inf_result; #endif - auto ltza = a < batch_type(0.); - batch_type r; - batch_type r1 = other(q); - if (any(ltza)) - { - r = select(inf_result, constants::infinity(), negative(q, r1)); - if (all(ltza)) - return r; + auto ltza = a < batch_type(0.); + batch_type r; + batch_type r1 = other(q); + if (any(ltza)) + { + r = select(inf_result, constants::infinity(), negative(q, r1)); + if (all(ltza)) + return r; + } + batch_type r2 = select(ltza, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } - batch_type r2 = select(ltza, r, r1); - return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); - } - private: - - static inline batch_type negative(const batch_type& q, const batch_type& w) - { - batch_type p = floor(q); - batch_type z = q - p; - auto test2 = z < batch_type(0.5); - z = select(test2, z - batch_type(1.), z); - z = q * sin(z, trigo_pi_tag()); - return -log(constants::invpi() * abs(z)) - w; - } + private: + static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept + { + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = z < batch_type(0.5); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + return -log(constants::invpi() * abs(z)) - w; + } - static inline batch_type other(const batch_type& x) - { - auto xlt650 = (x < batch_type(6.5)); - batch_type r0x = x; - batch_type r0z = x; - batch_type r0s = batch_type(1.); - batch_type r1 = batch_type(0.); - batch_type p = constants::nan(); - if (any(xlt650)) + static inline batch_type other(const batch_type& x) noexcept { - batch_type z = batch_type(1.); - batch_type tx = select(xlt650, x, batch_type(0.)); - batch_type nx = batch_type(0.); - const batch_type _075 = batch_type(0.75); - const batch_type _150 = batch_type(1.50); - const batch_type _125 = batch_type(1.25); - const batch_type _250 = batch_type(2.50); - auto xge150 = (x >= _150); - auto txgt250 = (tx > _250); - - // x >= 1.5 - while (any(xge150 && txgt250)) - { - nx = select(txgt250, nx - batch_type(1.), nx); - tx = select(txgt250, x + nx, tx); - z = select(txgt250, z * tx, z); - txgt250 = (tx > _250); - } - r0x = select(xge150, x + nx - batch_type(2.), x); - r0z = select(xge150, z, r0z); - r0s = select(xge150, batch_type(1.), r0s); - - // x >= 1.25 && x < 1.5 - auto xge125 = (x >= _125); - auto xge125t = xge125 && !xge150; - if (any(xge125)) + auto xlt650 = (x < batch_type(6.5)); + batch_type r0x = x; + batch_type r0z = x; + batch_type r0s = batch_type(1.); + batch_type r1 = batch_type(0.); + batch_type p = constants::nan(); + if (any(xlt650)) { - r0x = select(xge125t, x - batch_type(1.), r0x); - r0z = select(xge125t, z * x, r0z); - r0s = select(xge125t, batch_type(-1.), r0s); - } + batch_type z = 
batch_type(1.); + batch_type tx = select(xlt650, x, batch_type(0.)); + batch_type nx = batch_type(0.); + const batch_type _075 = batch_type(0.75); + const batch_type _150 = batch_type(1.50); + const batch_type _125 = batch_type(1.25); + const batch_type _250 = batch_type(2.50); + auto xge150 = (x >= _150); + auto txgt250 = (tx > _250); + + // x >= 1.5 + while (any(xge150 && txgt250)) + { + nx = select(txgt250, nx - batch_type(1.), nx); + tx = select(txgt250, x + nx, tx); + z = select(txgt250, z * tx, z); + txgt250 = (tx > _250); + } + r0x = select(xge150, x + nx - batch_type(2.), x); + r0z = select(xge150, z, r0z); + r0s = select(xge150, batch_type(1.), r0s); + + // x >= 1.25 && x < 1.5 + auto xge125 = (x >= _125); + auto xge125t = xge125 && !xge150; + if (any(xge125)) + { + r0x = select(xge125t, x - batch_type(1.), r0x); + r0z = select(xge125t, z * x, r0z); + r0s = select(xge125t, batch_type(-1.), r0s); + } - // x >= 0.75 && x < 1.5 - batch_bool kernelC(false); - auto xge075 = (x >= _075); - auto xge075t = xge075 && !xge125; - if (any(xge075t)) - { - kernelC = xge075t; - r0x = select(xge075t, x - batch_type(1.), x); - r0z = select(xge075t, batch_type(1.), r0z); - r0s = select(xge075t, batch_type(-1.), r0s); - p = gammalnC(r0x); - } + // x >= 0.75 && x < 1.5 + batch_bool kernelC(false); + auto xge075 = (x >= _075); + auto xge075t = xge075 && !xge125; + if (any(xge075t)) + { + kernelC = xge075t; + r0x = select(xge075t, x - batch_type(1.), x); + r0z = select(xge075t, batch_type(1.), r0z); + r0s = select(xge075t, batch_type(-1.), r0s); + p = gammalnC(r0x); + } - // tx < 1.5 && x < 0.75 - auto txlt150 = (tx < _150) && !xge075; - if (any(txlt150)) - { - auto orig = txlt150; - while (any(txlt150)) + // tx < 1.5 && x < 0.75 + auto txlt150 = (tx < _150) && !xge075; + if (any(txlt150)) { - z = select(txlt150, z * tx, z); - nx = select(txlt150, nx + batch_type(1.), nx); - tx = select(txlt150, x + nx, tx); - txlt150 = (tx < _150) && !xge075; + auto orig = txlt150; + while (any(txlt150)) + { + z = select(txlt150, z * tx, z); + nx = select(txlt150, nx + batch_type(1.), nx); + tx = select(txlt150, x + nx, tx); + txlt150 = (tx < _150) && !xge075; + } + r0x = select(orig, r0x + nx - batch_type(2.), r0x); + r0z = select(orig, z, r0z); + r0s = select(orig, batch_type(-1.), r0s); } - r0x = select(orig, r0x + nx - batch_type(2.), r0x); - r0z = select(orig, z, r0z); - r0s = select(orig, batch_type(-1.), r0s); + p = select(kernelC, p, gammalnB(r0x)); + if (all(xlt650)) + return fma(r0x, p, r0s * log(abs(r0z))); } - p = select(kernelC, p, gammalnB(r0x)); - if (all(xlt650)) - return fma(r0x, p, r0s * log(abs(r0z))); + r0z = select(xlt650, abs(r0z), x); + batch_type m = log(r0z); + r1 = fma(r0x, p, r0s * m); + batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); + r2 += gammaln2(batch_type(1.) / (x * x)) / x; + return select(xlt650, r1, r2); } - r0z = select(xlt650, abs(r0z), x); - batch_type m = log(r0z); - r1 = fma(r0x, p, r0s * m); - batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); - r2 += gammaln2(batch_type(1.) 
/ (x * x)) / x; - return select(xlt650, r1, r2); - } - }; - - template - struct lgamma_impl> - { - using batch_type = batch; + }; - static inline batch_type compute(const batch_type& a) + template + struct lgamma_impl> { - auto inf_result = (a <= batch_type(0.)) && is_flint(a); - batch_type x = select(inf_result, constants::nan(), a); - batch_type q = abs(x); + using batch_type = batch; + + static inline batch_type compute(const batch_type& a) noexcept + { + auto inf_result = (a <= batch_type(0.)) && is_flint(a); + batch_type x = select(inf_result, constants::nan(), a); + batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES - inf_result = (q == constants::infinity()); + inf_result = (q == constants::infinity()); #endif - auto test = (a < batch_type(-34.)); - batch_type r = constants::nan(); - if (any(test)) - { - r = large_negative(q); - if (all(test)) - return select(inf_result, constants::nan(), r); + auto test = (a < batch_type(-34.)); + batch_type r = constants::nan(); + if (any(test)) + { + r = large_negative(q); + if (all(test)) + return select(inf_result, constants::nan(), r); + } + batch_type r1 = other(a); + batch_type r2 = select(test, r, r1); + return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } - batch_type r1 = other(a); - batch_type r2 = select(test, r, r1); - return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); - } - - private: - static inline batch_type large_negative(const batch_type& q) - { - batch_type w = lgamma(q); - batch_type p = floor(q); - batch_type z = q - p; - auto test2 = (z < batch_type(0.5)); - z = select(test2, z - batch_type(1.), z); - z = q * sin(z, trigo_pi_tag()); - z = abs(z); - return constants::logpi() - log(z) - w; - } + private: + static inline batch_type large_negative(const batch_type& q) noexcept + { + batch_type w = lgamma(q); + batch_type p = floor(q); + batch_type z = q - p; + auto test2 = (z < batch_type(0.5)); + z = select(test2, z - batch_type(1.), z); + z = q * sin(z, trigo_pi_tag()); + z = abs(z); + return constants::logpi() - log(z) - w; + } - static inline batch_type other(const batch_type& xx) - { - batch_type x = xx; - auto test = (x < batch_type(13.)); - batch_type r1 = batch_type(0.); - if (any(test)) + static inline batch_type other(const batch_type& xx) noexcept { - batch_type z = batch_type(1.); - batch_type p = batch_type(0.); - batch_type u = select(test, x, batch_type(0.)); - auto test1 = (u >= batch_type(3.)); - while (any(test1)) + batch_type x = xx; + auto test = (x < batch_type(13.)); + batch_type r1 = batch_type(0.); + if (any(test)) { - p = select(test1, p - batch_type(1.), p); - u = select(test1, x + p, u); - z = select(test1, z * u, z); - test1 = (u >= batch_type(3.)); - } + batch_type z = batch_type(1.); + batch_type p = batch_type(0.); + batch_type u = select(test, x, batch_type(0.)); + auto test1 = (u >= batch_type(3.)); + while (any(test1)) + { + p = select(test1, p - batch_type(1.), p); + u = select(test1, x + p, u); + z = select(test1, z * u, z); + test1 = (u >= batch_type(3.)); + } - auto test2 = (u < batch_type(2.)); - while (any(test2)) - { - z = select(test2, z / u, z); - p = select(test2, p + batch_type(1.), p); - u = select(test2, x + p, u); - test2 = (u < batch_type(2.)); - } + auto test2 = (u < batch_type(2.)); + while (any(test2)) + { + z = select(test2, z / u, z); + p = select(test2, p + batch_type(1.), p); + u = select(test2, x + p, u); + test2 = (u < batch_type(2.)); + } - z = 
abs(z); - x += p - batch_type(2.); - r1 = x * gammaln1(x) + log(z); - if (all(test)) - return r1; + z = abs(z); + x += p - batch_type(2.); + r1 = x * gammaln1(x) + log(z); + if (all(test)) + return r1; + } + batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); + batch_type p = batch_type(1.) / (xx * xx); + r2 += gammalnA(p) / xx; + return select(test, r1, r2); } - batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); - batch_type p = batch_type(1.) / (xx * xx); - r2 += gammalnA(p) / xx; - return select(test, r1, r2); - } - }; - } - - template batch lgamma(batch const& self, requires_arch) { - return detail::lgamma_impl>::compute(self); - } + }; + } + template + inline batch lgamma(batch const& self, requires_arch) noexcept + { + return detail::lgamma_impl>::compute(self); + } - // log - /* origin: boost/simd/arch/common/simd/function/log.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch log(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; - batch_type x = self; - i_type k(0); - auto isnez = (self != batch_type(0.)); + // log + /* origin: boost/simd/arch/common/simd/function/log.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(23), k); - x = select(test, x * batch_type(8388608ul), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(23), k); + x = select(test, x * batch_type(8388608ul), x); + } #endif - i_type ix = ::xsimd::bitwise_cast(x); - ix += 0x3f800000 - 0x3f3504f3; - k += (ix >> 23) - 0x7f; - ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; - x = ::xsimd::bitwise_cast(ix); - batch_type f = --x; - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type dk = to_float(k); - batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } - template batch log(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; + template + inline batch log(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; - batch_type x = self; - i_type hx = ::xsimd::bitwise_cast(x) >> 32; - i_type k(0); - auto isnez = (self != batch_type(0.)); + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(54), k); - x = select(test, x * batch_type(18014398509481984ull), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } #endif - hx += 0x3ff00000 - 0x3fe6a09e; - k += (hx >> 20) - 0x3ff; - batch_type dk = to_float(k); - hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; - x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); - - batch_type f = --x; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + batch_type dk = to_float(k); + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + + batch_type f = --x; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } - template - batch, A> log(const batch, A>& z, requires_arch) - { - return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); - } + template + inline batch, A> log(const batch, A>& z, requires_arch) noexcept + { + return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); + } - // log2 - template batch log2(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; - batch_type x = self; - i_type k(0); - auto isnez = (self != batch_type(0.)); + // log2 + template + inline batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(25), k); - x = select(test, x * batch_type(33554432ul), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } #endif - i_type ix = ::xsimd::bitwise_cast(x); - ix += 0x3f800000 - 0x3f3504f3; - k += (ix >> 23) - 0x7f; - ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; - x = ::xsimd::bitwise_cast(ix); - batch_type f = --x; - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t1 + t2; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type dk = to_float(k); - batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t1 + t2; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } - template batch log2(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; - batch_type x = self; - i_type hx = ::xsimd::bitwise_cast(x) >> 32; - i_type k(0); - auto isnez = (self != batch_type(0.)); + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log2(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(54), k); - x = select(test, x * batch_type(18014398509481984ull), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } #endif - hx += 0x3ff00000 - 0x3fe6a09e; - k += (hx >> 20) - 0x3ff; - hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; - x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); - batch_type f = --x; - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type hi = f - hfsq; - hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); - batch_type lo = fma(s, hfsq + R, f - hi - hfsq); - batch_type val_hi = hi * constants::invlog_2hi(); - batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); - batch_type dk = to_float(k); - batch_type w1 = dk + val_hi; - val_lo += (dk - w1) + val_hi; - val_hi = w1; - batch_type r = val_lo + val_hi; + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); + batch_type lo = fma(s, hfsq + R, f - hi - hfsq); + batch_type val_hi = hi * constants::invlog_2hi(); + batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); + batch_type dk = to_float(k); + batch_type w1 = dk + val_hi; + val_lo += (dk - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } - namespace detail { - template - inline batch logN_complex_impl(const batch& z, typename batch::value_type base) + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + namespace detail { - using batch_type = batch; - using rv_type = typename batch_type::value_type; - return log(z) / batch_type(rv_type(base)); + template + inline batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept + { + using batch_type = batch; + using rv_type = typename batch_type::value_type; + return log(z) / batch_type(rv_type(base)); + } } - } - template batch, A> log2(batch, A> const& self, requires_arch) { - return detail::logN_complex_impl(self, std::log(2)); - } + template + inline batch, A> log2(batch, A> const& self, requires_arch) noexcept + { + return detail::logN_complex_impl(self, std::log(2)); + } - // log10 - /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ - /* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - template batch log10(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type - ivln10hi(4.3432617188e-01f), - ivln10lo(-3.1689971365e-05f), - log10_2hi(3.0102920532e-01f), - log10_2lo(7.9034151668e-07f); - using i_type = as_integer_t; - batch_type x = self; - i_type k(0); - auto isnez = (self != batch_type(0.)); + // log10 + /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ + /* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + */ + template + inline batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.3432617188e-01f), + ivln10lo(-3.1689971365e-05f), + log10_2hi(3.0102920532e-01f), + log10_2lo(7.9034151668e-07f); + using i_type = as_integer_t; + batch_type x = self; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(25), k); - x = select(test, x * batch_type(33554432ul), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(25), k); + x = select(test, x * batch_type(33554432ul), x); + } #endif - i_type ix = ::xsimd::bitwise_cast(x); - ix += 0x3f800000 - 0x3f3504f3; - k += (ix >> 23) - 0x7f; - ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; - x = ::xsimd::bitwise_cast(ix); - batch_type f = --x; - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type dk = to_float(k); - batch_type hfsq = batch_type(0.5) * f * f; - batch_type hibits = f - hfsq; - hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); - batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); - batch_type r = fma(dk, log10_2hi, - fma(hibits, ivln10hi, - fma(lobits, ivln10hi, - fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); + i_type ix = ::xsimd::bitwise_cast(x); + ix += 0x3f800000 - 0x3f3504f3; + k += (ix >> 23) - 0x7f; + ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; + x = ::xsimd::bitwise_cast(ix); + batch_type f = --x; + batch_type s = f / (batch_type(2.) 
+ f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hibits = f - hfsq; + hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); + batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); + batch_type r = fma(dk, log10_2hi, + fma(hibits, ivln10hi, + fma(lobits, ivln10hi, + fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } - template batch log10(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type - ivln10hi(4.34294481878168880939e-01), - ivln10lo(2.50829467116452752298e-11), - log10_2hi(3.01029995663611771306e-01), - log10_2lo(3.69423907715893078616e-13); - using i_type = as_integer_t; - batch_type x = self; - i_type hx = ::xsimd::bitwise_cast(x) >> 32; - i_type k(0); - auto isnez = (self != batch_type(0.)); + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log10(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type + ivln10hi(4.34294481878168880939e-01), + ivln10lo(2.50829467116452752298e-11), + log10_2hi(3.01029995663611771306e-01), + log10_2lo(3.69423907715893078616e-13); + using i_type = as_integer_t; + batch_type x = self; + i_type hx = ::xsimd::bitwise_cast(x) >> 32; + i_type k(0); + auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS - auto test = (self < constants::smallestposval()) && isnez; - if (any(test)) - { - k = select(bool_cast(test), k - i_type(54), k); - x = select(test, x * batch_type(18014398509481984ull), x); - } + auto test = (self < constants::smallestposval()) && isnez; + if (any(test)) + { + k = select(bool_cast(test), k - i_type(54), k); + x = select(test, x * batch_type(18014398509481984ull), x); + } #endif - hx += 0x3ff00000 - 0x3fe6a09e; - k += (hx >> 20) - 0x3ff; - hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; - x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); - batch_type f = --x; - batch_type dk = to_float(k); - batch_type s = f / (batch_type(2.) 
+ f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type hi = f - hfsq; - hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); - batch_type lo = f - hi - hfsq + s * (hfsq + R); - batch_type val_hi = hi * ivln10hi; - batch_type y = dk * log10_2hi; - batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; - batch_type w1 = y + val_hi; - val_lo += (y - w1) + val_hi; - val_hi = w1; - batch_type r = val_lo + val_hi; + hx += 0x3ff00000 - 0x3fe6a09e; + k += (hx >> 20) - 0x3ff; + hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; + x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); + batch_type f = --x; + batch_type dk = to_float(k); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type hi = f - hfsq; + hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); + batch_type lo = f - hi - hfsq + s * (hfsq + R); + batch_type val_hi = hi * ivln10hi; + batch_type y = dk * log10_2hi; + batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; + batch_type w1 = y + val_hi; + val_lo += (y - w1) + val_hi; + val_hi = w1; + batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(self >= batch_type(0.)), constants::nan(), zz); - } + return select(!(self >= batch_type(0.)), constants::nan(), zz); + } template - batch, A> log10(const batch, A>& z, requires_arch) - { - return detail::logN_complex_impl(z, std::log(10)); - } + inline batch, A> log10(const batch, A>& z, requires_arch) noexcept + { + return detail::logN_complex_impl(z, std::log(10)); + } - // log1p - /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch log1p(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; - const batch_type uf = self + batch_type(1.); - auto isnez = (uf != batch_type(0.)); - i_type iu = ::xsimd::bitwise_cast(uf); - iu += 0x3f800000 - 0x3f3504f3; - i_type k = (iu >> 23) - 0x7f; - iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; - batch_type f = --(::xsimd::bitwise_cast(iu)); - batch_type s = f / (batch_type(2.) + f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type dk = to_float(k); - /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ - batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) 
- (uf - self), self - (uf - batch_type(1.))) / uf; - batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); + // log1p + /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type iu = ::xsimd::bitwise_cast(uf); + iu += 0x3f800000 - 0x3f3504f3; + i_type k = (iu >> 23) - 0x7f; + iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; + batch_type f = --(::xsimd::bitwise_cast(iu)); + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type dk = to_float(k); + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(uf >= batch_type(0.)), constants::nan(), zz); - } - template batch log1p(batch const& self, requires_arch) { - using batch_type = batch; - using i_type = as_integer_t; - const batch_type uf = self + batch_type(1.); - auto isnez = (uf != batch_type(0.)); - i_type hu = ::xsimd::bitwise_cast(uf) >> 32; - hu += 0x3ff00000 - 0x3fe6a09e; - i_type k = (hu >> 20) - 0x3ff; - /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ - batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; - hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; - batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); - f = --f; - batch_type hfsq = batch_type(0.5) * f * f; - batch_type s = f / (batch_type(2.) 
+ f); - batch_type z = s * s; - batch_type w = z * z; - batch_type t1 = w * detail::horner(w); - batch_type t2 = z * detail::horner(w); - batch_type R = t2 + t1; - batch_type dk = to_float(k); - batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } + + template + inline batch log1p(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + using i_type = as_integer_t; + const batch_type uf = self + batch_type(1.); + auto isnez = (uf != batch_type(0.)); + i_type hu = ::xsimd::bitwise_cast(uf) >> 32; + hu += 0x3ff00000 - 0x3fe6a09e; + i_type k = (hu >> 20) - 0x3ff; + /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ + batch_type c = select(bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; + hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; + batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); + f = --f; + batch_type hfsq = batch_type(0.5) * f * f; + batch_type s = f / (batch_type(2.) + f); + batch_type z = s * s; + batch_type w = z * z; + batch_type t1 = w * detail::horner(w); + batch_type t2 = z * detail::horner(w); + batch_type R = t2 + t1; + batch_type dk = to_float(k); + batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES - batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); + batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else - batch_type zz = select(isnez, r, constants::minusinfinity()); + batch_type zz = select(isnez, r, constants::minusinfinity()); #endif - return select(!(uf >= batch_type(0.)), constants::nan(), zz); - } - - template batch, A> log1p(batch, A> const& self, requires_arch) { - using batch_type = batch, A>; - using real_batch = typename batch_type::real_batch; - batch_type u = 1 + self; - batch_type logu = log(u); - return select(u == batch_type(1.), - self, - select(u.real() <= real_batch(0.), - logu, - logu * self / (u - batch_type(1.)))); - } - + return select(!(uf >= batch_type(0.)), constants::nan(), zz); + } - // mod - template::value, void>::type> - batch mod(batch const& self, batch const& other, requires_arch) { - return detail::apply([](T x, T y) -> T { return x % y;}, self, other); - } + template + inline batch, A> log1p(batch, A> const& self, requires_arch) noexcept + { + using batch_type = batch, A>; + using real_batch = typename batch_type::real_batch; + batch_type u = 1 + self; + batch_type logu = log(u); + return select(u == batch_type(1.), + self, + select(u.real() <= real_batch(0.), + logu, + logu * self / (u - batch_type(1.)))); + } + // mod + template ::value, void>::type> + inline batch mod(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return x % y; }, + self, other); + } - // nearbyint - template::value, void>::type> - batch nearbyint(batch const& self, requires_arch) { - return self; - } - namespace detail { - template batch nearbyintf(batch const& self) { - using batch_type = batch; - batch_type s = bitofsign(self); - batch_type v = self ^ s; - batch_type t2n = constants::twotonmb(); - // Under fast-math, reordering is possible and the compiler optimizes d - // to v. 
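// A minimal scalar sketch (not the xsimd kernel, names are hypothetical) of the
// correction term used by the log1p kernels above: u = 1 + x is inexact, and
// c ~ log1p(x) - log(u) recovers the low-order bits lost when forming u.
// The kernel selects the branch from the binary exponent of u (k >= 2); the
// simplified test u >= 2 stands in for that here.
#include <cmath>

double log1p_sketch(double x)
{
    double u = 1.0 + x;
    if (u == 1.0)
        return x;                                 // 1 + x rounded back to 1: log1p(x) ~ x
    double c = (u >= 2.0) ? (1.0 - (u - x)) / u   // exponent grew: 1 - (u - x)
                          : (x - (u - 1.0)) / u;  // otherwise: x - (u - 1)
    return std::log(u) + c;                       // log(u) plus the rounding correction
}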
That's not what we want, so prevent compiler optimization here. - // FIXME: it may be better to emit a memory barrier here (?). + // nearbyint + template ::value, void>::type> + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return self; + } + namespace detail + { + template + inline batch nearbyintf(batch const& self) noexcept + { + using batch_type = batch; + batch_type s = bitofsign(self); + batch_type v = self ^ s; + batch_type t2n = constants::twotonmb(); + // Under fast-math, reordering is possible and the compiler optimizes d + // to v. That's not what we want, so prevent compiler optimization here. + // FIXME: it may be better to emit a memory barrier here (?). #ifdef __FAST_MATH__ - volatile batch_type d0 = v + t2n; - batch_type d = *(batch_type*)(void*)(&d0) - t2n; + volatile batch_type d0 = v + t2n; + batch_type d = *(batch_type*)(void*)(&d0) - t2n; #else - batch_type d0 = v + t2n; - batch_type d = d0 - t2n; + batch_type d0 = v + t2n; + batch_type d = d0 - t2n; #endif - return s ^ select(v < t2n, d, v); - } - } - template batch nearbyint(batch const& self, requires_arch) { - return detail::nearbyintf(self); - } - template batch nearbyint(batch const& self, requires_arch) { - return detail::nearbyintf(self); - } - - // nextafter - namespace detail - { - template ::value> - struct nextafter_kernel - { - using batch_type = batch; - - static inline batch_type next(batch_type const& b) noexcept - { - return b; - } - - static inline batch_type prev(batch_type const& b) noexcept - { - return b; + return s ^ select(v < t2n, d, v); } - }; - - template - struct bitwise_cast_batch; - + } template - struct bitwise_cast_batch + inline batch nearbyint(batch const& self, requires_arch) noexcept { - using type = batch; - }; - + return detail::nearbyintf(self); + } template - struct bitwise_cast_batch + inline batch nearbyint(batch const& self, requires_arch) noexcept { - using type = batch; - }; + return detail::nearbyintf(self); + } - template - struct nextafter_kernel + // nextafter + namespace detail { - using batch_type = batch; - using int_batch = typename bitwise_cast_batch::type; - using int_type = typename int_batch::value_type; + template ::value> + struct nextafter_kernel + { + using batch_type = batch; - static inline batch_type next(const batch_type& b) noexcept + static inline batch_type next(batch_type const& b) noexcept + { + return b; + } + + static inline batch_type prev(batch_type const& b) noexcept + { + return b; + } + }; + + template + struct bitwise_cast_batch; + + template + struct bitwise_cast_batch { - batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); - return select(b == constants::infinity(), b, n); - } + using type = batch; + }; - static inline batch_type prev(const batch_type& b) noexcept + template + struct bitwise_cast_batch { - batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); - return select(b == constants::minusinfinity(), b, p); - } - }; - } - template batch nextafter(batch const& from, batch const& to, requires_arch) { - using kernel = detail::nextafter_kernel; - return select(from == to, from, - select(to > from, kernel::next(from), kernel::prev(from))); - } + using type = batch; + }; + + template + struct nextafter_kernel + { + using batch_type = batch; + using int_batch = typename bitwise_cast_batch::type; + using int_type = typename int_batch::value_type; + static inline batch_type next(const batch_type& b) noexcept + { + batch_type n = 
::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); + return select(b == constants::infinity(), b, n); + } - // pow - /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch pow(batch const& self, batch const& other, requires_arch) { - using batch_type = batch; - auto negx = self < batch_type(0.); - batch_type z = exp(other * log(abs(self))); - z = select(is_odd(other) && negx, -z, z); - auto invalid = negx && !(is_flint(other) || isinf(other)); - return select(invalid, constants::nan(), z); - } + static inline batch_type prev(const batch_type& b) noexcept + { + batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); + return select(b == constants::minusinfinity(), b, p); + } + }; + } + template + inline batch nextafter(batch const& from, batch const& to, requires_arch) noexcept + { + using kernel = detail::nextafter_kernel; + return select(from == to, from, + select(to > from, kernel::next(from), kernel::prev(from))); + } + + // pow + /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch pow(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + auto negx = self < batch_type(0.); + batch_type z = exp(other * log(abs(self))); + z = select(is_odd(other) && negx, -z, z); + auto invalid = negx && !(is_flint(other) || isinf(other)); + return select(invalid, constants::nan(), z); + } template - inline batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) + inline batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept { using cplx_batch = batch, A>; using real_batch = typename cplx_batch::real_batch; @@ -1849,55 +1920,67 @@ namespace xsimd { return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta))); } + // remainder + template + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + return fnma(nearbyint(self / other), other, self); + } + template ::value, void>::type> + inline batch remainder(batch const& self, batch const& other, requires_arch) noexcept + { + auto mod = self % other; + return select(mod <= other / 2, mod, mod - other); + } - // remainder - template - batch remainder(batch const& self, batch const& other, requires_arch) { - return fnma(nearbyint(self / other), other, self); - } - template - batch remainder(batch const& self, batch const& other, requires_arch) { - return fnma(nearbyint(self / other), other, self); - } - template::value, void>::type> - batch remainder(batch const& self, batch const& other, requires_arch) { - auto mod = self % other; - return select(mod <= other / 2, mod, mod - other); - } - - // select - template - batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) { - return {select(cond, 
true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag())}; - } + // select + template + inline batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept + { + return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; + } - // sign - template::value, void>::type> batch sign(batch const& self, requires_arch) { - using batch_type = batch; - batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); - return res; - } + // sign + template ::value, void>::type> + inline batch sign(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); + return res; + } - namespace detail { - template batch signf(batch const& self) { - using batch_type = batch; - batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); + namespace detail + { + template + inline batch signf(batch const& self) noexcept + { + using batch_type = batch; + batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); #ifdef XSIMD_NO_NANS - return res; + return res; #else - return select(isnan(self), constants::nan(), res); + return select(isnan(self), constants::nan(), res); #endif - } - } + } + } - template batch sign(batch const& self, requires_arch) { - return detail::signf(self); - } - template batch sign(batch const& self, requires_arch) { - return detail::signf(self); - } + template + inline batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } + template + inline batch sign(batch const& self, requires_arch) noexcept + { + return detail::signf(self); + } template - inline batch, A> sign(const batch, A>& z, requires_arch) + inline batch, A> sign(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; @@ -1908,286 +1991,299 @@ namespace xsimd { batch_type(sign(iz))); } - // signnz - template::value, void>::type> batch signnz(batch const& self, requires_arch) { - using batch_type = batch; - return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); - } + // signnz + template ::value, void>::type> + inline batch signnz(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); + } - namespace detail { - template batch signnzf(batch const& self) { - using batch_type = batch; + namespace detail + { + template + inline batch signnzf(batch const& self) noexcept + { + using batch_type = batch; #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); #else return batch_type(1.) 
| (constants::signmask() & self); #endif - } - } - - template batch signnz(batch const& self, requires_arch) { - return detail::signnzf(self); - } - template batch signnz(batch const& self, requires_arch) { - return detail::signnzf(self); - } + } + } - // sqrt - template batch, A> sqrt(batch, A> const& z, requires_arch) { - - constexpr T csqrt_scale_factor = std::is_same::value?6.7108864e7f:1.8014398509481984e16; - constexpr T csqrt_scale = std::is_same::value?1.220703125e-4f:7.450580596923828125e-9; - using batch_type = batch, A>; - using real_batch = batch; - real_batch x = z.real(); - real_batch y = z.imag(); - real_batch sqrt_x = sqrt(fabs(x)); - real_batch sqrt_hy = sqrt(0.5 * fabs(y)); - auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); - x = select(cond, x * 0.25, x * csqrt_scale_factor); - y = select(cond, y * 0.25, y * csqrt_scale_factor); - real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); - real_batch r = abs(batch_type(x, y)); - - auto condxp = x > real_batch(0.); - real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); - real_batch r0 = scale * fabs((0.5 * y) / t0); - t0 *= scale; - real_batch t = select(condxp, t0, r0); - r = select(condxp, r0, t0); - batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); - real_batch ze(0.); - - return select(y == ze, - select(x == ze, - batch_type(ze, ze), - select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), - select(x == ze, - select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), - resg)); + template + inline batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } + template + inline batch signnz(batch const& self, requires_arch) noexcept + { + return detail::signnzf(self); + } - } + // sqrt + template + inline batch, A> sqrt(batch, A> const& z, requires_arch) noexcept + { - // tgamma + constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; + constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; + using batch_type = batch, A>; + using real_batch = batch; + real_batch x = z.real(); + real_batch y = z.imag(); + real_batch sqrt_x = sqrt(fabs(x)); + real_batch sqrt_hy = sqrt(0.5 * fabs(y)); + auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); + x = select(cond, x * 0.25, x * csqrt_scale_factor); + y = select(cond, y * 0.25, y * csqrt_scale_factor); + real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); + real_batch r = abs(batch_type(x, y)); + + auto condxp = x > real_batch(0.); + real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); + real_batch r0 = scale * fabs((0.5 * y) / t0); + t0 *= scale; + real_batch t = select(condxp, t0, r0); + r = select(condxp, r0, t0); + batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); + real_batch ze(0.); + + return select(y == ze, + select(x == ze, + batch_type(ze, ze), + select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), + select(x == ze, + select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), + resg)); + } - namespace detail { - /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
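// A minimal scalar sketch (not the xsimd kernel, names are hypothetical) of the
// complex square root scheme above: with r = |z| and x = real(z) > 0,
// sqrt(z) = (t, y / (2t)) where t = sqrt((r + x) / 2); for x <= 0 the two
// components swap roles and the sign of y is carried to the imaginary part.
// The kernel additionally rescales the operand to avoid overflow/underflow in
// |z| and special-cases z == 0 and pure-real/pure-imaginary inputs; those
// steps are omitted here.
#include <cmath>
#include <complex>

std::complex<double> csqrt_sketch(double x, double y)
{
    double r = std::hypot(x, y);                 // |z|
    if (x > 0.0)
    {
        double t = std::sqrt(0.5 * (r + x));     // real part of the root
        return { t, 0.5 * y / t };
    }
    double t = std::sqrt(0.5 * (r - x));         // |imaginary part| of the root
    double re = 0.5 * std::fabs(y) / t;
    return { re, y < 0.0 ? -t : t };             // root lies in the right half-plane
}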
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - struct stirling_kernel; + // tgamma - template - struct stirling_kernel> + namespace detail { - using batch_type = batch; - static inline batch_type compute(const batch_type& x) - { - return horner(x); - } + /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct stirling_kernel; - static inline batch_type split_limit() + template + struct stirling_kernel> { - return batch_type(bit_cast(uint32_t(0x41d628f6))); - } + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } - static inline batch_type large_limit() - { - return batch_type(bit_cast(uint32_t(0x420c28f3))); - } - }; + static inline batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x41d628f6))); + } - template - struct stirling_kernel> - { - using batch_type = batch; - static inline batch_type compute(const batch_type& x) - { - return horner(x); - } + static inline batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint32_t(0x420c28f3))); + } + }; - static inline batch_type split_limit() + template + struct stirling_kernel> { - return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); - } + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } - static inline batch_type large_limit() - { - return batch_type(bit_cast(uint64_t(0x4065800000000000))); - } - }; + static inline batch_type split_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); + } - /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - inline batch stirling(const batch& a) - { - using batch_type = batch; - const batch_type stirlingsplitlim = stirling_kernel::split_limit(); - const batch_type stirlinglargelim = stirling_kernel::large_limit(); - batch_type x = select(a >= batch_type(0.), a, constants::nan()); - batch_type w = batch_type(1.) / x; - w = fma(w, stirling_kernel::compute(w), batch_type(1.)); - batch_type y = exp(-x); - auto test = (x < stirlingsplitlim); - batch_type z = x - batch_type(0.5); - z = select(test, z, batch_type(0.5) * z); - batch_type v = exp(z * log(abs(x))); - y *= v; - y = select(test, y, y * v); - y *= constants::sqrt_2pi() * w; + static inline batch_type large_limit() noexcept + { + return batch_type(bit_cast(uint64_t(0x4065800000000000))); + } + }; + + /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
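// A minimal scalar sketch (not the xsimd kernel, names are hypothetical) of the
// real-valued pow strategy above: z = exp(y * log(|x|)), with the sign fixed up
// for a negative base and an odd integer exponent, and NaN when a negative base
// is raised to a non-integer, finite exponent.
#include <cmath>
#include <limits>

double pow_sketch(double x, double y)
{
    bool negx = x < 0.0;
    double z = std::exp(y * std::log(std::fabs(x)));
    bool y_is_int = std::isfinite(y) && std::floor(y) == y;    // is_flint(y)
    bool y_is_odd = y_is_int && std::fmod(y, 2.0) != 0.0;      // is_odd(y)
    if (negx && y_is_odd)
        z = -z;                                                // (-x)^odd < 0
    if (negx && !(y_is_int || std::isinf(y)))
        return std::numeric_limits<double>::quiet_NaN();       // result would be complex
    return z;
}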
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch stirling(const batch& a) noexcept + { + using batch_type = batch; + const batch_type stirlingsplitlim = stirling_kernel::split_limit(); + const batch_type stirlinglargelim = stirling_kernel::large_limit(); + batch_type x = select(a >= batch_type(0.), a, constants::nan()); + batch_type w = batch_type(1.) / x; + w = fma(w, stirling_kernel::compute(w), batch_type(1.)); + batch_type y = exp(-x); + auto test = (x < stirlingsplitlim); + batch_type z = x - batch_type(0.5); + z = select(test, z, batch_type(0.5) * z); + batch_type v = exp(z * log(abs(x))); + y *= v; + y = select(test, y, y * v); + y *= constants::sqrt_2pi() * w; #ifndef XSIMD_NO_INFINITIES - y = select(isinf(x), x, y); + y = select(isinf(x), x, y); #endif - return select(x > stirlinglargelim, constants::infinity(), y); - } + return select(x > stirlinglargelim, constants::infinity(), y); + } - /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - struct tgamma_kernel; + /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tgamma_kernel; - template - struct tgamma_kernel> - { - using batch_type = batch; - static inline batch_type compute(const batch_type& x) + template + struct tgamma_kernel> { - return horner(x); - } - }; + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x); + } + }; - template - struct tgamma_kernel> - { - using batch_type = batch; - static inline batch_type compute(const batch_type& x) + template + struct tgamma_kernel> { - return horner(x) / - horner(x); + using batch_type = batch; + static inline batch_type compute(const batch_type& x) noexcept + { + return horner(x) + / horner(x); + } + }; + + /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline B tgamma_large_negative(const B& a) noexcept + { + B st = stirling(a); + B p = floor(a); + B sgngam = select(is_even(p), -B(1.), B(1.)); + B z = a - p; + auto test2 = z < B(0.5); + z = select(test2, z - B(1.), z); + z = a * sin(z, trigo_pi_tag()); + z = abs(z); + return sgngam * constants::pi() / (z * st); } - }; - - /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
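// A minimal scalar sketch (not the xsimd kernel, names are hypothetical) of the
// Stirling approximation driving stirling() above:
//   gamma(x) ~ sqrt(2*pi) * x^(x - 1/2) * exp(-x) * w,
// where w = 1 + 1/(12x) + 1/(288x^2) - 139/(51840x^3) - ... is the asymptotic
// correction. Only the leading terms are used here; the kernel evaluates a
// tuned polynomial and splits the power in two for large x to avoid overflow.
#include <cmath>

double stirling_sketch(double x)
{
    const double pi = 3.14159265358979323846;
    double u = 1.0 / x;
    double w = 1.0 + u * (1.0 / 12.0 + u * (1.0 / 288.0 - u * 139.0 / 51840.0));
    return std::sqrt(2.0 * pi) * std::pow(x, x - 0.5) * std::exp(-x) * w;
}

// e.g. stirling_sketch(5.0) is close to tgamma(5.0) == 24.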
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - B tgamma_large_negative(const B& a) - { - B st = stirling(a); - B p = floor(a); - B sgngam = select(is_even(p), -B(1.), B(1.)); - B z = a - p; - auto test2 = z < B(0.5); - z = select(test2, z - B(1.), z); - z = a * sin(z, trigo_pi_tag()); - z = abs(z); - return sgngam * constants::pi() / (z * st); - } - - template - B tgamma_other(const B& a, const BB& test) - { - B x = select(test, B(2.), a); + + template + inline B tgamma_other(const B& a, const BB& test) noexcept + { + B x = select(test, B(2.), a); #ifndef XSIMD_NO_INFINITIES - auto inf_result = (a == constants::infinity()); - x = select(inf_result, B(2.), x); + auto inf_result = (a == constants::infinity()); + x = select(inf_result, B(2.), x); #endif - B z = B(1.); - auto test1 = (x >= B(3.)); - while (any(test1)) - { - x = select(test1, x - B(1.), x); - z = select(test1, z * x, z); - test1 = (x >= B(3.)); - } - test1 = (x < B(0.)); - while (any(test1)) - { - z = select(test1, z / x, z); - x = select(test1, x + B(1.), x); + B z = B(1.); + auto test1 = (x >= B(3.)); + while (any(test1)) + { + x = select(test1, x - B(1.), x); + z = select(test1, z * x, z); + test1 = (x >= B(3.)); + } test1 = (x < B(0.)); - } - auto test2 = (x < B(2.)); - while (any(test2)) - { - z = select(test2, z / x, z); - x = select(test2, x + B(1.), x); - test2 = (x < B(2.)); - } - x = z * tgamma_kernel::compute(x - B(2.)); + while (any(test1)) + { + z = select(test1, z / x, z); + x = select(test1, x + B(1.), x); + test1 = (x < B(0.)); + } + auto test2 = (x < B(2.)); + while (any(test2)) + { + z = select(test2, z / x, z); + x = select(test2, x + B(1.), x); + test2 = (x < B(2.)); + } + x = z * tgamma_kernel::compute(x - B(2.)); #ifndef XSIMD_NO_INFINITIES - return select(inf_result, a, x); + return select(inf_result, a, x); #else - return x; + return x; #endif + } } - } - template batch tgamma(batch const& self, requires_arch) { - using batch_type = batch; + template + inline batch tgamma(batch const& self, requires_arch) noexcept + { + using batch_type = batch; auto nan_result = (self < batch_type(0.) && is_flint(self)); #ifndef XSIMD_NO_INVALIDS nan_result = isnan(self) || nan_result; @@ -2204,12 +2300,10 @@ namespace xsimd { batch_type r1 = detail::tgamma_other(self, test); batch_type r2 = select(test, r, r1); return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); - } - + } - } + } } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp index 39f4c102c..5f4b1c79b 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. 
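// A minimal scalar sketch (not the xsimd kernel, names are hypothetical) of the
// reflection formula used by tgamma_large_negative above: for non-integer q > 0,
//   gamma(-q) = -pi / (q * sin(pi * q) * gamma(q)).
// The kernel takes gamma(q) from its Stirling approximation and evaluates
// sin(pi * q) on a reduced argument with an explicit sign; std::tgamma and
// std::sin stand in for both here.
#include <cmath>

double tgamma_negative_sketch(double q)   // q = |x| for a large negative, non-integer x
{
    const double pi = 3.14159265358979323846;
    return -pi / (q * std::sin(pi * q) * std::tgamma(q));
}

// e.g. tgamma_negative_sketch(4.5) is close to tgamma(-4.5) (about -0.06).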
* + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_MEMORY_HPP #define XSIMD_GENERIC_MEMORY_HPP @@ -18,171 +18,189 @@ #include "./xsimd_generic_details.hpp" +namespace xsimd +{ -namespace xsimd { - - namespace kernel { - - using namespace types; - - // extract_pair - template batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) { - constexpr std::size_t size = batch::size; - assert(0<= i && i< size && "index in bounds"); - - alignas(A::alignment()) T self_buffer[size]; - self.store_aligned(self_buffer); - - alignas(A::alignment()) T other_buffer[size]; - other.store_aligned(other_buffer); - - alignas(A::alignment()) T concat_buffer[size]; - - for (std::size_t j = 0 ; j < (size - i); ++j) - { - concat_buffer[j] = other_buffer[i + j]; - if(j < i) - { - concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; - } - } - return batch::load_aligned(concat_buffer); - } + namespace kernel + { - // load_aligned - namespace detail { - template - batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) { - using batch_type_in = batch; - using batch_type_out = batch; - return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A{}); - } - template - batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) { - static_assert(!std::is_same::value, "there should be a direct load for this type combination"); - using batch_type_out = batch; - alignas(A::alignment()) T_out buffer[batch_type_out::size]; - std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); - return batch_type_out::load_aligned(buffer); - } - } - template - batch load_aligned(T_in const* mem, convert cvt, requires_arch) { - return detail::load_aligned(mem, cvt, A{}, detail::conversion_type{}); - } + using namespace types; - // load_unaligned - namespace detail { - template - batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) { - using batch_type_in = batch; - using batch_type_out = batch; - return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A{}); - } - - template - batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) { - static_assert(!std::is_same::value, "there should be a direct load for this type combination"); - return load_aligned(mem, cvt, generic{}, with_slow_conversion{}); - } - } - template - batch load_unaligned(T_in const* mem, convert cvt, requires_arch) { - return detail::load_unaligned(mem, cvt, generic{}, detail::conversion_type{}); - } + // extract_pair + template + inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(0 <= i && i < size && "index in bounds"); + + alignas(A::alignment()) T self_buffer[size]; + self.store_aligned(self_buffer); + + alignas(A::alignment()) T other_buffer[size]; + other.store_aligned(other_buffer); + + alignas(A::alignment()) T concat_buffer[size]; + + for (std::size_t j = 0; j < (size - i); ++j) + { + concat_buffer[j] = other_buffer[i + j]; + if (j < i) + { + concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; + } + } + return batch::load_aligned(concat_buffer); + } - // store - template - void store(batch_bool const& self, bool* mem, requires_arch) { - using batch_type = batch; - constexpr auto size = batch_bool::size; - alignas(A::alignment()) T buffer[size]; - 
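// A minimal sketch (not the xsimd kernel, names are hypothetical) of what
// extract_pair(self, other, i) above produces: lanes i..size-1 of `other`
// followed by lanes 0..i-1 of `self`, i.e. a lane-wise funnel shift across the
// concatenation of the two batches. std::array stands in for the SIMD batch.
#include <array>
#include <cstddef>

template <class T, std::size_t N>
std::array<T, N> extract_pair_sketch(std::array<T, N> const& self,
                                     std::array<T, N> const& other,
                                     std::size_t i)
{
    std::array<T, N> out {};
    for (std::size_t j = 0; j < N - i; ++j)
        out[j] = other[i + j];           // tail of `other` moves to the front
    for (std::size_t j = 0; j < i; ++j)
        out[N - i + j] = self[j];        // head of `self` fills the back
    return out;
}

// e.g. with N == 4, extract_pair_sketch({a,b,c,d}, {e,f,g,h}, 1) == {f,g,h,a}.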
kernel::store_aligned(&buffer[0], batch_type(self), A{}); - for(std::size_t i = 0; i < size; ++i) - mem[i] = bool(buffer[i]); - } + // load_aligned + namespace detail + { + template + inline batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); + } + template + inline batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + using batch_type_out = batch; + alignas(A::alignment()) T_out buffer[batch_type_out::size]; + std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); + return batch_type_out::load_aligned(buffer); + } + } + template + inline batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); + } + // load_unaligned + namespace detail + { + template + inline batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept + { + using batch_type_in = batch; + using batch_type_out = batch; + return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); + } + + template + inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept + { + static_assert(!std::is_same::value, "there should be a direct load for this type combination"); + return load_aligned(mem, cvt, generic {}, with_slow_conversion {}); + } + } + template + inline batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept + { + return detail::load_unaligned(mem, cvt, generic {}, detail::conversion_type {}); + } - // store_aligned - template void store_aligned(T_out *mem, batch const& self, requires_arch) { - static_assert(!std::is_same::value, "there should be a direct store for this type combination"); - alignas(A::alignment()) T_in buffer[batch::size]; - store_aligned(&buffer[0], self); - std::copy(std::begin(buffer), std::end(buffer), mem); - } + // store + template + inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using batch_type = batch; + constexpr auto size = batch_bool::size; + alignas(A::alignment()) T buffer[size]; + kernel::store_aligned(&buffer[0], batch_type(self), A {}); + for (std::size_t i = 0; i < size; ++i) + mem[i] = bool(buffer[i]); + } - // store_unaligned - template void store_unaligned(T_out *mem, batch const& self, requires_arch) { - static_assert(!std::is_same::value, "there should be a direct store for this type combination"); - return store_aligned(mem, self, generic{}); - } + // store_aligned + template + inline void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept + { + static_assert(!std::is_same::value, "there should be a direct store for this type combination"); + alignas(A::alignment()) T_in buffer[batch::size]; + store_aligned(&buffer[0], self); + std::copy(std::begin(buffer), std::end(buffer), mem); + } - namespace detail - { - template - batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) + // store_unaligned + template + inline void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept { - static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); + static_assert(!std::is_same::value, "there should be a direct store for this type 
combination"); + return store_aligned(mem, self, generic {}); } - template - batch complex_high(batch, A> const& /*src*/, requires_arch) + namespace detail { - static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); + template + inline batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); + } + + template + inline batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); + } + + template + inline batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept + { + static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); + } } - template - batch complex_low(batch, A> const& /*src*/, requires_arch) + // load_complex_aligned + template + inline batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { - static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_aligned(buffer), + lo = real_batch::load_aligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); } - } - // load_complex_aligned - template - batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) { - using real_batch = batch; - T_in const* buffer = reinterpret_cast(mem); - real_batch hi = real_batch::load_aligned(buffer), - lo = real_batch::load_aligned(buffer + real_batch::size); - return detail::load_complex(hi, lo, A{}); - } + // load_complex_unaligned + template + inline batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + using real_batch = batch; + T_in const* buffer = reinterpret_cast(mem); + real_batch hi = real_batch::load_unaligned(buffer), + lo = real_batch::load_unaligned(buffer + real_batch::size); + return detail::load_complex(hi, lo, A {}); + } - // load_complex_unaligned - template - batch, A> load_complex_unaligned(std::complex const* mem, convert> ,requires_arch) { - using real_batch = batch; - T_in const* buffer = reinterpret_cast(mem); - real_batch hi = real_batch::load_unaligned(buffer), - lo = real_batch::load_unaligned(buffer + real_batch::size); - return detail::load_complex(hi, lo, A{}); - } + // store_complex_aligned + template + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_aligned(buffer); + hi.store_aligned(buffer + real_batch::size); + } - // store_complex_aligned - template - void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) { - using real_batch = batch; - real_batch hi = detail::complex_high(src, A{}); - real_batch lo = detail::complex_low(src, A{}); - T_out* buffer = reinterpret_cast(dst); - lo.store_aligned(buffer); - hi.store_aligned(buffer + real_batch::size); - } + // store_compelx_unaligned + template + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using real_batch = batch; + real_batch hi = detail::complex_high(src, A {}); + real_batch lo = detail::complex_low(src, 
A {}); + T_out* buffer = reinterpret_cast(dst); + lo.store_unaligned(buffer); + hi.store_unaligned(buffer + real_batch::size); + } - // store_compelx_unaligned - template - void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) { - using real_batch = batch; - real_batch hi = detail::complex_high(src, A{}); - real_batch lo = detail::complex_low(src, A{}); - T_out* buffer = reinterpret_cast(dst); - lo.store_unaligned(buffer); - hi.store_unaligned(buffer + real_batch::size); } - } - } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp b/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp index b1c988101..b6a79a451 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_rounding.hpp @@ -1,64 +1,72 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_ROUNDING_HPP #define XSIMD_GENERIC_ROUNDING_HPP #include "./xsimd_generic_details.hpp" +namespace xsimd +{ + + namespace kernel + { + + using namespace types; + + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + batch truncated_self = trunc(self); + return select(truncated_self < self, truncated_self + 1, truncated_self); + } + + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + batch truncated_self = trunc(self); + return select(truncated_self > self, truncated_self - 1, truncated_self); + } + + // round + template + inline batch round(batch const& self, requires_arch) noexcept + { + auto v = abs(self); + auto c = ceil(v); + auto cp = select(c - 0.5 > v, c - 1, c); + return select(v > constants::maxflint>(), self, copysign(cp, self)); + } + + // trunc + template ::value, void>::type> + inline batch trunc(batch const& self, requires_arch) noexcept + { + return self; + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); + } -namespace xsimd { - - namespace kernel { - - - using namespace types; - - // ceil - template batch ceil(batch const& self, requires_arch) { - batch truncated_self = trunc(self); - return select(truncated_self < self, truncated_self + 1, truncated_self); - } - - - // floor - template batch floor(batch const& self, requires_arch) { - batch truncated_self = trunc(self); - return select(truncated_self > self, truncated_self - 1, truncated_self); - } - - // round - template batch round(batch const& self, requires_arch) { - auto v = abs(self); - auto c = 
ceil(v); - auto cp = select(c - 0.5 > v, c - 1, c); - return select(v > constants::maxflint>(), self, copysign(cp, self)); - } - - // trunc - template::value, void>::type> - batch trunc(batch const& self, requires_arch) { - return self; - } - template batch trunc(batch const& self, requires_arch) { - return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } - template batch trunc(batch const& self, requires_arch) { - return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); - } - - - } } #endif - diff --git a/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp index f649c6c25..e274c413f 100644 --- a/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp +++ b/third_party/xsimd/arch/generic/xsimd_generic_trigo.hpp @@ -1,22 +1,24 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_TRIGO_HPP #define XSIMD_GENERIC_TRIGO_HPP #include "./xsimd_generic_details.hpp" -namespace xsimd { +namespace xsimd +{ - namespace kernel { + namespace kernel + { /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ /* * ==================================================== @@ -27,48 +29,52 @@ namespace xsimd { * ==================================================== */ - using namespace types; - - // acos - template batch acos(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto x_larger_05 = x > batch_type(0.5); - x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); - x = asin(x); - x = select(x_larger_05, x + x, x); - x = select(self < batch_type(-0.5), constants::pi() - x, x); - return select(x_larger_05, x, constants::pio2() - x); - } + using namespace types; + + // acos + template + inline batch acos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto x_larger_05 = x > batch_type(0.5); + x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); + x = asin(x); + x = select(x_larger_05, x + x, x); + x = select(self < batch_type(-0.5), constants::pi() - x, x); + return select(x_larger_05, x, constants::pio2() - x); + } template - batch, A> acos(const batch, A>& z, requires_arch) + inline batch, A> acos(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type tmp = asin(z); - return {constants::pio2() - tmp.real(), -tmp.imag()}; - } - - // acosh - /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
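// A minimal scalar sketch (not the xsimd kernels, names are hypothetical) of the
// generic rounding scheme above: ceil and floor are derived from trunc, and
// round (halves away from zero) is derived from ceil of the absolute value.
// The pass-through of values beyond the largest exactly representable float
// integer (maxflint) is omitted here.
#include <cmath>

double ceil_sketch(double x)  { double t = std::trunc(x); return t < x ? t + 1.0 : t; }
double floor_sketch(double x) { double t = std::trunc(x); return t > x ? t - 1.0 : t; }

double round_sketch(double x)
{
    double v = std::fabs(x);
    double c = ceil_sketch(v);
    double r = (c - 0.5 > v) ? c - 1.0 : c;   // nearest integer, ties away from zero
    return std::copysign(r, x);
}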
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch acosh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = self - batch_type(1.); - auto test = x > constants::oneotwoeps(); - batch_type z = select(test, self, x + sqrt(x + x + x * x)); - batch_type l1pz = log1p(z); - return select(test, l1pz + constants::log_2(), l1pz); - } + return { constants::pio2() - tmp.real(), -tmp.imag() }; + } + + // acosh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch acosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = self - batch_type(1.); + auto test = x > constants::oneotwoeps(); + batch_type z = select(test, self, x + sqrt(x + x + x * x)); + batch_type l1pz = log1p(z); + return select(test, l1pz + constants::log_2(), l1pz); + } template - inline batch, A> acosh(const batch, A>& z, requires_arch) + inline batch, A> acosh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = acos(z); @@ -76,68 +82,50 @@ namespace xsimd { return w; } - // asin - template batch asin(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - batch_type sign = bitofsign(self); - auto x_larger_05 = x > batch_type(0.5); - batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); - x = select(x_larger_05, sqrt(z), x); - batch_type z1 = detail::horner(z); - z1 = fma(z1, z * x, x); - z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); - return z ^ sign; - } - template batch asin(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto small_cond = x < constants::sqrteps(); - batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); - batch_type zz1 = batch_type(1.) - x; - batch_type vp = zz1 * detail::horner(zz1) / - detail::horner1(zz1); - zz1 = sqrt(zz1 + zz1); - batch_type z = constants::pio4() - zz1; - zz1 = fms(zz1, vp, constants::pio_2lo()); - z = z - zz1; - zz1 = z + constants::pio4(); - batch_type zz2 = self * self; - z = zz2 * detail::horner(zz2) / - detail::horner1(zz2); - zz2 = fma(x, z, x); - return select(x > batch_type(1.), constants::nan(), - select(small_cond, x, - select(x > ct1, zz1, zz2)) ^ - bitofsign(self)); - } + // asin + template + inline batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type sign = bitofsign(self); + auto x_larger_05 = x > batch_type(0.5); + batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); + x = select(x_larger_05, sqrt(z), x); + batch_type z1 = detail::horner(z); + z1 = fma(z1, z * x, x); + z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); + return z ^ sign; + } + template + inline batch asin(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto small_cond = x < constants::sqrteps(); + batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); + batch_type zz1 = batch_type(1.) 
- x; + batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); + zz1 = sqrt(zz1 + zz1); + batch_type z = constants::pio4() - zz1; + zz1 = fms(zz1, vp, constants::pio_2lo()); + z = z - zz1; + zz1 = z + constants::pio4(); + batch_type zz2 = self * self; + z = zz2 * detail::horner(zz2) / detail::horner1(zz2); + zz2 = fma(x, z, x); + return select(x > batch_type(1.), constants::nan(), + select(small_cond, x, + select(x > ct1, zz1, zz2)) + ^ bitofsign(self)); + } template - batch, A> asin(const batch, A>& z, requires_arch) + inline batch, A> asin(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; @@ -156,7 +144,7 @@ namespace xsimd { resg); } - // asinh + // asinh /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ /* * ==================================================== @@ -166,70 +154,77 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - namespace detail { - template::value, void>::type> - batch - average(const batch& x1, const batch& x2) + namespace detail + { + template ::value, void>::type> + inline batch + average(const batch& x1, const batch& x2) noexcept { return (x1 & x2) + ((x1 ^ x2) >> 1); } - template - batch - averagef(const batch& x1, const batch& x2) + template + inline batch + averagef(const batch& x1, const batch& x2) noexcept { - using batch_type = batch; + using batch_type = batch; return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); } - template - batch average(batch const & x1, batch const & x2) { - return averagef(x1, x2); - } - template - batch average(batch const & x1, batch const & x2) { - return averagef(x1, x2); - } - } - template batch asinh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto lthalf = x < batch_type(0.5); - batch_type x2 = x * x; - batch_type bts = bitofsign(self); - batch_type z(0.); - if (any(lthalf)) - { - z = detail::horner(x2) * - x; - if (all(lthalf)) - return z ^ bts; - } - batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); + template + inline batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + template + inline batch average(batch const& x1, batch const& x2) noexcept + { + return averagef(x1, x2); + } + } + template + inline batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto lthalf = x < batch_type(0.5); + batch_type x2 = x * x; + batch_type bts = bitofsign(self); + batch_type z(0.); + if (any(lthalf)) + { + z = detail::horner(x2) + * x; + if (all(lthalf)) + return z ^ bts; + } + batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); #ifndef XSIMD_NO_NANS - return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); + return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); #else - return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; + return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; #endif - } - template batch asinh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto test = x > constants::oneosqrteps(); - batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) 
+ hypot(batch_type(1.), x))); + } + template + inline batch asinh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test = x > constants::oneosqrteps(); + batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x))); #ifndef XSIMD_NO_INFINITIES - z = select(x == constants::infinity(), x, z); + z = select(x == constants::infinity(), x, z); #endif - batch_type l1pz = log1p(z); - z = select(test, l1pz + constants::log_2(), l1pz); - return bitofsign(self) ^ z; - } + batch_type l1pz = log1p(z); + z = select(test, l1pz + constants::log_2(), l1pz); + return bitofsign(self) ^ z; + } template - inline batch, A> asinh(const batch, A>& z, requires_arch) + inline batch, A> asinh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = asin(batch_type(-z.imag(), z.real())); @@ -237,12 +232,13 @@ namespace xsimd { return w; } - // atan - namespace detail { - template - static inline batch kernel_atan(const batch& x, const batch& recx) + // atan + namespace detail + { + template + static inline batch kernel_atan(const batch& x, const batch& recx) noexcept { - using batch_type = batch; + using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); @@ -251,19 +247,19 @@ namespace xsimd { xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); const batch_type z = xx * xx; batch_type z1 = detail::horner(z); + 0xbeaaaa2aul, + 0x3e4c925ful, + 0xbe0e1b85ul, + 0x3da4f0d1ul>(z); z1 = fma(xx, z1 * z, xx); z1 = select(flag2, z1 + constants::pio_4lo(), z1); z1 = select(!flag1, z1 + constants::pio_2lo(), z1); return yy + z1; } - template - static inline batch kernel_atan(const batch& x, const batch& recx) + template + static inline batch kernel_atan(const batch& x, const batch& recx) noexcept { - using batch_type = batch; + using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= constants::tanpio8()) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); @@ -272,31 +268,33 @@ namespace xsimd { xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); batch_type z = xx * xx; z *= detail::horner(z) / - detail::horner1(z); + 0xc0503669fd28ec8eull, + 0xc05eb8bf2d05ba25ull, + 0xc052c08c36880273ull, + 0xc03028545b6b807aull, + 0xbfec007fa1f72594ull>(z) + / detail::horner1(z); z = fma(xx, z, xx); z = select(flag2, z + constants::pio_4lo(), z); z = z + select(flag1, batch_type(0.), constants::pio_2lo()); return yy + z; } - } - template batch atan(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type absa = abs(self); - const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); - return x ^ bitofsign(self); - } + } + template + inline batch atan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type absa = abs(self); + const batch_type x = detail::kernel_atan(absa, batch_type(1.) 
/ absa); + return x ^ bitofsign(self); + } template - batch, A> atan(const batch, A>& z, requires_arch) + inline batch, A> atan(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; @@ -311,32 +309,34 @@ namespace xsimd { real_batch den = y - one; den = x2 + den * den; batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), - batch_type(real_batch(0.), constants::infinity()), - batch_type(w, 0.25 * log(num / den))); + batch_type(real_batch(0.), constants::infinity()), + batch_type(w, 0.25 * log(num / den))); return res; } - // atanh - /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch atanh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - batch_type t = x + x; - batch_type z = batch_type(1.) - x; - auto test = x < batch_type(0.5); - batch_type tmp = select(test, x, t) / z; - return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); - } + // atanh + /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ template - inline batch, A> atanh(const batch, A>& z, requires_arch) + inline batch atanh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + batch_type t = x + x; + batch_type z = batch_type(1.) - x; + auto test = x < batch_type(0.5); + batch_type tmp = select(test, x, t) / z; + return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); + } + template + inline batch, A> atanh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = atan(batch_type(-z.imag(), z.real())); @@ -344,325 +344,337 @@ namespace xsimd { return w; } - // atan2 - template batch atan2(batch const& self, batch const& other, requires_arch) { - using batch_type = batch; - const batch_type q = abs(self / other); - const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); - return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); - } - - - // cos - namespace detail - { - template - batch quadrant(const batch& x) { - return x & batch(3); + // atan2 + template + inline batch atan2(batch const& self, batch const& other, requires_arch) noexcept + { + using batch_type = batch; + const batch_type q = abs(self / other); + const batch_type z = detail::kernel_atan(q, batch_type(1.) 
/ q); + return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); } - template - batch quadrant(const batch& x) { - return to_float(quadrant(to_int(x))); - } + // cos + namespace detail + { + template + inline batch quadrant(const batch& x) noexcept + { + return x & batch(3); + } - template - batch quadrant(const batch& x) { - using batch_type = batch; + template + inline batch quadrant(const batch& x) noexcept + { + return to_float(quadrant(to_int(x))); + } + + template + inline batch quadrant(const batch& x) noexcept + { + using batch_type = batch; batch_type a = x * batch_type(0.25); return (a - floor(a)) * batch_type(4.); - } - /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - - template - inline batch cos_eval(const batch& z) - { - using batch_type = batch; - batch_type y = detail::horner(z); - return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); - } + } + /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + template + inline batch cos_eval(const batch& z) noexcept + { + using batch_type = batch; + batch_type y = detail::horner(z); + return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); + } - template - inline batch sin_eval(const batch& z, const batch& x) + template + inline batch sin_eval(const batch& z, const batch& x) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = detail::horner(z); + 0xbe2aaaa2, + 0x3c08839d, + 0xb94ca1f9>(z); return fma(y * z, x, x); } - template - static inline batch base_tancot_eval(const batch& z) + template + static inline batch base_tancot_eval(const batch& z) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type zz = z * z; batch_type y = detail::horner(zz); + 0x3eaaaa6f, + 0x3e0896dd, + 0x3d5ac5c9, + 0x3cc821b5, + 0x3b4c779c, + 0x3c19c53b>(zz); return fma(y, zz * z, z); } template - static inline batch tan_eval(const batch& z, const BB& test) + static inline batch tan_eval(const batch& z, const BB& test) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template - static inline batch cot_eval(const batch& z, const BB& test) + static inline batch cot_eval(const batch& z, const BB& test) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } - /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template - static inline batch cos_eval(const batch& z) + /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + static inline batch cos_eval(const batch& z) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = detail::horner(z); + 0x3fe0000000000000ull, + 0xbfa5555555555551ull, + 0x3f56c16c16c15d47ull, + 0xbefa01a019ddbcd9ull, + 0x3e927e4f8e06d9a5ull, + 0xbe21eea7c1e514d4ull, + 0x3da8ff831ad9b219ull>(z); return batch_type(1.) - y * z; } - template - static inline batch sin_eval(const batch& z, const batch& x) + template + static inline batch sin_eval(const batch& z, const batch& x) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = detail::horner(z); + 0xbfc5555555555548ull, + 0x3f8111111110f7d0ull, + 0xbf2a01a019bfdf03ull, + 0x3ec71de3567d4896ull, + 0xbe5ae5e5a9291691ull, + 0x3de5d8fd1fcf0ec1ull>(z); return fma(y * z, x, x); } - template - static inline batch base_tancot_eval(const batch& z) + template + static inline batch base_tancot_eval(const batch& z) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type zz = z * z; batch_type num = detail::horner(zz); + 0xc1711fead3299176ull, + 0x413199eca5fc9dddull, + 0xc0c992d8d24f3f38ull>(zz); batch_type den = detail::horner1(zz); + 0xc189afe03cbe5a31ull, + 0x4177d98fc2ead8efull, + 0xc13427bc582abc96ull, + 0x40cab8a5eeb36572ull>(zz); return fma(z, (zz * (num / den)), z); } template - static inline batch tan_eval(const batch& z, const BB& test) + static inline batch tan_eval(const batch& z, const BB& test) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template - static inline batch cot_eval(const batch& z, const BB& test) + static inline batch cot_eval(const batch& z, const BB& test) noexcept { - using batch_type = batch; + using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } - /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - - struct trigo_radian_tag - { - }; - struct trigo_pi_tag - { - }; + /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + + struct trigo_radian_tag + { + }; + struct trigo_pi_tag + { + }; - template - struct trigo_reducer - { - static inline B reduce(const B& x, B& xr) + template + struct trigo_reducer { - if (all(x <= constants::pio4())) + static inline B reduce(const B& x, B& xr) noexcept { - xr = x; - return B(0.); - } - else if (all(x <= constants::pio2())) - { - auto test = x > constants::pio4(); - xr = x - constants::pio2_1(); - xr -= constants::pio2_2(); - xr -= constants::pio2_3(); - xr = select(test, xr, x); - return select(test, B(1.), B(0.)); - } - else if (all(x <= constants::twentypi())) - { - B xi = nearbyint(x * constants::twoopi()); - xr = fnma(xi, constants::pio2_1(), x); - xr -= xi * constants::pio2_2(); - xr -= xi * constants::pio2_3(); - return quadrant(xi); - } - else if (all(x <= constants::mediumpi())) - { - B fn = nearbyint(x * constants::twoopi()); - B r = x - fn * constants::pio2_1(); - B w = fn * constants::pio2_1t(); - B t = r; - w = fn * constants::pio2_2(); - r = t - w; - w = fn * constants::pio2_2t() - ((t - r) - w); - t = r; - w = fn * constants::pio2_3(); - r = t - w; - w = fn * constants::pio2_3t() - ((t - r) - w); - xr = r - w; - return quadrant(fn); - } - else - { - static constexpr std::size_t size = B::size; - using value_type = typename B::value_type; - alignas(B) std::array tmp; - alignas(B) std::array txr; - alignas(B) std::array args; - x.store_aligned(args.data()); - - for (std::size_t i = 0; i < size; ++i) + if (all(x <= constants::pio4())) { - double arg = args[i]; - if (arg == std::numeric_limits::infinity()) - { - tmp[i] = 0.; - txr[i] = std::numeric_limits::quiet_NaN(); - } - else + xr = x; + return B(0.); + } + else if (all(x <= constants::pio2())) + { + auto test = x > constants::pio4(); + xr = x - constants::pio2_1(); + xr -= constants::pio2_2(); + xr -= constants::pio2_3(); + xr = select(test, xr, x); + return select(test, B(1.), B(0.)); + } + else if (all(x <= constants::twentypi())) + { + B xi = nearbyint(x * constants::twoopi()); + xr = fnma(xi, constants::pio2_1(), x); + xr -= xi * constants::pio2_2(); + xr -= xi * constants::pio2_3(); + return quadrant(xi); + } + else if (all(x <= constants::mediumpi())) + { + B fn = nearbyint(x * constants::twoopi()); + B r = x - fn * constants::pio2_1(); + B w = fn * constants::pio2_1t(); + B t = r; + w = fn * constants::pio2_2(); + r = t - w; + w = fn * constants::pio2_2t() - ((t - r) - w); + t = r; + w = fn * constants::pio2_3(); + r = t - w; + w = fn * constants::pio2_3t() - ((t - r) - w); + xr = r - w; + return quadrant(fn); + } + else + { + static constexpr std::size_t size = B::size; + using value_type = typename B::value_type; + alignas(B) std::array tmp; + alignas(B) std::array txr; + alignas(B) std::array args; + x.store_aligned(args.data()); + + for (std::size_t i = 0; i < size; ++i) { - double y[2]; - std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); - tmp[i] = value_type(n & 3); - txr[i] = value_type(y[0]); + double arg = args[i]; + if (arg == std::numeric_limits::infinity()) + { + tmp[i] = 0.; + txr[i] = std::numeric_limits::quiet_NaN(); + } + else + { + double y[2]; + std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); + tmp[i] = value_type(n & 3); + txr[i] = value_type(y[0]); + } } + xr = B::load_aligned(&txr[0]); + B res = B::load_aligned(&tmp[0]); + return res; } - xr = B::load_aligned(&txr[0]); - B res = B::load_aligned(&tmp[0]); - return res; } - } - }; + }; - 
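As an illustrative aside (not part of this patch): the `reduce` member just above is the core of the trigonometric kernels. For arguments up to `twentypi()` it subtracts `n * pi/2` in three pieces (`pio2_1`, `pio2_2`, `pio2_3`) so that each partial product stays accurate, and it only falls back to the per-lane `__ieee754_rem_pio2` loop for very large or infinite inputs. The scalar sketch below mirrors that medium-range branch; the constants are the usual fdlibm three-way split of pi/2, and the helper name `reduce_pio2_scalar` is purely illustrative, not an xsimd API.

#include <cmath>
#include <cstdint>

// Cody-Waite style reduction of x (assumed non-negative and of moderate size,
// as in the `x <= twentypi()` branch above): subtract n * pi/2 in three pieces
// so each multiply-subtract loses almost no precision. Returns the quadrant
// (n & 3) and stores the reduced argument in xr.
static int reduce_pio2_scalar(double x, double& xr)
{
    static const double two_over_pi = 6.36619772367581343076e-01;
    static const double pio2_1 = 1.57079632673412561417e+00; // pi/2, first 33 bits
    static const double pio2_2 = 6.07710050630396597660e-11; // pi/2, next 33 bits
    static const double pio2_3 = 2.02226624871116645580e-21; // pi/2, remaining bits
    double n = std::nearbyint(x * two_over_pi);               // number of quarter turns
    xr = ((x - n * pio2_1) - n * pio2_2) - n * pio2_3;        // xr = x - n*pi/2, in steps
    return static_cast<int>(static_cast<std::int64_t>(n) & 3);
}

// e.g. reduce_pio2_scalar(10.0, r) gives quadrant 2 with r ~ 10 - 3*pi ~ 0.575,
// which is what the vectorised branch computes lane by lane.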
template - struct trigo_reducer - { - static inline B reduce(const B& x, B& xr) + template + struct trigo_reducer { - B xi = nearbyint(x * B(2.)); - B x2 = x - xi * B(0.5); - xr = x2 * constants::pi(); - return quadrant(xi); - } - }; + static inline B reduce(const B& x, B& xr) noexcept + { + B xi = nearbyint(x * B(2.)); + B x2 = x - xi * B(0.5); + xr = x2 * constants::pi(); + return quadrant(xi); + } + }; - } - template batch cos(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type x = abs(self); - batch_type xr = constants::nan(); - const batch_type n = detail::trigo_reducer::reduce(x, xr); - auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); - auto swap_bit = fma(batch_type(-2.), tmp, n); - auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); - const batch_type z = xr * xr; - const batch_type se = detail::sin_eval(z, xr); - const batch_type ce = detail::cos_eval(z); - const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); - return z1 ^ sign_bit; - } + } + template + inline batch cos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); + return z1 ^ sign_bit; + } - template batch, A> cos(batch, A> const& z, requires_arch) { - return {cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag())}; - } + template + inline batch, A> cos(batch, A> const& z, requires_arch) noexcept + { + return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; + } + + // cosh + + /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ - // cosh - - /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - - template batch cosh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type x = abs(self); - auto test1 = x > (constants::maxlog() - constants::log_2()); - batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); - batch_type tmp = exp(x * fac); - batch_type tmp1 = batch_type(0.5) * tmp; - return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) 
/ tmp)); - } template - inline batch, A> cosh(const batch, A>& z, requires_arch) + inline batch cosh(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + batch_type x = abs(self); + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = batch_type(0.5) * tmp; + return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); + } + template + inline batch, A> cosh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); - return {cosh(x) * cos(y), sinh(x) * sin(y)}; + return { cosh(x) * cos(y), sinh(x) * sin(y) }; } - - // sin - namespace detail { - template batch sin(batch const& self, Tag = Tag()) { - using batch_type = batch; + // sin + namespace detail + { + template + inline batch sin(batch const& self, Tag = Tag()) noexcept + { + using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); @@ -674,38 +686,45 @@ namespace xsimd { const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); return z1 ^ sign_bit; - } - } + } + } - template batch sin(batch const& self, requires_arch) { - return detail::sin(self); - } + template + inline batch sin(batch const& self, requires_arch) noexcept + { + return detail::sin(self); + } - template batch, A> sin(batch, A> const& z, requires_arch) { - return {sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag())}; - } + template + inline batch, A> sin(batch, A> const& z, requires_arch) noexcept + { + return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; + } - // sincos - template std::pair, batch> sincos(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type x = abs(self); - batch_type xr = constants::nan(); - const batch_type n = detail::trigo_reducer::reduce(x, xr); - auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); - auto swap_bit = fma(batch_type(-2.), tmp, n); - const batch_type z = xr * xr; - const batch_type se = detail::sin_eval(z, xr); - const batch_type ce = detail::cos_eval(z); - auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); - const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); - auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); - const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); - return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); - } + // sincos + template + inline std::pair, batch> sincos(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + const batch_type z = xr * xr; + const batch_type se = detail::sin_eval(z, xr); + const batch_type ce = detail::cos_eval(z); + auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); + auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); + const batch_type cos_z1 = select(swap_bit != 
batch_type(0.), se, ce); + return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); + } - template - std::pair, A>, batch, A>> - sincos(batch, A> const& z, requires_arch) { + template + inline std::pair, A>, batch, A>> + sincos(batch, A> const& z, requires_arch) noexcept + { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch rcos = cos(z.real()); @@ -713,11 +732,55 @@ namespace xsimd { real_batch icosh = cosh(z.imag()); real_batch isinh = sinh(z.imag()); return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); - } + } + + // sinh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. + * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + inline batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqr_self = self * self; + return detail::horner(sqr_self) + * self; + } - // sinh - namespace detail { - /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ + template + inline batch sinh_kernel(batch const& self) noexcept + { + using batch_type = batch; + batch_type sqrself = self * self; + return fma(self, (detail::horner(sqrself) + / detail::horner1(sqrself)) + * sqrself, + self); + } + } + /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS @@ -726,86 +789,53 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - template batch sinh_kernel(batch const& self) { - using batch_type = batch; - batch_type sqr_self = self * self; - return detail::horner(sqr_self) * - self; - } - - template batch sinh_kernel(batch const& self) { - using batch_type = batch; - batch_type sqrself = self * self; - return fma(self, (detail::horner(sqrself) / - detail::horner1(sqrself)) * - sqrself, - self); - } - } - /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. 
- * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch sinh(batch const& a, requires_arch) { - using batch_type = batch; - batch_type half(0.5); - batch_type x = abs(a); - auto lt1 = x < batch_type(1.); - batch_type bts = bitofsign(a); - batch_type z(0.); - if (any(lt1)) - { - z = detail::sinh_kernel(x); - if (all(lt1)) - return z ^ bts; - } - auto test1 = x >( constants::maxlog() - constants::log_2()); - batch_type fac = select(test1, half, batch_type(1.)); - batch_type tmp = exp(x * fac); - batch_type tmp1 = half * tmp; - batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); - return select(lt1, z, r) ^ bts; - } template - inline batch, A> sinh(const batch, A>& z, requires_arch) + inline batch sinh(batch const& a, requires_arch) noexcept + { + using batch_type = batch; + batch_type half(0.5); + batch_type x = abs(a); + auto lt1 = x < batch_type(1.); + batch_type bts = bitofsign(a); + batch_type z(0.); + if (any(lt1)) + { + z = detail::sinh_kernel(x); + if (all(lt1)) + return z ^ bts; + } + auto test1 = x > (constants::maxlog() - constants::log_2()); + batch_type fac = select(test1, half, batch_type(1.)); + batch_type tmp = exp(x * fac); + batch_type tmp1 = half * tmp; + batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); + return select(lt1, z, r) ^ bts; + } + template + inline batch, A> sinh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); - return {sinh(x) * cos(y), cosh(x) * sin(y)}; + return { sinh(x) * cos(y), cosh(x) * sin(y) }; } - // tan - template batch tan(batch const& self, requires_arch) { - using batch_type = batch; - const batch_type x = abs(self); - batch_type xr = constants::nan(); - const batch_type n = detail::trigo_reducer::reduce(x, xr); - auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); - auto swap_bit = fma(batch_type(-2.), tmp, n); - auto test = (swap_bit == batch_type(0.)); - const batch_type y = detail::tan_eval(xr, test); - return y ^ bitofsign(self); - } - template batch, A> tan(batch, A> const& z, requires_arch) { + // tan + template + inline batch tan(batch const& self, requires_arch) noexcept + { + using batch_type = batch; + const batch_type x = abs(self); + batch_type xr = constants::nan(); + const batch_type n = detail::trigo_reducer::reduce(x, xr); + auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); + auto swap_bit = fma(batch_type(-2.), tmp, n); + auto test = (swap_bit == batch_type(0.)); + const batch_type y = detail::tan_eval(xr, test); + return y ^ bitofsign(self); + } + template + inline batch, A> tan(batch, A> const& z, requires_arch) noexcept + { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); @@ -814,12 +844,85 @@ namespace xsimd { real_batch wimag = sinh(2 * z.imag()); batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); return select(d == real_batch(0.), winf, wres); + } - } + // tanh + namespace detail + { + /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ + /* + * ==================================================== + * copyright 2016 NumScale SAS + * + * Distributed under the Boost Software License, Version 1.0. 
+ * (See copy at http://boost.org/LICENSE_1_0.txt) + * ==================================================== + */ + template + struct tanh_kernel; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(detail::horner(sqrx) + * sqrx, + x, x); + } - // tanh - namespace detail { - /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ + static inline batch_type cotanh(const batch_type& x) noexcept + { + return batch_type(1.) / tanh(x); + } + }; + + template + struct tanh_kernel> + { + using batch_type = batch; + static inline batch_type tanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + return fma(sqrx * p(sqrx) / q(sqrx), x, x); + } + + static inline batch_type cotanh(const batch_type& x) noexcept + { + batch_type sqrx = x * x; + batch_type qval = q(sqrx); + return qval / (x * fma(p(sqrx), sqrx, qval)); + } + + static inline batch_type p(const batch_type& x) noexcept + { + return detail::horner(x); + } + + static inline batch_type q(const batch_type& x) noexcept + { + return detail::horner1(x); + } + }; + + } + /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS @@ -828,109 +931,37 @@ namespace xsimd { * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ - template - struct tanh_kernel; - - template - struct tanh_kernel> - { - using batch_type = batch; - static inline batch_type tanh(const batch_type& x) - { - batch_type sqrx = x * x; - return fma(detail::horner(sqrx) * - sqrx, - x, x); - } - - static inline batch_type cotanh(const batch_type& x) - { - return batch_type(1.) / tanh(x); - } - }; - - template - struct tanh_kernel> + template + inline batch tanh(batch const& self, requires_arch) noexcept { - using batch_type = batch; - static inline batch_type tanh(const batch_type& x) + using batch_type = batch; + batch_type one(1.); + batch_type x = abs(self); + auto test = x < (batch_type(5.) / batch_type(8.)); + batch_type bts = bitofsign(self); + batch_type z = one; + if (any(test)) { - batch_type sqrx = x * x; - return fma(sqrx * p(sqrx) / q(sqrx), x, x); + z = detail::tanh_kernel::tanh(x); + if (all(test)) + return z ^ bts; } - - static inline batch_type cotanh(const batch_type& x) - { - batch_type sqrx = x * x; - batch_type qval = q(sqrx); - return qval / (x * fma(p(sqrx), sqrx, qval)); - } - - static inline batch_type p(const batch_type& x) - { - return detail::horner(x); - } - - static inline batch_type q(const batch_type& x) - { - return detail::horner1(x); - } - }; - - } - /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ - /* - * ==================================================== - * copyright 2016 NumScale SAS - * - * Distributed under the Boost Software License, Version 1.0. - * (See copy at http://boost.org/LICENSE_1_0.txt) - * ==================================================== - */ - template batch tanh(batch const& self, requires_arch) { - using batch_type = batch; - batch_type one(1.); - batch_type x = abs(self); - auto test = x < (batch_type(5.) 
/ batch_type(8.)); - batch_type bts = bitofsign(self); - batch_type z = one; - if (any(test)) - { - z = detail::tanh_kernel::tanh(x); - if (all(test)) - return z ^ bts; - } - batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); - return select(test, z, r) ^ bts; - } + batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); + return select(test, z, r) ^ bts; + } template - inline batch, A> tanh(const batch, A>& z, requires_arch) + inline batch, A> tanh(const batch, A>& z, requires_arch) noexcept { using real_batch = typename batch, A>::real_batch; auto x = z.real(); auto y = z.imag(); real_batch two(2); auto d = cosh(two * x) + cos(two * y); - return {sinh(two * x) / d, sin(two * y) / d}; + return { sinh(two * x) / d, sin(two * y) / d }; } - } + } } #endif - diff --git a/third_party/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/arch/xsimd_avx.hpp index 3634cbedc..ea2e8ce30 100644 --- a/third_party/xsimd/arch/xsimd_avx.hpp +++ b/third_party/xsimd/arch/xsimd_avx.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_AVX_HPP #define XSIMD_AVX_HPP @@ -18,897 +18,1212 @@ #include "../types/xsimd_avx_register.hpp" -namespace xsimd { - - namespace kernel { - using namespace types; - - namespace detail { - inline void split_avx(__m256i val, __m128i& low, __m128i& high) { - low =_mm256_castsi256_si128(val); - high =_mm256_extractf128_si256(val, 1); - } - inline __m256i merge_sse(__m128i low, __m128i high) { - return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); - } - template - __m256i fwd_to_sse(F f, __m256i self) { - __m128i self_low, self_high; - split_avx(self, self_low, self_high); - __m128i res_low = f(self_low); - __m128i res_high = f(self_high); - return merge_sse(res_low, res_high); - } - template - __m256i fwd_to_sse(F f, __m256i self, __m256i other) { - __m128i self_low, self_high, other_low, other_high; - split_avx(self, self_low, self_high); - split_avx(other, other_low, other_high); - __m128i res_low = f(self_low, other_low); - __m128i res_high = f(self_high, other_high); - return merge_sse(res_low, res_high); - } - template - __m256i fwd_to_sse(F f, __m256i self, int32_t other) { - __m128i self_low, self_high; - split_avx(self, self_low, self_high); - __m128i res_low = f(self_low, other); - __m128i res_high = f(self_high, other); - return merge_sse(res_low, res_high); - } - } - - // abs - template batch abs(batch const& self, requires_arch) { - __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 - return _mm256_andnot_ps(sign_mask, self); - } - template batch abs(batch const& self, requires_arch) { - __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 - return _mm256_andnot_pd(sign_mask, self); - } - - // add - template::value, void>::type> - batch add(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return add(batch(s), batch(o)); }, self, other); - } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm256_add_ps(self, other); - } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm256_add_pd(self, other); - } +namespace xsimd +{ - // all - template bool all(batch_bool const& self, requires_arch) { - return _mm256_testc_ps(self, batch_bool(true)) != 0; - } - template bool all(batch_bool const& self, requires_arch) { - return _mm256_testc_pd(self, batch_bool(true)) != 0; - } - template::value, void>::type> - bool all(batch_bool const& self, requires_arch) { - return _mm256_testc_si256(self, batch_bool(true)) != 0; - } + namespace kernel + { + using namespace types; - // any - template bool any(batch_bool const& self, requires_arch) { - return !_mm256_testz_ps(self, self); - } - template bool any(batch_bool const& self, requires_arch) { - return !_mm256_testz_pd(self, self); - } - template::value, void>::type> - bool any(batch_bool const& self, requires_arch) { - return !_mm256_testz_si256(self, self); - } + namespace detail + { + inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept + { + low = _mm256_castsi256_si128(val); + high = _mm256_extractf128_si256(val, 1); + } + inline __m256i merge_sse(__m128i low, __m128i high) noexcept + { + return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); + } + template + inline __m256i fwd_to_sse(F f, __m256i self) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low); + __m128i res_high = f(self_high); 
+ return merge_sse(res_low, res_high); + } + template + inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept + { + __m128i self_low, self_high, other_low, other_high; + split_avx(self, self_low, self_high); + split_avx(other, other_low, other_high); + __m128i res_low = f(self_low, other_low); + __m128i res_high = f(self_high, other_high); + return merge_sse(res_low, res_high); + } + template + inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept + { + __m128i self_low, self_high; + split_avx(self, self_low, self_high); + __m128i res_low = f(self_low, other); + __m128i res_high = f(self_high, other); + return merge_sse(res_low, res_high); + } + } - // bitwise_and - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm256_and_ps(self, other); - } - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm256_and_pd(self, other); - } + // abs + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_ps(sign_mask, self); + } + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm256_andnot_pd(sign_mask, self); + } - template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_and_ps(self, other); - } - template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_and_pd(self, other); - } + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_ps(self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_add_pd(self, other); + } - template::value, void>::type> - batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, self, other); - } - template::value, void>::type> - batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o)); }, self, other); - } + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_ps(self, batch_bool(true)) != 0; + } + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_pd(self, batch_bool(true)) != 0; + } + template ::value, void>::type> + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm256_testc_si256(self, batch_bool(true)) != 0; + } - // bitwise_andnot - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm256_andnot_ps(self, other); - } - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm256_andnot_pd(self, other); - } + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_ps(self, self); + } + template + inline bool any(batch_bool const& self, requires_arch) noexcept 
+ { + return !_mm256_testz_pd(self, self); + } + template ::value, void>::type> + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return !_mm256_testz_si256(self, self); + } - template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_andnot_ps(self, other); - } - template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_andnot_pd(self, other); - } + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } - template::value, void>::type> - batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_andnot(batch(s), batch(o)); }, self, other); - } - template::value, void>::type> - batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_andnot(batch(s), batch(o)); }, self, other); - } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_ps(self, other); + } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_pd(self, other); + } - // bitwise_lshift - template::value, void>::type> - batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, int32_t o) { return bitwise_lshift(batch(s), o, sse4_2{}); },self, other); - } + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o)); }, + self, other); + } - // bitwise_not - template::value, void>::type> - batch bitwise_not(batch const& self, requires_arch) { - return detail::fwd_to_sse([](__m128i s) { return bitwise_not(batch(s), sse4_2{}); }, self); - } - template::value, void>::type> - batch_bool bitwise_not(batch_bool const& self, requires_arch) { - return detail::fwd_to_sse([](__m128i s) { return bitwise_not(batch_bool(s), sse4_2{}); }, self); - } + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(self, other); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(self, other); + } - // bitwise_or - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm256_or_ps(self, other); - } - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm256_or_pd(self, other); - } - template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_or_ps(self, other); - } - template batch_bool bitwise_or(batch_bool const& 
self, batch_bool const& other, requires_arch) { - return _mm256_or_pd(self, other); - } - template::value, void>::type> - batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_or(batch(s), batch(o)); }, self, other); - } - template::value, void>::type> - batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_or(batch_bool(s), batch_bool(o)); }, self, other); - } + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_ps(self, other); + } + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_pd(self, other); + } - // bitwise_rshift - template::value, void>::type> - batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, int32_t o) { return bitwise_rshift(batch(s), o, sse4_2{}); }, self, other); - } + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_andnot(batch(s), batch(o)); }, + self, other); + } - // bitwise_xor - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm256_xor_ps(self, other); - } - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm256_xor_pd(self, other); - } - template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_xor_ps(self, other); - } - template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_xor_pd(self, other); - } - template::value, void>::type> - batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_xor(batch(s), batch(o), sse4_2{}); }, - self, other); - } - template::value, void>::type> - batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2{}); }, - self, other); - } + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, sse4_2 {}); }, + self, other); + } - // bitwise_cast - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm256_castsi256_ps(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm256_castsi256_pd(self); - } - template::type>::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return batch(self.data); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return 
_mm256_castps_pd(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm256_castps_si256(self); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm256_castpd_ps(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm256_castpd_si256(self); - } + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch(s), sse4_2 {}); }, + self); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s) noexcept + { return bitwise_not(batch_bool(s), sse4_2 {}); }, + self); + } - // bitwise_not - template batch bitwise_not(batch const& self, requires_arch) { - return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); - } - template - batch bitwise_not(batch const &self, requires_arch) { - return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); - } - template batch_bool bitwise_not(batch_bool const& self, requires_arch) { - return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); - } - template - batch_bool bitwise_not(batch_bool const &self, requires_arch) { - return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); - } + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_ps(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch(s), batch(o)); }, + self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_or(batch_bool(s), batch_bool(o)); }, + self, other); + } - // bool_cast - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm256_castps_si256(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm256_castsi256_ps(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm256_castpd_si256(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm256_castsi256_pd(self); - } + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, sse4_2 {}); }, + self, other); + } - // broadcast - template::value, void>::type> - batch broadcast(T val, requires_arch) { - switch(sizeof(T)) { - case 1: return 
_mm256_set1_epi8(val); - case 2: return _mm256_set1_epi16(val); - case 4: return _mm256_set1_epi32(val); - case 8: return _mm256_set1_epi64x(val); - default: assert(false && "unsupported"); return {}; - } - } - template batch broadcast(float val, requires_arch) { - return _mm256_set1_ps(val); - } - template batch broadcast(double val, requires_arch) { - return _mm256_set1_pd(val); - } + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, + self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, + self, other); + } - // ceil - template batch ceil(batch const& self, requires_arch) { - return _mm256_ceil_ps(self); - } - template batch ceil(batch const& self, requires_arch) { - return _mm256_ceil_pd(self); - } + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castsi256_pd(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_pd(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castps_si256(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_castpd_si256(self); + } + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); 
+ } - namespace detail { - // On clang, _mm256_extractf128_ps is built upon build_shufflevector - // which require index parameter to be a constant - template - inline B get_half_complex_f(const B& real, const B& imag) - { - __m128 tmp0 = _mm256_extractf128_ps(real, index); - __m128 tmp1 = _mm256_extractf128_ps(imag, index); - __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); - tmp0 = _mm_unpacklo_ps(tmp0, tmp1); - __m256 res = real; - res = _mm256_insertf128_ps(res, tmp0, 0); - res = _mm256_insertf128_ps(res, tmp2, 1); - return res; - } - template - inline B get_half_complex_d(const B& real, const B& imag) - { - __m128d tmp0 = _mm256_extractf128_pd(real, index); - __m128d tmp1 = _mm256_extractf128_pd(imag, index); - __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); - tmp0 = _mm_unpacklo_pd(tmp0, tmp1); - __m256d res = real; - res = _mm256_insertf128_pd(res, tmp0, 0); - res = _mm256_insertf128_pd(res, tmp2, 1); - return res; - } - - // complex_low - template batch complex_low(batch, A> const& self, requires_arch) { - return get_half_complex_f<0>(self.real(), self.imag()); + // bool_cast + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm256_castps_si256(self); } - template batch complex_low(batch, A> const& self, requires_arch) { - return get_half_complex_d<0>(self.real(), self.imag()); + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm256_castsi256_ps(self); + } + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm256_castpd_si256(self); + } + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm256_castsi256_pd(self); } - // complex_high - template batch complex_high(batch, A> const& self, requires_arch) { - return get_half_complex_f<1>(self.real(), self.imag()); + // broadcast + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_set1_epi8(val); + case 2: + return _mm256_set1_epi16(val); + case 4: + return _mm256_set1_epi32(val); + case 8: + return _mm256_set1_epi64x(val); + default: + assert(false && "unsupported"); + return {}; + } } - template batch complex_high(batch, A> const& self, requires_arch) { - return get_half_complex_d<1>(self.real(), self.imag()); + template + inline batch broadcast(float val, requires_arch) noexcept + { + return _mm256_set1_ps(val); + } + template + inline batch broadcast(double val, requires_arch) noexcept + { + return _mm256_set1_pd(val); } - } - // convert - namespace detail { - template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm256_cvtepi32_ps(self); - } - template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm256_cvttps_epi32(self); - } - } - // div - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm256_div_ps(self, other); - } - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm256_div_pd(self, other); - } + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_ps(self); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm256_ceil_pd(self); + } - // eq - template batch_bool eq(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); - } - template batch_bool eq(batch const& self, batch 
const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); - } - template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_castsi256_ps(detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch_bool(s), batch_bool(o), sse4_2{}); }, - _mm256_castps_si256(self), _mm256_castps_si256(other))); - } - template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_castsi256_pd(detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch_bool(s), batch_bool(o), sse4_2{}); }, _mm256_castpd_si256(self), _mm256_castpd_si256(other))); - } - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return eq(batch(s), batch(o), sse4_2{}); },self, other); - } + namespace detail + { + // On clang, _mm256_extractf128_ps is built upon build_shufflevector + // which require index parameter to be a constant + template + inline B get_half_complex_f(const B& real, const B& imag) noexcept + { + __m128 tmp0 = _mm256_extractf128_ps(real, index); + __m128 tmp1 = _mm256_extractf128_ps(imag, index); + __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); + tmp0 = _mm_unpacklo_ps(tmp0, tmp1); + __m256 res = real; + res = _mm256_insertf128_ps(res, tmp0, 0); + res = _mm256_insertf128_ps(res, tmp2, 1); + return res; + } + template + inline B get_half_complex_d(const B& real, const B& imag) noexcept + { + __m128d tmp0 = _mm256_extractf128_pd(real, index); + __m128d tmp1 = _mm256_extractf128_pd(imag, index); + __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); + tmp0 = _mm_unpacklo_pd(tmp0, tmp1); + __m256d res = real; + res = _mm256_insertf128_pd(res, tmp0, 0); + res = _mm256_insertf128_pd(res, tmp2, 1); + return res; + } + + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<0>(self.real(), self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<0>(self.real(), self.imag()); + } - template::value, void>::type> - batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return eq(batch(self.data), batch(other.data)); - } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_f<1>(self.real(), self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return get_half_complex_d<1>(self.real(), self.imag()); + } + } - // floor - template batch floor(batch const& self, requires_arch) { - return _mm256_floor_ps(self); - } - template batch floor(batch const& self, requires_arch) { - return _mm256_floor_pd(self); - } + // convert + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvtepi32_ps(self); + } + + template + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + __m256i msk_lo = _mm256_set1_epi32(0xFFFF); + __m256 cnst65536f = _mm256_set1_ps(65536.0f); + + __m256i v_lo = bitwise_and(batch(v), batch(msk_lo)); /* extract the 16 lowest significant bits of self */ + __m256i v_hi = bitwise_rshift(batch(v), 16, avx {}); /* 16 most significant bits of v */ + __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No 
rounding */ + __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ + v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ + return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm256_cvttps_epi32(self); + } - // ge - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_GE_OQ); - } - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_GE_OQ); - } - template::value, void>::type> - batch_bool ge(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return ge(batch(s), batch(o)); }, self, other); - } + } - // gt - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_GT_OQ); - } - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_GT_OQ); - } - template::value, void>::type> - batch_bool gt(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return gt(batch(s), batch(o)); }, self, other); - } + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_div_pd(self, other); + } + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_castsi256_ps(detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return eq(batch_bool(s), batch_bool(o), sse4_2 {}); }, + _mm256_castps_si256(self), _mm256_castps_si256(other))); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_castsi256_pd(detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return eq(batch_bool(s), batch_bool(o), sse4_2 {}); }, + _mm256_castpd_si256(self), _mm256_castpd_si256(other))); + } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return eq(batch(s), batch(o), sse4_2 {}); }, + self, other); + } - // hadd - template float hadd(batch const& rhs, requires_arch) { - // Warning about _mm256_hadd_ps: - // _mm256_hadd_ps(a,b) gives - // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). 
Hence we can't - // rely on a naive use of this method - // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) - // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) - __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); - // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) - tmp = _mm256_add_ps(rhs, tmp); - // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) - tmp = _mm256_hadd_ps(tmp, tmp); - // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) - tmp = _mm256_hadd_ps(tmp, tmp); - return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); + template ::value, void>::type> + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return eq(batch(self.data), batch(other.data)); + } - } - template - double hadd(batch const &rhs, requires_arch) { - // rhs = (x0, x1, x2, x3) - // tmp = (x2, x3, x0, x1) - __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); - // tmp = (x2+x0, x3+x1, -, -) - tmp = _mm256_add_pd(rhs, tmp); - // tmp = (x2+x0+x3+x1, -, -, -) - tmp = _mm256_hadd_pd(tmp, tmp); - return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); - } - template::value, void>::type> - T hadd(batch const& self, requires_arch) { - __m128i low, high; - detail::split_avx(self, low, high); - batch blow(low), bhigh(high); - return hadd(blow) + hadd(bhigh); - } + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_ps(self); + } + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm256_floor_pd(self); + } - // haddp - template batch haddp(batch const* row, requires_arch) { - // row = (a,b,c,d,e,f,g,h) - // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) - __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); - // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) - __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); - // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, - // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) - tmp1 = _mm256_hadd_ps(tmp0, tmp1); - // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) - tmp0 = _mm256_hadd_ps(row[4], row[5]); - // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) - __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); - // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, - // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) - tmp2 = _mm256_hadd_ps(tmp0, tmp2); - // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, - // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) - tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); - // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, - // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) - tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); - return _mm256_add_ps(tmp0, tmp1); - } - template - batch haddp(batch const *row, requires_arch) { - // row = (a,b,c,d) - // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) - __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); - // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) - __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); - // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) - __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); - // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) - tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); - return _mm256_add_pd(tmp1, tmp2); - } + // hadd + template + inline float hadd(batch const& rhs, requires_arch) noexcept + { + // Warning about _mm256_hadd_ps: + // _mm256_hadd_ps(a,b) gives + // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). 
Hence we can't + // rely on a naive use of this method + // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) + // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) + __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); + // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) + tmp = _mm256_add_ps(rhs, tmp); + // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) + tmp = _mm256_hadd_ps(tmp, tmp); + return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); + } + template + inline double hadd(batch const& rhs, requires_arch) noexcept + { + // rhs = (x0, x1, x2, x3) + // tmp = (x2, x3, x0, x1) + __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); + // tmp = (x2+x0, x3+x1, -, -) + tmp = _mm256_add_pd(rhs, tmp); + // tmp = (x2+x0+x3+x1, -, -, -) + tmp = _mm256_hadd_pd(tmp, tmp); + return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); + } + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept + { + __m128i low, high; + detail::split_avx(self, low, high); + batch blow(low), bhigh(high); + return hadd(blow) + hadd(bhigh); + } - // isnan - template batch_bool isnan(batch const& self, requires_arch) { - return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); - } - template batch_bool isnan(batch const& self, requires_arch) { - return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); - } + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d,e,f,g,h) + // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) + __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); + // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) + __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); + // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) + tmp1 = _mm256_hadd_ps(tmp0, tmp1); + // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) + tmp0 = _mm256_hadd_ps(row[4], row[5]); + // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) + __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); + // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp2 = _mm256_hadd_ps(tmp0, tmp2); + // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, + // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) + tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); + // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, + // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) + tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); + return _mm256_add_ps(tmp0, tmp1); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // row = (a,b,c,d) + // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) + __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); + // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) + __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); + // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) + __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); + // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) + tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); + return _mm256_add_pd(tmp1, tmp2); + } - // le - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_LE_OQ); - } - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_LE_OQ); - } + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + 
return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); + } - // load_aligned - template::value, void>::type> - batch load_aligned(T const* mem, convert, requires_arch) { - return _mm256_load_si256((__m256i const*)mem); - } - template batch load_aligned(float const* mem, convert, requires_arch) { - return _mm256_load_ps(mem); - } - template batch load_aligned(double const* mem, convert, requires_arch) { - return _mm256_load_pd(mem); - } + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LE_OQ); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LE_OQ); + } - namespace detail - { - // load_complex - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - using batch_type = batch; - __m128 tmp0 = _mm256_extractf128_ps(hi, 0); - __m128 tmp1 = _mm256_extractf128_ps(hi, 1); - batch_type real, imag; - __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - real = _mm256_insertf128_ps(real, tmp_real, 0); - imag = _mm256_insertf128_ps(imag, tmp_imag, 0); - - tmp0 = _mm256_extractf128_ps(lo, 0); - tmp1 = _mm256_extractf128_ps(lo, 1); - tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - real = _mm256_insertf128_ps(real, tmp_real, 1); - imag = _mm256_insertf128_ps(imag, tmp_imag, 1); - return {real, imag}; - } - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - using batch_type = batch; - __m128d tmp0 = _mm256_extractf128_pd(hi, 0); - __m128d tmp1 = _mm256_extractf128_pd(hi, 1); - batch_type real, imag; - __m256d re_tmp0 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 0); - __m256d im_tmp0 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 0); - tmp0 = _mm256_extractf128_pd(lo, 0); - tmp1 = _mm256_extractf128_pd(lo, 1); - __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); - __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); - real = _mm256_blend_pd(re_tmp0, re_tmp1, 12); - imag = _mm256_blend_pd(im_tmp0, im_tmp1, 12); - return {real, imag}; - } - } + // load_aligned + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_load_si256((__m256i const*)mem); + } + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_load_ps(mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_load_pd(mem); + } - // load_unaligned - template::value, void>::type> - batch load_unaligned(T const* mem, convert, requires_arch) { - return _mm256_loadu_si256((__m256i const*)mem); - } - template batch load_unaligned(float const* mem, convert, requires_arch){ - return _mm256_loadu_ps(mem); - } - template batch load_unaligned(double const* mem, convert, requires_arch){ - return _mm256_loadu_pd(mem); - } + namespace detail + { + // load_complex + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128 tmp0 = _mm256_extractf128_ps(hi, 0); + 
__m128 tmp1 = _mm256_extractf128_ps(hi, 1); + batch_type real, imag; + __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 0); + imag = _mm256_insertf128_ps(imag, tmp_imag, 0); + + tmp0 = _mm256_extractf128_ps(lo, 0); + tmp1 = _mm256_extractf128_ps(lo, 1); + tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + real = _mm256_insertf128_ps(real, tmp_real, 1); + imag = _mm256_insertf128_ps(imag, tmp_imag, 1); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + using batch_type = batch; + __m128d tmp0 = _mm256_extractf128_pd(hi, 0); + __m128d tmp1 = _mm256_extractf128_pd(hi, 1); + batch_type real, imag; + __m256d re_tmp0 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 0); + __m256d im_tmp0 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 0); + tmp0 = _mm256_extractf128_pd(lo, 0); + tmp1 = _mm256_extractf128_pd(lo, 1); + __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); + __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); + real = _mm256_blend_pd(re_tmp0, re_tmp1, 12); + imag = _mm256_blend_pd(im_tmp0, im_tmp1, 12); + return { real, imag }; + } + } - // lt - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_LT_OQ); - } - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_LT_OQ); - } + // load_unaligned + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_si256((__m256i const*)mem); + } + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_ps(mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm256_loadu_pd(mem); + } - template::value, void>::type> - batch_bool lt(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return lt(batch(s), batch(o)); }, self, other); - } + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_LT_OQ); + } + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_LT_OQ); + } - // max - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm256_max_ps(self, other); - } - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm256_max_pd(self, other); - } - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - return select(self > other, self, other); - } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return lt(batch(s), batch(o)); }, + self, other); + } - // min - template batch min(batch const& self, batch const& other, requires_arch) { - return _mm256_min_ps(self, other); - } - template batch min(batch const& self, batch const& other, requires_arch) { - return 
_mm256_min_pd(self, other); - } - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - return select(self <= other, self, other); - } + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_ps(self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_pd(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } - // mul - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm256_mul_ps(self, other); - } - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm256_mul_pd(self, other); - } + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_ps(self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_pd(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } - // nearbyint - template batch nearbyint(batch const& self, requires_arch) { - return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); - } - template batch nearbyint(batch const& self, requires_arch) { - return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); - } + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mul_pd(self, other); + } - // neg - template::value, void>::type> - batch neg(batch const& self, requires_arch) { - return 0 - self; - } - template batch neg(batch const& self, requires_arch) { - return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); - } - template - batch neg(batch const &self, requires_arch) { - return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); - } + // nearbyint + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); + } - // neq - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_ps(self, other, _CMP_NEQ_OQ); - } - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return _mm256_cmp_pd(self, other, _CMP_NEQ_OQ); - } - template::value, void>::type> - batch_bool neq(batch const& self, batch const& other, requires_arch) { - return ~(self == other); - } + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + batch neg(batch const& self, requires_arch) + { + return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); + } + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); + } + // neq + template + inline batch_bool neq(batch const& self, batch const& other, 
requires_arch) noexcept + { + return _mm256_cmp_ps(self, other, _CMP_NEQ_OQ); + } + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_cmp_pd(self, other, _CMP_NEQ_OQ); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } - template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_xor_ps(self, other); - } - template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_xor_pd(self, other); - } - template::value, void>::type> - batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return ~(self == other); - } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_ps(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_xor_pd(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self == other); + } - // sadd - template batch sadd(batch const& self, batch const& other, requires_arch) { - return add(self, other); // no saturated arithmetic on floating point numbers - } - template batch sadd(batch const& self, batch const& other, requires_arch) { - return add(self, other); // no saturated arithmetic on floating point numbers - } - template::value, void>::type> - batch sadd(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - auto mask = (other >> (8 * sizeof(T) - 1)); - auto self_pos_branch = min(std::numeric_limits::max() - other, self); - auto self_neg_branch = max(std::numeric_limits::min() - other, self); - return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); - } - else { - const auto diffmax = std::numeric_limits::max() - self; - const auto mindiff = min(diffmax, other); - return self + mindiff; - } - } + // sadd + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } - // select - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm256_blendv_ps(false_br, true_br, cond); - } - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm256_blendv_pd(false_br, true_br, cond); - } - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch 
const& false_br, requires_arch) { - __m128i cond_low, cond_hi; - detail::split_avx(cond, cond_low, cond_hi); + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_ps(false_br, true_br, cond); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_blendv_pd(false_br, true_br, cond); + } + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + __m128i cond_low, cond_hi; + detail::split_avx(cond, cond_low, cond_hi); - __m128i true_low, true_hi; - detail::split_avx(true_br, true_low, true_hi); + __m128i true_low, true_hi; + detail::split_avx(true_br, true_low, true_hi); - __m128i false_low, false_hi; - detail::split_avx(false_br, false_low, false_hi); + __m128i false_low, false_hi; + detail::split_avx(false_br, false_low, false_hi); - __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2{}); - __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2{}); - return detail::merge_sse(res_low, res_hi); - } - template::value, void>::type> - batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { - return select(batch_bool{Values...}, true_br, false_br, avx2{}); - } + __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); + __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); + return detail::merge_sse(res_low, res_hi); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx2 {}); + } + // set + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_ps(values...); + } - // set - template - batch set(batch const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm256_setr_ps(values...); - } + template + inline batch set(batch const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm256_setr_pd(values...); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm256_set_epi64x(v3, v2, v1, v0); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { + return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } - template - batch set(batch const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm256_setr_pd(values...); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) { - return _mm256_set_epi64x(v3, v2, v1, v0); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { - return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, - T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31 ) { - return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); - } + template ::value, void>::type> + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } - template::value, void>::type> - batch_bool set(batch_bool const&, requires_arch, Values... values) { - return set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data; - } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } - template - batch_bool set(batch_bool const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm256_castsi256_ps(set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data); - } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } - template - batch_bool set(batch_bool const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm256_castsi256_pd(set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data); - } + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm256_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm256_sqrt_pd(val); + } - // sqrt - template batch sqrt(batch const& val, requires_arch) { - return _mm256_sqrt_ps(val); - } - template batch sqrt(batch const& val, requires_arch) { - return _mm256_sqrt_pd(val); - } + // ssub + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_ps(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_pd(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } - // ssub - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm256_sub_ps(self, other); // no saturated arithmetic on floating point numbers - } - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm256_sub_pd(self, other); // no saturated arithmetic on floating point numbers - } - template::value, void>::type> - batch ssub(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - return sadd(self, -other); - } - else { - const auto diff = min(self, other); - return self - diff; - } - } + // store_aligned + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_store_si256((__m256i*)mem, self); + } + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_ps(mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_store_pd(mem, self); + } - // store_aligned - template::value, void>::type> - void store_aligned(T *mem, batch const& self, requires_arch) { - return _mm256_store_si256((__m256i *)mem, self); - } - template::value, void>::type> - void store_aligned(T *mem, batch_bool const& self, requires_arch) { - return _mm256_store_si256((__m256i *)mem, self); - } - template void store_aligned(float *mem, batch const& self, requires_arch) { - return _mm256_store_ps(mem, self); - } - template void store_aligned(double *mem, batch const& self, requires_arch) { - return _mm256_store_pd(mem, self); - } + // store_unaligned + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template ::value, 
void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm256_storeu_si256((__m256i*)mem, self); + } + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_ps(mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm256_storeu_pd(mem, self); + } - // store_unaligned - template::value, void>::type> - void store_unaligned(T *mem, batch const& self, requires_arch) { - return _mm256_storeu_si256((__m256i *)mem, self); - } - template::value, void>::type> - void store_unaligned(T *mem, batch_bool const& self, requires_arch) { - return _mm256_storeu_si256((__m256i *)mem, self); - } - template void store_unaligned(float *mem, batch const& self, requires_arch) { - return _mm256_storeu_ps(mem, self); - } - template void store_unaligned(double *mem, batch const& self, requires_arch) { - return _mm256_storeu_pd(mem, self); - } + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_ps(self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_sub_pd(self, other); + } - // sub - template::value, void>::type> - batch sub(batch const& self, batch const& other, requires_arch) { - return detail::fwd_to_sse([](__m128i s, __m128i o) { return sub(batch(s), batch(o)); }, self, other); - } - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm256_sub_ps(self, other); - } - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm256_sub_pd(self, other); - } + // to_float + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + return _mm256_cvtepi32_ps(self); + } + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvtepi64_pd + alignas(A::alignment()) int64_t buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { + (double)buffer[0], + (double)buffer[1], + (double)buffer[2], + (double)buffer[3], + }; + } - // to_float - template - batch to_float(batch const& self, requires_arch) { - return _mm256_cvtepi32_ps(self); - } - template - batch to_float(batch const& self, requires_arch) { - // FIXME: call _mm_cvtepi64_pd - alignas(A::alignment()) int64_t buffer[batch::size]; - self.store_aligned(&buffer[0]); - return {(double)buffer[0], (double)buffer[1], (double)buffer[2], (double)buffer[3],}; - } + // to_int + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + return _mm256_cvttps_epi32(self); + } - // to_int - template - batch to_int(batch const& self, requires_arch) { - return _mm256_cvttps_epi32(self); - } + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvttpd_epi64 + alignas(A::alignment()) double buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { (int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3] }; + } - template - batch to_int(batch const& self, requires_arch) { - // FIXME: call _mm_cvttpd_epi64 - alignas(A::alignment()) double buffer[batch::size]; - 
self.store_aligned(&buffer[0]); - return {(int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3]}; - } + // trunc + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); + } - // trunc - template batch trunc(batch const& self, requires_arch) { - return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); - } - template batch trunc(batch const& self, requires_arch) { - return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); - } + // zip_hi + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_unpackhi_epi8(self, other); + case 2: + return _mm256_unpackhi_epi16(self, other); + case 4: + return _mm256_unpackhi_epi32(self, other); + case 8: + return _mm256_unpackhi_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_unpackhi_ps(self, other); + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_unpackhi_pd(self, other); + } - // zip_hi - template::value, void>::type> - batch zip_hi(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_unpackhi_epi8(self, other); - case 2: return _mm256_unpackhi_epi16(self, other); - case 4: return _mm256_unpackhi_epi32(self, other); - case 8: return _mm256_unpackhi_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm256_unpackhi_ps(self, other); - } - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm256_unpackhi_pd(self, other); - } + // zip_lo + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_unpacklo_epi8(self, other); + case 2: + return _mm256_unpacklo_epi16(self, other); + case 4: + return _mm256_unpacklo_epi32(self, other); + case 8: + return _mm256_unpacklo_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_unpacklo_ps(self, other); + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_unpacklo_pd(self, other); + } - // zip_lo - template::value, void>::type> - batch zip_lo(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_unpacklo_epi8(self, other); - case 2: return _mm256_unpacklo_epi16(self, other); - case 4: return _mm256_unpacklo_epi32(self, other); - case 8: return _mm256_unpacklo_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } } - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm256_unpacklo_ps(self, other); - } - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm256_unpacklo_pd(self, other); - } - - } } diff --git 
a/third_party/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/arch/xsimd_avx2.hpp index 2dacb2eba..301191e0b 100644 --- a/third_party/xsimd/arch/xsimd_avx2.hpp +++ b/third_party/xsimd/arch/xsimd_avx2.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX2_HPP #define XSIMD_AVX2_HPP @@ -17,211 +17,313 @@ #include "../types/xsimd_avx2_register.hpp" - -namespace xsimd { - - namespace kernel { - using namespace types; - - // abs - template::value, void>::type> - batch abs(batch const& self, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_abs_epi8(self); - case 2: return _mm256_abs_epi16(self); - case 4: return _mm256_abs_epi32(self); - default: return abs(self, avx{}); +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // abs + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_abs_epi8(self); + case 2: + return _mm256_abs_epi16(self); + case 4: + return _mm256_abs_epi32(self); + default: + return abs(self, avx {}); + } + } + return self; } - } - return self; - } - // add - template::value, void>::type> - batch add(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_add_epi8(self, other); - case 2: return _mm256_add_epi16(self, other); - case 4: return _mm256_add_epi32(self, other); - case 8: return _mm256_add_epi64(self, other); - default: return add(self, other, avx{}); - } - } + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_add_epi8(self, other); + case 2: + return _mm256_add_epi16(self, other); + case 4: + return _mm256_add_epi32(self, other); + case 8: + return _mm256_add_epi64(self, other); + default: + return add(self, other, avx {}); + } + } - // bitwise_and - template::value, void>::type> - batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm256_and_si256(self, other); - } - template::value, void>::type> - batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_and_si256(self, other); - } + // bitwise_and + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_and_si256(self, other); + } - // bitwise_andnot - template::value, void>::type> - batch 
bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm256_andnot_si256(self, other); - } - template::value, void>::type> - batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_andnot_si256(self, other); - } + // bitwise_andnot + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_andnot_si256(self, other); + } - // bitwise_not - template::value, void>::type> - batch bitwise_not(batch const& self, requires_arch) { - return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); - } - template::value, void>::type> - batch_bool bitwise_not(batch_bool const& self, requires_arch) { - return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); - } + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); + } - // bitwise_lshift - template::value, void>::type> - batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { - switch(sizeof(T)) { - case 2: return _mm256_slli_epi16(self, other); - case 4: return _mm256_slli_epi32(self, other); - case 8: return _mm256_slli_epi64(self, other); - default: return bitwise_lshift(self, other, avx{}); - } - } + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 2: + return _mm256_slli_epi16(self, other); + case 4: + return _mm256_slli_epi32(self, other); + case 8: + return _mm256_slli_epi64(self, other); + default: + return bitwise_lshift(self, other, avx {}); + } + } - template::value, void>::type> - batch bitwise_lshift(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 4: return _mm256_sllv_epi32(self, other); - case 8: return _mm256_sllv_epi64(self, other); - default: return bitwise_lshift(self, other, avx{}); - } - } + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 4: + return _mm256_sllv_epi32(self, other); + case 8: + return _mm256_sllv_epi64(self, other); + default: + return bitwise_lshift(self, other, avx {}); + } + } - // bitwise_or - template::value, void>::type> - batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm256_or_si256(self, other); - } - template::value, void>::type> - batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_or_si256(self, other); - } + // bitwise_or + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm256_or_si256(self, other); + } - // bitwise_rshift - template::value, void>::type> - batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { - 
if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: { - __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); - __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); - __m256i res = _mm256_srai_epi16(self, other); - return _mm256_or_si256( - detail::fwd_to_sse([](__m128i s, __m128i o) { return bitwise_and(batch(s), batch(o), sse4_2{}); }, - sign_mask, cmp_is_negative), - _mm256_andnot_si256(sign_mask, res)); - } - case 2: return _mm256_srai_epi16(self, other); - case 4: return _mm256_srai_epi32(self, other); - default: return bitwise_rshift(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 2: return _mm256_srli_epi16(self, other); - case 4: return _mm256_srli_epi32(self, other); - case 8: return _mm256_srli_epi64(self, other); - default: return bitwise_rshift(self, other, avx{}); - } - } - } + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + { + __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); + __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); + __m256i res = _mm256_srai_epi16(self, other); + return _mm256_or_si256( + detail::fwd_to_sse([](__m128i s, __m128i o) noexcept + { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, + sign_mask, cmp_is_negative), + _mm256_andnot_si256(sign_mask, res)); + } + case 2: + return _mm256_srai_epi16(self, other); + case 4: + return _mm256_srai_epi32(self, other); + default: + return bitwise_rshift(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 2: + return _mm256_srli_epi16(self, other); + case 4: + return _mm256_srli_epi32(self, other); + case 8: + return _mm256_srli_epi64(self, other); + default: + return bitwise_rshift(self, other, avx {}); + } + } + } - template::value, void>::type> - batch bitwise_rshift(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 4: return _mm256_srav_epi32(self, other); - default: return bitwise_rshift(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 4: return _mm256_srlv_epi32(self, other); - case 8: return _mm256_srlv_epi64(self, other); - default: return bitwise_rshift(self, other, avx{}); - } - } - } + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 4: + return _mm256_srav_epi32(self, other); + default: + return bitwise_rshift(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 4: + return _mm256_srlv_epi32(self, other); + case 8: + return _mm256_srlv_epi64(self, other); + default: + return bitwise_rshift(self, other, avx {}); + } + } + } - // bitwise_xor - template::value, void>::type> - batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm256_xor_si256(self, other); - } - template::value, void>::type> - batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm256_xor_si256(self, other); - } + // bitwise_xor + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_xor_si256(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + 
return _mm256_xor_si256(self, other); + } - // complex_low - template batch complex_low(batch, A> const& self, requires_arch) { + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); - } + } - // complex_high - template batch complex_high(batch, A> const& self, requires_arch) { + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); - } + } + // convert + namespace detail + { + + template + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + __m256i msk_lo = _mm256_set1_epi32(0xFFFF); + __m256 cnst65536f = _mm256_set1_ps(65536.0f); + + __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */ + __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */ + __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */ + __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */ + v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ + return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } - // eq - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_cmpeq_epi8(self, other); - case 2: return _mm256_cmpeq_epi16(self, other); - case 4: return _mm256_cmpeq_epi32(self, other); - case 8: return _mm256_cmpeq_epi64(self, other); - default: return eq(self, other, avx{}); - } - } + } - // gt - template::value, void>::type> - batch_bool gt(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_cmpgt_epi8(self, other); - case 2: return _mm256_cmpgt_epi16(self, other); - case 4: return _mm256_cmpgt_epi32(self, other); - case 8: return _mm256_cmpgt_epi64(self, other); - default: return gt(self, other, avx{}); - } - } - else { - return gt(self, other, avx{}); - } - } + // eq + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_cmpeq_epi8(self, other); + case 2: + return _mm256_cmpeq_epi16(self, other); + case 4: + return _mm256_cmpeq_epi32(self, other); + case 8: + return _mm256_cmpeq_epi64(self, other); + default: + return eq(self, other, avx {}); + } + } - // hadd - template::value, void>::type> - T hadd(batch const& self, requires_arch) { - switch(sizeof(T)) { - case 4: - { + // lt + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_cmpgt_epi8(other, self); + case 2: + return _mm256_cmpgt_epi16(other, self); + case 4: + return _mm256_cmpgt_epi32(other, self); + case 8: + return _mm256_cmpgt_epi64(other, self); + default: + return lt(self, other, avx {}); + } + } + else + { + return lt(self, other, avx {}); + } + } 
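// --- Editorial note, not part of the patch above ---------------------------
// The fast_cast overloads for uint32 -> float (both the AVX fallback earlier
// in this patch and the AVX2 version above) rely on splitting each lane into
// 16-bit halves, because x86 (before AVX-512) only offers a signed
// epi32 -> ps conversion. Below is a minimal scalar sketch of that trick,
// assuming only the standard library; the function name is illustrative and
// not part of xsimd.
#include <cstdint>
#include <cstdio>

static float uint32_to_float_via_halves(uint32_t v)
{
    // Each half is < 2^16, so the signed int -> float conversions are exact
    // (a float mantissa holds 24 bits); this mirrors the "No rounding"
    // comments on the cvtepi32_ps calls in the patch.
    float lo = static_cast<float>(static_cast<int32_t>(v & 0xFFFFu));
    float hi = static_cast<float>(static_cast<int32_t>(v >> 16));
    // Only this multiply-add can round; on Haswell and newer it may fuse
    // into an FMA, as the patch comment notes.
    return hi * 65536.0f + lo;
}

int main()
{
    const uint32_t tests[] = { 0u, 1u, 65535u, 65536u, 0x80000000u, 0xFFFFFFFFu };
    for (uint32_t v : tests)
        std::printf("%u -> %.1f (direct cast: %.1f)\n",
                    static_cast<unsigned>(v),
                    static_cast<double>(uint32_to_float_via_halves(v)),
                    static_cast<double>(static_cast<float>(v)));
    return 0;
}
// ---------------------------------------------------------------------------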
+ + // hadd + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 4: + { __m256i tmp1 = _mm256_hadd_epi32(self, self); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); - } - case 8: - { + } + case 8: + { __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); __m256i tmp2 = _mm256_add_epi64(self, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); @@ -235,163 +337,237 @@ namespace xsimd { std::memcpy(&i, &m, sizeof(i)); return i; #endif - } - default: return hadd(self, avx{}); - } - } - // load_complex - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + } + default: + return hadd(self, avx {}); + } + } + // load_complex + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { using batch_type = batch; batch_type real = _mm256_castpd_ps( - _mm256_permute4x64_pd( - _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), - _MM_SHUFFLE(3, 1, 2, 0))); + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0))); batch_type imag = _mm256_castpd_ps( - _mm256_permute4x64_pd( - _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), - _MM_SHUFFLE(3, 1, 2, 0))); - return {real, imag}; - } - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { + _mm256_permute4x64_pd( + _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0))); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { using batch_type = batch; batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); - return {real, imag}; - } + return { real, imag }; + } - // max - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_max_epi8(self, other); - case 2: return _mm256_max_epi16(self, other); - case 4: return _mm256_max_epi32(self, other); - default: return max(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm256_max_epu8(self, other); - case 2: return _mm256_max_epu16(self, other); - case 4: return _mm256_max_epu32(self, other); - default: return max(self, other, avx{}); - } - } - } + // max + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_max_epi8(self, other); + case 2: + return _mm256_max_epi16(self, other); + case 4: + return _mm256_max_epi32(self, other); + default: + return max(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm256_max_epu8(self, other); + case 2: + return _mm256_max_epu16(self, other); + case 4: + return _mm256_max_epu32(self, other); + default: + return max(self, other, avx {}); + } + } + } - // min - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_min_epi8(self, other); - case 2: return 
_mm256_min_epi16(self, other); - case 4: return _mm256_min_epi32(self, other); - default: return min(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm256_min_epu8(self, other); - case 2: return _mm256_min_epu16(self, other); - case 4: return _mm256_min_epu32(self, other); - default: return min(self, other, avx{}); - } - } - } + // min + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_min_epi8(self, other); + case 2: + return _mm256_min_epi16(self, other); + case 4: + return _mm256_min_epi32(self, other); + default: + return min(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm256_min_epu8(self, other); + case 2: + return _mm256_min_epu16(self, other); + case 4: + return _mm256_min_epu32(self, other); + default: + return min(self, other, avx {}); + } + } + } - // mul - template::value, void>::type> - batch mul(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 2: return _mm256_mullo_epi16(self, other); - case 4: return _mm256_mullo_epi32(self, other); - default: return mul(self, other, avx{}); - } - } + // mul + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 2: + return _mm256_mullo_epi16(self, other); + case 4: + return _mm256_mullo_epi32(self, other); + default: + return mul(self, other, avx {}); + } + } - // sadd - template::value, void>::type> - batch sadd(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_adds_epi8(self, other); - case 2: return _mm256_adds_epi16(self, other); - default: return sadd(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm256_adds_epu8(self, other); - case 2: return _mm256_adds_epu16(self, other); - default: return sadd(self, other, avx{}); - } - } - } + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_adds_epi8(self, other); + case 2: + return _mm256_adds_epi16(self, other); + default: + return sadd(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm256_adds_epu8(self, other); + case 2: + return _mm256_adds_epu16(self, other); + default: + return sadd(self, other, avx {}); + } + } + } - // select - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_blendv_epi8(false_br, true_br, cond); - case 2: return _mm256_blendv_epi8(false_br, true_br, cond); - case 4: return _mm256_blendv_epi8(false_br, true_br, cond); - case 8: return _mm256_blendv_epi8(false_br, true_br, cond); - default: return select(cond, true_br, false_br, avx{}); - } - } - template::value, void>::type> - batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { - constexpr int mask = batch_bool_constant, Values...>::mask(); - switch(sizeof(T)) { - // FIXME: for some reason mask here is not considered as an immediate, - // but it's okay for _mm256_blend_epi32 - //case 2: return _mm256_blend_epi16(false_br, true_br, mask); - case 4: return 
_mm256_blend_epi32(false_br, true_br, mask); - case 8: { - constexpr int imask = detail::interleave(mask); - return _mm256_blend_epi32(false_br, true_br, imask); - } - default: return select(batch_bool{Values...}, true_br, false_br, avx2{}); - } - } + // select + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_blendv_epi8(false_br, true_br, cond); + case 2: + return _mm256_blendv_epi8(false_br, true_br, cond); + case 4: + return _mm256_blendv_epi8(false_br, true_br, cond); + case 8: + return _mm256_blendv_epi8(false_br, true_br, cond); + default: + return select(cond, true_br, false_br, avx {}); + } + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr int mask = batch_bool_constant, Values...>::mask(); + switch (sizeof(T)) + { + // FIXME: for some reason mask here is not considered as an immediate, + // but it's okay for _mm256_blend_epi32 + // case 2: return _mm256_blend_epi16(false_br, true_br, mask); + case 4: + return _mm256_blend_epi32(false_br, true_br, mask); + case 8: + { + constexpr int imask = detail::interleave(mask); + return _mm256_blend_epi32(false_br, true_br, imask); + } + default: + return select(batch_bool { Values... }, true_br, false_br, avx2 {}); + } + } - // ssub - template::value, void>::type> - batch ssub(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm256_subs_epi8(self, other); - case 2: return _mm256_subs_epi16(self, other); - default: return ssub(self, other, avx{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm256_subs_epu8(self, other); - case 2: return _mm256_subs_epu16(self, other); - default: return ssub(self, other, avx{}); - } - } - } + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm256_subs_epi8(self, other); + case 2: + return _mm256_subs_epi16(self, other); + default: + return ssub(self, other, avx {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm256_subs_epu8(self, other); + case 2: + return _mm256_subs_epu16(self, other); + default: + return ssub(self, other, avx {}); + } + } + } - // sub - template::value, void>::type> - batch sub(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm256_sub_epi8(self, other); - case 2: return _mm256_sub_epi16(self, other); - case 4: return _mm256_sub_epi32(self, other); - case 8: return _mm256_sub_epi64(self, other); - default: return sub(self, other, avx{}); - } + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm256_sub_epi8(self, other); + case 2: + return _mm256_sub_epi16(self, other); + case 4: + return _mm256_sub_epi32(self, other); + case 8: + return _mm256_sub_epi64(self, other); + default: + return sub(self, other, avx {}); + } + } } - - - } - } #endif diff --git a/third_party/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/arch/xsimd_avx512bw.hpp index e33b42666..478b5de71 100644 --- a/third_party/xsimd/arch/xsimd_avx512bw.hpp +++ b/third_party/xsimd/arch/xsimd_avx512bw.hpp @@ -1,13 +1,13 
@@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX512BW_HPP #define XSIMD_AVX512BW_HPP @@ -16,261 +16,363 @@ #include "../types/xsimd_avx512bw_register.hpp" -namespace xsimd { - - namespace kernel { - using namespace types; - - namespace detail { - template - batch_bool compare_int_avx512bw(batch const& self, batch const& other) { - using register_type = typename batch_bool::register_type; - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); - case 2: return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); - case 4: return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); - case 8: return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); - } - } - else { - switch(sizeof(T)) { - case 1: return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); - case 2: return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); - case 4: return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); - case 8: return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + namespace detail + { + template + inline batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); + case 2: + return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); + case 4: + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + case 8: + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); + case 2: + return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); + case 4: + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + case 8: + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } } - } - } - } - - // abs - template::value, void>::type> - batch abs(batch const& self, requires_arch) { - if(std::is_unsigned::value) - return self; - switch(sizeof(T)) { - case 1: return _mm512_abs_epi8(self); - case 2: return _mm512_abs_epi16(self); - default: return abs(self, avx512dq{}); - } - } + // abs + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + return self; + + switch (sizeof(T)) + { + case 1: + return _mm512_abs_epi8(self); + case 2: + return _mm512_abs_epi16(self); + default: + return abs(self, avx512dq {}); + } + } - // add - template::value, void>::type> - batch add(batch const& self, batch 
const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_add_epi8(self, other); - case 2: return _mm512_add_epi16(self, other); - default: return add(self, other, avx512dq{}); - } - } + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_add_epi8(self, other); + case 2: + return _mm512_add_epi16(self, other); + default: + return add(self, other, avx512dq {}); + } + } - // bitwise_lshift - template::value, void>::type> - batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { - switch(sizeof(T)) { + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + switch (sizeof(T)) + { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 2: return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); + case 2: + return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); #else - case 2: return _mm512_slli_epi16(self, other); + case 2: + return _mm512_slli_epi16(self, other); #endif - default: return bitwise_lshift(self, other, avx512dq{}); - } - } + default: + return bitwise_lshift(self, other, avx512dq {}); + } + } - // bitwise_rshift - template::value, void>::type> - batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: - { - __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); - __m512i zeros = _mm512_setzero_si512(); - __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); - __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + { + __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); + __m512i zeros = _mm512_setzero_si512(); + __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); + __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); + __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else - __m512i res = _mm512_srai_epi16(self, other); + __m512i res = _mm512_srai_epi16(self, other); #endif - return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); - } + return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); + } #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 2: return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); + case 2: + return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else - case 2: return _mm512_srai_epi16(self, other); + case 2: + return _mm512_srai_epi16(self, other); #endif - default: return bitwise_rshift(self, other, avx512dq{}); - } - } - else { - switch(sizeof(T)) { + default: + return bitwise_rshift(self, other, avx512dq {}); + } + } + else + { + switch (sizeof(T)) + { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 2: return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); + case 2: + return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); #else - case 2: return _mm512_srli_epi16(self, other); + case 2: + return _mm512_srli_epi16(self, other); #endif - default: return bitwise_rshift(self, other, 
avx512dq{}); + default: + return bitwise_rshift(self, other, avx512dq {}); + } + } } - } - } - // eq - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } - - // ge - template::value, void>::type> - batch_bool ge(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } - - // gt - template::value, void>::type> - batch_bool gt(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } - - - // le - template::value, void>::type> - batch_bool le(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } + // eq + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } - // lt - template::value, void>::type> - batch_bool lt(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } + // ge + template ::value, void>::type> + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); + } - // max - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm512_max_epi8(self, other); - case 2: return _mm512_max_epi16(self, other); - default: return max(self, other, avx512dq{}); + // gt + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); } - } - else { - switch(sizeof(T)) { - case 1: return _mm512_max_epu8(self, other); - case 2: return _mm512_max_epu16(self, other); - default: return max(self, other, avx512dq{}); + + // le + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); } - } - } - // min - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm512_min_epi8(self, other); - case 2: return _mm512_min_epi16(self, other); - default: return min(self, other, avx512dq{}); + // lt + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); } - } - else { - switch(sizeof(T)) { - case 1: return _mm512_min_epu8(self, other); - case 2: return _mm512_min_epu16(self, other); - default: return min(self, other, avx512dq{}); + + // max + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm512_max_epi8(self, other); + case 2: + return _mm512_max_epi16(self, other); + default: + return max(self, other, avx512dq {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm512_max_epu8(self, other); + case 2: + return _mm512_max_epu16(self, other); + default: + return max(self, other, avx512dq {}); + } + } } - } - } + // min + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { 
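// 8- and 16-bit lanes have dedicated AVX512BW min instructions (signed and
// unsigned variants); wider lanes fall through to the AVX512DQ kernel via the
// default branch below.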
+ switch (sizeof(T)) + { + case 1: + return _mm512_min_epi8(self, other); + case 2: + return _mm512_min_epi16(self, other); + default: + return min(self, other, avx512dq {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm512_min_epu8(self, other); + case 2: + return _mm512_min_epu16(self, other); + default: + return min(self, other, avx512dq {}); + } + } + } - // mul - template::value, void>::type> - batch mul(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: { + // mul + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + { __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); return _mm512_or_si512(upper, lower); + } + case 2: + return _mm512_mullo_epi16(self, other); + default: + return mul(self, other, avx512dq {}); + } } - case 2: return _mm512_mullo_epi16(self, other); - default: return mul(self, other, avx512dq{}); - } - } - - // neq - template::value, void>::type> - batch_bool neq(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512bw(self, other); - } - - // sadd - template::value, void>::type> - batch sadd(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm512_adds_epi8(self, other); - case 2: return _mm512_adds_epi16(self, other); - default: return sadd(self, other, avx512dq{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm512_adds_epu8(self, other); - case 2: return _mm512_adds_epu16(self, other); - default: return sadd(self, other, avx512dq{}); + // neq + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512bw(self, other); } - } - } - - // select - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_mask_blend_epi8(cond, false_br, true_br); - case 2: return _mm512_mask_blend_epi16(cond, false_br, true_br); - default: return select(cond, true_br, false_br, avx512dq{}); - }; - } + // sadd + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm512_adds_epi8(self, other); + case 2: + return _mm512_adds_epi16(self, other); + default: + return sadd(self, other, avx512dq {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm512_adds_epu8(self, other); + case 2: + return _mm512_adds_epu16(self, other); + default: + return sadd(self, other, avx512dq {}); + } + } + } - // ssub - template::value, void>::type> - batch ssub(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm512_subs_epi8(self, other); - case 2: return _mm512_subs_epi16(self, other); - default: return ssub(self, other, avx512dq{}); + // select + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_mask_blend_epi8(cond, false_br, true_br); + case 2: + return 
_mm512_mask_blend_epi16(cond, false_br, true_br); + default: + return select(cond, true_br, false_br, avx512dq {}); + }; } - } - else { - switch(sizeof(T)) { - case 1: return _mm512_subs_epu8(self, other); - case 2: return _mm512_subs_epu16(self, other); - default: return ssub(self, other, avx512dq{}); + + // ssub + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm512_subs_epi8(self, other); + case 2: + return _mm512_subs_epi16(self, other); + default: + return ssub(self, other, avx512dq {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm512_subs_epu8(self, other); + case 2: + return _mm512_subs_epu16(self, other); + default: + return ssub(self, other, avx512dq {}); + } + } } - } - } + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_sub_epi8(self, other); + case 2: + return _mm512_sub_epi16(self, other); + default: + return sub(self, other, avx512dq {}); + } + } - // sub - template::value, void>::type> - batch sub(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_sub_epi8(self, other); - case 2: return _mm512_sub_epi16(self, other); - default: return sub(self, other, avx512dq{}); - } } - } - } #endif diff --git a/third_party/xsimd/arch/xsimd_avx512cd.hpp b/third_party/xsimd/arch/xsimd_avx512cd.hpp index b426adbb2..95f3f1df8 100644 --- a/third_party/xsimd/arch/xsimd_avx512cd.hpp +++ b/third_party/xsimd/arch/xsimd_avx512cd.hpp @@ -1,25 +1,27 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX512CD_HPP #define XSIMD_AVX512CD_HPP #include "../types/xsimd_avx512cd_register.hpp" -namespace xsimd { +namespace xsimd +{ - namespace kernel { - // Nothing there yet. + namespace kernel + { + // Nothing there yet. - } + } } diff --git a/third_party/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/arch/xsimd_avx512dq.hpp index 00fe1f549..ecd4730d2 100644 --- a/third_party/xsimd/arch/xsimd_avx512dq.hpp +++ b/third_party/xsimd/arch/xsimd_avx512dq.hpp @@ -1,74 +1,96 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. 
* -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX512_DQHPP #define XSIMD_AVX512_D_HPP #include "../types/xsimd_avx512dq_register.hpp" -namespace xsimd { +namespace xsimd +{ - namespace kernel { - using namespace types; + namespace kernel + { + using namespace types; - // bitwise_and - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm512_and_ps(self, other); - } - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm512_and_pd(self, other); - } + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_ps(self, other); + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_pd(self, other); + } - // bitwise_andnot - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm512_andnot_ps(self, other); - } - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm512_andnot_pd(self, other); - } + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_ps(self, other); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_pd(self, other); + } - // bitwise_or - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm512_or_ps(self, other); - } - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm512_or_pd(self, other); - } + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_ps(self, other); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_pd(self, other); + } - template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(self.data | other.data); - } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } - // bitwise_xor - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm512_xor_ps(self, other); - } - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm512_xor_pd(self, other); - } + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_ps(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_pd(self, other); + } - // to_float - template - batch to_float(batch const& self, requires_arch) 
{ - return _mm512_cvtepi64_pd(self); - } + // to_float + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + return _mm512_cvtepi64_pd(self); + } - // to_int - template - batch to_int(batch const& self, requires_arch) { - return _mm512_cvttpd_epi64(self); - } + // to_int + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + return _mm512_cvttpd_epi64(self); + } - } + } } diff --git a/third_party/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/arch/xsimd_avx512f.hpp index 13cc0da71..29d5f0eeb 100644 --- a/third_party/xsimd/arch/xsimd_avx512f.hpp +++ b/third_party/xsimd/arch/xsimd_avx512f.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX512F_HPP #define XSIMD_AVX512F_HPP @@ -18,1212 +18,1602 @@ #include "../types/xsimd_avx512f_register.hpp" -namespace xsimd { - - namespace kernel { - using namespace types; - - namespace detail { - inline void split_avx512(__m512 val, __m256& low, __m256& high) { - low =_mm512_castps512_ps256(val); - high =_mm512_extractf32x8_ps(val, 1); - } - inline void split_avx512(__m512d val, __m256d& low, __m256d& high) { - low =_mm512_castpd512_pd256(val); - high =_mm512_extractf64x4_pd(val, 1); - } - inline void split_avx512(__m512i val, __m256i& low, __m256i& high) { - low =_mm512_castsi512_si256(val); - high =_mm512_extracti64x4_epi64(val, 1); - } - inline __m512i merge_avx(__m256i low, __m256i high) { - return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); - } - inline __m512 merge_avx(__m256 low, __m256 high) { - return _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1); - } - inline __m512d merge_avx(__m256d low, __m256d high) { - return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); - } - template - __m512i fwd_to_avx(F f, __m512i self) { - __m256i self_low, self_high; - split_avx512(self, self_low, self_high); - __m256i res_low = f(self_low); - __m256i res_high = f(self_high); - return merge_avx(res_low, res_high); - } - template - __m512i fwd_to_avx(F f, __m512i self, __m512i other) { - __m256i self_low, self_high, other_low, other_high; - split_avx512(self, self_low, self_high); - split_avx512(other, other_low, other_high); - __m256i res_low = f(self_low, other_low); - __m256i res_high = f(self_high, other_high); - return merge_avx(res_low, res_high); - } - template - __m512i fwd_to_avx(F f, __m512i self, int32_t other) { - __m256i self_low, self_high; - split_avx512(self, self_low, self_high); - __m256i res_low = f(self_low, other); - __m256i res_high = f(self_high, other); - return merge_avx(res_low, res_high); - } - } - namespace detail { - - inline uint32_t morton(uint16_t x, uint16_t y) { - - static const unsigned short MortonTable256[256] = - { - 
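// Each entry of MortonTable256 spreads the 8 bits of its index across the
// even bit positions (bit k of the index becomes bit 2*k of the entry), so a
// pair of table lookups per operand is enough for morton() to interleave the
// bits of x and y.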
0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, - 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, - 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, - 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, - 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, - 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, - 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, - 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, - 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, - 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, - 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, - 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, - 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, - 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, - 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, - 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, - 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, - 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, - 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, - 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, - 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, - 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, - 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, - 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, - 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, - 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, - 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, - 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, - 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, - 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, - 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, - 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 - }; - - uint32_t z = MortonTable256[y >> 8] << 17 | - MortonTable256[x >> 8] << 16 | - MortonTable256[y & 0xFF] << 1 | - MortonTable256[x & 0xFF]; - return z; - } +namespace xsimd +{ - template - batch_bool compare_int_avx512f(batch const& self, batch const& other) { - using register_type = typename batch_bool::register_type; - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: { - // shifting to take sign into account - uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, - (batch(other.data) & batch(0x000000FF)) << 24, - Cmp); - uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, - (batch(other.data) & batch(0x0000FF00)) << 16, - Cmp); - uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, - (batch(other.data) & batch(0x00FF0000)) << 8, - Cmp); - uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), - (batch(other.data) & batch(0xFF000000)), - Cmp); - uint64_t mask = 0; - for(unsigned i = 0; i < 16; ++i) { - mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); - mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); - mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); - mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + namespace kernel + { + using namespace types; + + namespace detail + { + inline void split_avx512(__m512 val, __m256& low, __m256& 
high) noexcept + { + low = _mm512_castps512_ps256(val); + high = _mm512_extractf32x8_ps(val, 1); } - return (register_type)mask; - } - case 2: { - // shifting to take sign into account - uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, - (batch(other.data) & batch(0x0000FFFF)) << 16, - Cmp); - uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), - (batch(other.data) & batch(0xFFFF0000)), - Cmp); - return static_cast(morton(mask_low, mask_high)); - } - case 4: return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); - case 8: return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); - } - } - else { - switch(sizeof(T)) { - case 1: { - uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); - uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); - uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); - uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); - uint64_t mask = 0; - for(unsigned i = 0; i < 16; ++i) { - mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); - mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); - mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); - mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept + { + low = _mm512_castpd512_pd256(val); + high = _mm512_extractf64x4_pd(val, 1); + } + inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept + { + low = _mm512_castsi512_si256(val); + high = _mm512_extracti64x4_epi64(val, 1); + } + inline __m512i merge_avx(__m256i low, __m256i high) noexcept + { + return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); + } + inline __m512 merge_avx(__m256 low, __m256 high) noexcept + { + return _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1); + } + inline __m512d merge_avx(__m256d low, __m256d high) noexcept + { + return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); + } + template + __m512i fwd_to_avx(F f, __m512i self) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low); + __m256i res_high = f(self_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, __m512i other) + { + __m256i self_low, self_high, other_low, other_high; + split_avx512(self, self_low, self_high); + split_avx512(other, other_low, other_high); + __m256i res_low = f(self_low, other_low); + __m256i res_high = f(self_high, other_high); + return merge_avx(res_low, res_high); + } + template + __m512i fwd_to_avx(F f, __m512i self, int32_t other) + { + __m256i self_low, self_high; + split_avx512(self, self_low, self_high); + __m256i res_low = f(self_low, other); + __m256i res_high = f(self_high, other); + return merge_avx(res_low, res_high); + } + } + namespace detail + { + + inline uint32_t morton(uint16_t x, uint16_t y) noexcept + { + + static const unsigned short MortonTable256[256] = { + 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, + 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, + 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, + 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, 
+ 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, + 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, + 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, + 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, + 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, + 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, + 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, + 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, + 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, + 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, + 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, + 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, + 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, + 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, + 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, + 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, + 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, + 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, + 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, + 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, + 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, + 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, + 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, + 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, + 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, + 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, + 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, + 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 + }; + + uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; + return z; } - return (register_type)mask; - } - case 2: { - uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); - uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); - return static_cast(morton(mask_low, mask_high)); - } - case 4: return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); - case 8: return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); - } - } - } - } - // abs - template batch abs(batch const& self, requires_arch) { - __m512 self_asf = (__m512)self; - __m512i self_asi = *reinterpret_cast<__m512i *>(&self_asf); - __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); - return *reinterpret_cast<__m512*>(&res_asi); - } - template batch abs(batch const& self, requires_arch) { - __m512d self_asd = (__m512d)self; - __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); - __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), - self_asi); - return *reinterpret_cast<__m512d*>(&res_asi); - } - template::value, void>::type> - batch abs(batch const& self, requires_arch) { - if(std::is_unsigned::value) - return self; - - switch(sizeof(T)) { - case 1: return detail::fwd_to_avx([](__m256i s) { return abs(batch(s)); }, self); - case 2: return detail::fwd_to_avx([](__m256i s) { return abs(batch(s)); }, self); - case 4: return _mm512_abs_epi32(self); - case 8: return _mm512_abs_epi64(self); - default: assert(false && "unsupported 
arch/op combination"); return {}; - } - } + template + inline batch_bool compare_int_avx512f(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + { + // shifting to take sign into account + uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + case 2: + { + // shifting to take sign into account + uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + case 4: + return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); + case 8: + return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + switch (sizeof(T)) + { + case 1: + { + uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 16; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + case 2: + { + uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + case 4: + return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); + case 8: + return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); + } + } + } + } - // add - template::value, void>::type> - batch add(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return detail::fwd_to_avx([](__m256i s, __m256i o) { return add(batch(s), batch(o)); }, self, other); - case 2: return detail::fwd_to_avx([](__m256i s, __m256i o) { return add(batch(s), batch(o)); }, self, other); 
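// A note on the 8/16-bit cases above: AVX512F has no byte/word add, so
// detail::fwd_to_avx splits the 512-bit register into two 256-bit halves,
// runs the AVX2 kernel on each half, and merges the results; conceptually:
//   split_avx512(self, s_lo, s_hi); split_avx512(other, o_lo, o_hi);
//   return merge_avx(add(batch<T, avx2>(s_lo), batch<T, avx2>(o_lo)),
//                    add(batch<T, avx2>(s_hi), batch<T, avx2>(o_hi)));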
- case 4: return _mm512_add_epi32(self, other); - case 8: return _mm512_add_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm512_add_ps(self, other); - } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm512_add_pd(self, other); - } + // abs + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m512 self_asf = (__m512)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); + __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); + return *reinterpret_cast<__m512*>(&res_asi); + } + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m512d self_asd = (__m512d)self; + __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); + __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), + self_asi); + return *reinterpret_cast<__m512d*>(&res_asi); + } + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + if (std::is_unsigned::value) + return self; + + switch (sizeof(T)) + { + case 1: + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + case 2: + return detail::fwd_to_avx([](__m256i s) noexcept + { return abs(batch(s)); }, + self); + case 4: + return _mm512_abs_epi32(self); + case 8: + return _mm512_abs_epi64(self); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - // all - template - bool all(batch_bool const& self, requires_arch) { - using register_type = typename batch_bool::register_type; - return self.data == register_type(-1); - } + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + case 2: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return add(batch(s), batch(o)); }, + self, other); + case 4: + return _mm512_add_epi32(self, other); + case 8: + return _mm512_add_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_ps(self, other); + } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_add_pd(self, other); + } - // any - template - bool any(batch_bool const& self, requires_arch) { - using register_type = typename batch_bool::register_type; - return self.data != register_type(0); - } + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return self.data == register_type(-1); + } - // bitwise_and - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); - } - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); - } + // any + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + 
return self.data != register_type(0); + } - template::value, void>::type> - batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm512_and_si512(self, other); - } + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } - template - batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(self.data & other.data); - } + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_and_si512(self, other); + } - // bitwise_andnot - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); - } - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); - } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } - template::value, void>::type> - batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm512_andnot_si512(self, other); - } + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } - template - batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(self.data & ~other.data); - } + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_andnot_si512(self, other); + } + + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } - // bitwise_lshift - template::value, void>::type> - batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { - switch(sizeof(T)) { - case 1: { + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); + __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); #else - __m512i tmp = _mm512_slli_epi32(self, other); + __m512i tmp = _mm512_slli_epi32(self, other); #endif 
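// AVX512F has no 8-bit shift, so the 32-bit shift above is patched up below:
// shifting a whole 32-bit lane left lets the top bits of each byte leak into
// the low bits of its neighbour, and AND-ing with a broadcast (0xFF << other)
// clears exactly those leaked bits in every byte (e.g. for other == 3 the
// mask byte is 0xF8).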
- return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); - } - case 2: return detail::fwd_to_avx([](__m256i s, int32_t o) { return bitwise_lshift(batch(s), o, avx2{}); }, self, other); + return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); + } + case 2: + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_lshift(batch(s), o, avx2 {}); }, + self, other); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 4: return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); - case 8: return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); + case 4: + return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); + case 8: + return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); #else - case 4: return _mm512_slli_epi32(self, other); - case 8: return _mm512_slli_epi64(self, other); + case 4: + return _mm512_slli_epi32(self, other); + case 8: + return _mm512_slli_epi64(self, other); #endif - default: assert(false && "unsupported arch/op combination"); return {}; - } - } + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - // bitwise_not - template::value, void>::type> - batch bitwise_not(batch const& self, requires_arch) { - return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); - } - template - batch_bool bitwise_not(batch_bool const& self, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(~self.data); - } + // bitwise_not + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } - template batch bitwise_not(batch const& self, requires_arch) { - return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); - } - template - batch bitwise_not(batch const &self, requires_arch) { - return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); - } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); + } - // bitwise_or - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); - } - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); - } + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } - template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(self.data | other.data); - } + template + inline batch_bool bitwise_or(batch_bool const& self, 
batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } - template::value, void>::type> - batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm512_or_si512(self, other); - } + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_or_si512(self, other); + } - // bitwise_rshift - template::value, void>::type> - batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 4: return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); - case 8: return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); + case 4: + return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); + case 8: + return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); #else - case 4: return _mm512_srai_epi32(self, other); - case 8: return _mm512_srai_epi64(self, other); + case 4: + return _mm512_srai_epi32(self, other); + case 8: + return _mm512_srai_epi64(self, other); #endif - default: return detail::fwd_to_avx([](__m256i s, int32_t o) { return bitwise_rshift(batch(s), o, avx2{}); }, self, other); - } - } - else { - switch(sizeof(T)) { - case 1: - { + default: + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } + else + { + switch (sizeof(T)) + { + case 1: + { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); + __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); #else - __m512i tmp = _mm512_srli_epi32(self, other); + __m512i tmp = _mm512_srli_epi32(self, other); #endif - return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); - } + return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); + } #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) - case 4: return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); - case 8: return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); + case 4: + return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); + case 8: + return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); #else - case 4: return _mm512_srli_epi32(self, other); - case 8: return _mm512_srli_epi64(self, other); + case 4: + return _mm512_srli_epi32(self, other); + case 8: + return _mm512_srli_epi64(self, other); #endif - default: return detail::fwd_to_avx([](__m256i s, int32_t o) { return bitwise_rshift(batch(s), o, avx2{}); }, self, other); + default: + return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept + { return bitwise_rshift(batch(s), o, avx2 {}); }, + self, other); + } + } } - } - } - - // bitwise_xor - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); - } - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); - } - template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - using 
register_type = typename batch_bool::register_type; - return register_type(self.data | other.data); - } - - template::value, void>::type> - batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm512_xor_si512(self, other); - } + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); + } - // bitwise_cast - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castsi512_ps(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castsi512_pd(self); - } - template::type>::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return batch(self.data); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castps_pd(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castps_si512(self); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castpd_ps(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm512_castpd_si512(self); - } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } - // bool_cast - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return self.data; - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return self.data; - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return self.data; - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return self.data; - } + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_xor_si512(self, other); + } + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castsi512_pd(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_pd(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castps_si512(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return 
_mm512_castpd_si512(self); + } - // broadcast - template::value, void>::type> - batch broadcast(T val, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_set1_epi8(val); - case 2: return _mm512_set1_epi16(val); - case 4: return _mm512_set1_epi32(val); - case 8: return _mm512_set1_epi64(val); - default: assert(false && "unsupported"); return {}; - } - } - template batch broadcast(float val, requires_arch) { - return _mm512_set1_ps(val); - } - template batch broadcast(double val, requires_arch) { - return _mm512_set1_pd(val); - } + // bool_cast + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } + template + inline batch_bool bool_cast(batch_bool const& self, requires_arch) noexcept + { + return self.data; + } - // ceil - template batch ceil(batch const& self, requires_arch) { - return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); - } - template batch ceil(batch const& self, requires_arch) { - return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); - } + // broadcast + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_set1_epi8(val); + case 2: + return _mm512_set1_epi16(val); + case 4: + return _mm512_set1_epi32(val); + case 8: + return _mm512_set1_epi64(val); + default: + assert(false && "unsupported"); + return {}; + } + } + template + inline batch broadcast(float val, requires_arch) noexcept + { + return _mm512_set1_ps(val); + } + template + batch inline broadcast(double val, requires_arch) noexcept + { + return _mm512_set1_pd(val); + } + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); + } - namespace detail - { - // complex_low - template batch complex_low(batch, A> const& self, requires_arch) { - __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - return _mm512_permutex2var_ps(self.real(), idx, self.imag()); - } - template batch complex_low(batch, A> const& self, requires_arch) { - __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); - return _mm512_permutex2var_pd(self.real(), idx, self.imag()); - } + // convert + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepi32_ps(self); + } - // complex_high - template batch complex_high(batch, A> const& self, requires_arch) { - __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - return _mm512_permutex2var_ps(self.real(), idx, self.imag()); - } - template batch complex_high(batch, A> const& self, requires_arch) { - __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); - return _mm512_permutex2var_pd(self.real(), idx, self.imag()); - } - } + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvttps_epi32(self); + } - // convert - namespace detail { - template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm512_cvtepi32_ps(self); - } - 
template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm512_cvttps_epi32(self); - } - } + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm512_cvtepu32_ps(self); + } - // div - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm512_div_ps(self, other); - } - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm512_div_pd(self, other); - } + template + batch fast_cast(batch const& self, batch const&, requires_arch) + { + return _mm512_cvttps_epu32(self); + } + } + namespace detail + { + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } - // eq - template batch_bool eq(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); - } - template batch_bool eq(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); - } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + return _mm512_permutex2var_ps(self.real(), idx, self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); + return _mm512_permutex2var_pd(self.real(), idx, self.imag()); + } + } - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512f(self, other); - } - template - batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(~self.data ^ other.data); - } + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_div_pd(self, other); + } - // floor - template batch floor(batch const& self, requires_arch) { - return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); - } - template batch floor(batch const& self, requires_arch) { - return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); - } + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } - // from bool - template - batch from_bool(batch_bool const& self, requires_arch) { - return select(self, batch(1), batch(0)); - } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, 
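// --- Illustrative aside (editor's note, not part of the patch) -------------
// complex_low/complex_high above (and load_complex further down) shuffle
// interleaved (re, im) pairs into one register of reals and one of imaginaries.
// A minimal scalar sketch of that deinterleave, with hypothetical array names:
#include <cstddef>

inline void deinterleave_complex(const float* interleaved, std::size_t n_pairs,
                                 float* real, float* imag)
{
    for (std::size_t i = 0; i < n_pairs; ++i)
    {
        real[i] = interleaved[2 * i];      // even slots hold the real parts
        imag[i] = interleaved[2 * i + 1];  // odd slots hold the imaginary parts
    }
}
// The AVX512 kernels do the same per batch with _mm512_permutex2var_ps/pd and
// index vectors that pick the even (real) or odd (imag) lanes of two registers.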
requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } - // ge - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); - } - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); - } - template::value, void>::type> - batch_bool ge(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512f(self, other); - } + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); + } + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); + } - // gt - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); - } - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); - } - template::value, void>::type> - batch_bool gt(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512f(self, other); - } + // from bool + template + inline batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + template ::value, void>::type> + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } - // hadd - template float hadd(batch const& rhs, requires_arch) { - __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); - __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); - __m256 res1 = _mm256_add_ps(tmp1, tmp2); - return hadd(batch(res1), avx2{}); + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } - } - template - double hadd(batch const &rhs, requires_arch) { - __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); - __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); - __m256d res1 = _mm256_add_pd(tmp1, tmp2); - return hadd(batch(res1), avx2{}); - } - template::value, void>::type> - T hadd(batch const& self, requires_arch) { - __m256i low, high; - detail::split_avx512(self, low, high); - batch blow(low), bhigh(high); - return hadd(blow, avx2{}) + hadd(bhigh, avx2{}); - } + // hadd + template + inline float hadd(batch const& rhs, requires_arch) noexcept + { + __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); + __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); + __m256 res1 = _mm256_add_ps(tmp1, tmp2); + return hadd(batch(res1), avx2 {}); + } + template + inline double hadd(batch const& rhs, requires_arch) noexcept + { + __m256d 
tmp1 = _mm512_extractf64x4_pd(rhs, 1); + __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); + __m256d res1 = _mm256_add_pd(tmp1, tmp2); + return hadd(batch(res1), avx2 {}); + } + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept + { + __m256i low, high; + detail::split_avx512(self, low, high); + batch blow(low), bhigh(high); + return hadd(blow, avx2 {}) + hadd(bhigh, avx2 {}); + } - // haddp - template batch haddp(batch const* row, requires_arch) { - // The following folds over the vector once: - // tmp1 = [a0..8, b0..8] - // tmp2 = [a8..f, b8..f] -#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ - batch res ## I; \ - { \ - auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ - auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ - res ## I = _mm512_add_ps(tmp1, tmp2); \ - } \ - - XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); - XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); - XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); - XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); - XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); - XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); - XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); - XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + // The following folds over the vector once: + // tmp1 = [a0..8, b0..8] + // tmp2 = [a8..f, b8..f] +#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_ps(tmp1, tmp2); \ + } + + XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); + XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); + XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); + XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); + XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); + XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); + XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); + XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 - // The following flds the code and shuffles so that hadd_ps produces the correct result - // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) - // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) - // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... -#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ - batch halfx ## I; \ - { \ - auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - \ - auto resx1 = _mm512_add_ps(tmp1, tmp2); \ - \ - auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ - auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ - \ - auto resx2 = _mm512_add_ps(tmp3, tmp4); \ - \ - auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ - auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ - \ - auto resx3 = _mm512_add_ps(tmp5, tmp6); \ - \ - halfx ## I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ - _mm512_extractf32x8_ps(resx3, 1)); \ - } \ - - XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); - XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); + // The following flds the code and shuffles so that hadd_ps produces the correct result + // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) + // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) + // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... 
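// --- Illustrative aside (editor's note, not part of the patch) -------------
// hadd above reduces a 512-bit batch by extracting its two 256-bit halves,
// adding them, and letting the AVX2 kernel finish the reduction. A minimal
// scalar sketch of that halving reduction for 16 floats:
#include <cstddef>

inline float horizontal_add_16(const float (&v)[16])
{
    float tmp[16];
    for (std::size_t i = 0; i < 16; ++i)
        tmp[i] = v[i];
    // Fold the upper half onto the lower half until one element remains.
    for (std::size_t half = 8; half >= 1; half /= 2)
        for (std::size_t i = 0; i < half; ++i)
            tmp[i] += tmp[i + half];
    return tmp[0];
}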
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ + batch halfx##I; \ + { \ + auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx1 = _mm512_add_ps(tmp1, tmp2); \ + \ + auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx2 = _mm512_add_ps(tmp3, tmp4); \ + \ + auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ + auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ + \ + auto resx3 = _mm512_add_ps(tmp5, tmp6); \ + \ + halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ + _mm512_extractf32x8_ps(resx3, 1)); \ + } + + XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); + XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 - auto concat = _mm512_castps256_ps512(halfx0); - concat = _mm512_insertf32x8(concat, halfx1, 1); - return concat; - } - template - batch haddp(batch const *row, requires_arch) { + auto concat = _mm512_castps256_ps512(halfx0); + concat = _mm512_insertf32x8(concat, halfx1, 1); + return concat; + } + + template + inline batch haddp(batch const* row, requires_arch) noexcept + { #define step1(I, a, b) \ - batch res ## I; \ - { \ - auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ - auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ - res ## I = _mm512_add_pd(tmp1, tmp2); \ - } \ - - step1(1, row[0], row[2]); - step1(2, row[4], row[6]); - step1(3, row[1], row[3]); - step1(4, row[5], row[7]); + batch res##I; \ + { \ + auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ + auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ + res##I = _mm512_add_pd(tmp1, tmp2); \ + } + + step1(1, row[0], row[2]); + step1(2, row[4], row[6]); + step1(3, row[1], row[3]); + step1(4, row[5], row[7]); #undef step1 - auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); - auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); + auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); - auto resx1 = _mm512_add_pd(tmp5, tmp6); + auto resx1 = _mm512_add_pd(tmp5, tmp6); - auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); - auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); + auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); + auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); - auto resx2 = _mm512_add_pd(tmp7, tmp8); + auto resx2 = _mm512_add_pd(tmp7, tmp8); - auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); - auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); + auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); + auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); - return _mm512_add_pd(tmpx, tmpy); - } + return _mm512_add_pd(tmpx, tmpy); + } - // isnan - template batch_bool isnan(batch const& self, requires_arch) { - return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); - } - template batch_bool isnan(batch const& self, requires_arch) { - return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); - } + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return 
_mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } - // le - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); - } - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); - } - template::value, void>::type> - batch_bool le(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512f(self, other); - } + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + template ::value, void>::type> + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + // load_aligned + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_load_si512((__m512i const*)mem); + } + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_load_ps(mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_load_pd(mem); + } - // load_aligned - template::value, void>::type> - batch load_aligned(T const* mem, convert, requires_arch) { - return _mm512_load_si512((__m512i const*)mem); - } - template batch load_aligned(float const* mem, convert, requires_arch) { - return _mm512_load_ps(mem); - } - template batch load_aligned(double const* mem, convert, requires_arch) { - return _mm512_load_pd(mem); - } + // load_complex + namespace detail + { + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + auto real = _mm512_permutex2var_ps(hi, real_idx, lo); + auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); + return { real, imag }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); + __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); + auto real = _mm512_permutex2var_pd(hi, real_idx, lo); + auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); + return { real, imag }; + } + } - // load_complex - namespace detail - { - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); - auto real = _mm512_permutex2var_ps(hi, real_idx, lo); - auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); - return {real, imag}; - } - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); - __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); - auto real = _mm512_permutex2var_pd(hi, real_idx, lo); - auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); - return {real, imag}; - } - } + // load_unaligned + template ::value, 
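// --- Illustrative aside (editor's note, not part of the patch) -------------
// isnan above compares a batch with itself using _CMP_UNORD_Q: a lane is
// unordered with itself exactly when it is NaN. The scalar equivalent is the
// classic self-inequality test (a sketch, not tied to any xsimd API):
inline bool is_nan_scalar(double x)
{
    return x != x;  // IEEE 754: any comparison involving NaN is unordered, so NaN != NaN
}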
void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_si512((__m512i const*)mem); + } + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_ps(mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm512_loadu_pd(mem); + } - // load_unaligned - template::value, void>::type> - batch load_unaligned(T const* mem, convert, requires_arch) { - return _mm512_loadu_si512((__m512i const*)mem); - } - template batch load_unaligned(float const* mem, convert, requires_arch){ - return _mm512_loadu_ps(mem); - } - template batch load_unaligned(double const* mem, convert, requires_arch){ - return _mm512_loadu_pd(mem); - } + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); + } - // lt - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); - } - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); - } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512f(self, other); + } + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_ps(self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_max_pd(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 4: + return _mm512_max_epi32(self, other); + case 8: + return _mm512_max_epi64(self, other); + default: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + else + { + switch (sizeof(T)) + { + case 4: + return _mm512_max_epu32(self, other); + case 8: + return _mm512_max_epu64(self, other); + default: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return max(batch(s), batch(o)); }, + self, other); + } + } + } - template::value, void>::type> - batch_bool lt(batch const& self, batch const& other, requires_arch) { - return detail::compare_int_avx512f(self, other); - } + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_ps(self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_min_pd(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 4: + return _mm512_min_epi32(self, other); + case 8: + return _mm512_min_epi64(self, other); + default: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + else + { + switch (sizeof(T)) + { + case 4: + return _mm512_min_epu32(self, other); + case 8: + return _mm512_min_epu64(self, 
other); + default: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return min(batch(s), batch(o)); }, + self, other); + } + } + } - // max - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm512_max_ps(self, other); - } - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm512_max_pd(self, other); - } - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 4: return _mm512_max_epi32(self, other); - case 8: return _mm512_max_epi64(self, other); - default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return max(batch(s), batch(o)); }, self, other); - } - } - else { - switch(sizeof(T)) { - case 4: return _mm512_max_epu32(self, other); - case 8: return _mm512_max_epu64(self, other); - default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return max(batch(s), batch(o)); }, self, other); - } - } - } + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mul_pd(self, other); + } + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 4: + return _mm512_mullo_epi32(self, other); + case 8: + return _mm512_mullo_epi64(self, other); + default: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return mul(batch(s), batch(o)); }, + self, other); + } + } - // min - template batch min(batch const& self, batch const& other, requires_arch) { - return _mm512_min_ps(self, other); - } - template batch min(batch const& self, batch const& other, requires_arch) { - return _mm512_min_pd(self, other); - } - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 4: return _mm512_min_epi32(self, other); - case 8: return _mm512_min_epi64(self, other); - default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return min(batch(s), batch(o)); }, self, other); - } - } - else { - switch(sizeof(T)) { - case 4: return _mm512_min_epu32(self, other); - case 8: return _mm512_min_epu64(self, other); - default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return min(batch(s), batch(o)); }, self, other); - } - } - } + // nearbyint + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); + } - // mul - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm512_mul_ps(self, other); - } - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm512_mul_pd(self, other); - } - template::value, void>::type> - batch mul(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 4: return _mm512_mullo_epi32(self, other); - case 8: return _mm512_mullo_epi64(self, other); - default : return detail::fwd_to_avx([](__m256i s, __m256i o) { return mul(batch(s), batch(o)); }, self, other); + // neg + template + inline batch 
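// --- Illustrative aside (editor's note, not part of the patch) -------------
// Several integer kernels above (max, min, mul) have no 512-bit intrinsic for
// 8/16-bit lanes and fall back through detail::fwd_to_avx: split the register
// into two 256-bit halves, run the AVX2 kernel on each half, then merge. A
// minimal sketch of that split/apply/merge shape on plain arrays, with
// hypothetical names:
#include <cstddef>

template <class T, std::size_t N, class BinaryOp>
inline void apply_in_halves(const T (&a)[N], const T (&b)[N], T (&out)[N], BinaryOp op)
{
    static_assert(N % 2 == 0, "even number of lanes");
    for (std::size_t i = 0; i < N / 2; ++i)  // "low" half, handled by one narrower kernel
        out[i] = op(a[i], b[i]);
    for (std::size_t i = N / 2; i < N; ++i)  // "high" half, handled by the other
        out[i] = op(a[i], b[i]);
}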
neg(batch const& self, requires_arch) noexcept + { + return 0 - self; } - } - // nearbyint - template batch nearbyint(batch const& self, requires_arch) { - return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); - } - template batch nearbyint(batch const& self, requires_arch) { - return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); - } + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } - // neg - template - batch neg(batch const& self, requires_arch) { - return 0 - self; - } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } - // neq - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_OQ); - } - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_OQ); - } - template::value, void>::type> - batch_bool neq(batch const& self, batch const& other, requires_arch) { - return ~(self == other); - } + // sadd + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return add(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } - template - batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - using register_type = typename batch_bool::register_type; - return register_type(self.data ^ other.data); - } + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_ps(cond, false_br, true_br); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm512_mask_blend_pd(cond, false_br, true_br); + } - // sadd - template batch sadd(batch const& self, batch const& other, requires_arch) { - return add(self, other); // no saturated arithmetic on floating point numbers - } - template batch sadd(batch const& self, batch const& other, requires_arch) { - return add(self, other); // no saturated arithmetic on floating point numbers - } - template::value, void>::type> - batch sadd(batch const& self, 
batch const& other, requires_arch) { - if(std::is_signed::value) { - auto mask = other < 0; - auto self_pos_branch = min(std::numeric_limits::max() - other, self); - auto self_neg_branch = max(std::numeric_limits::min() - other, self); - return other + select(mask, self_neg_branch, self_pos_branch); - } - else { - const auto diffmax = std::numeric_limits::max() - self; - const auto mindiff = min(diffmax, other); - return self + mindiff; - } - } + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + { + alignas(avx2::alignment()) uint8_t buffer[64]; + // FIXME: ultra inefficient + for (int i = 0; i < 64; ++i) + buffer[i] = cond.data & ((uint64_t)1 << i) ? 0xFF : 0; + __m256i cond_low = batch::load_aligned(&buffer[0]); + __m256i cond_hi = batch::load_aligned(&buffer[32]); + + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); + + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + case 2: + { + __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); + __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); - // select - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm512_mask_blend_ps(cond, false_br, true_br); - } - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm512_mask_blend_pd(cond, false_br, true_br); - } + __m256i true_low, true_hi; + detail::split_avx512(true_br, true_low, true_hi); - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - switch(sizeof(T)) { - case 1: { - alignas(avx2::alignment()) uint8_t buffer[64]; - // FIXME: ultra inefficient - for(int i =0; i < 64; ++i) - buffer[i] = cond.data & ((uint64_t)1 << i) ? 
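// --- Illustrative aside (editor's note, not part of the patch) -------------
// sadd above saturates integer addition without per-lane branches: for signed
// types it clamps `self` against (max - other) or (min - other) depending on
// the sign of `other`; for unsigned types it adds at most (max - self). A
// scalar sketch of the same logic for 32-bit signed/unsigned values:
#include <cstdint>
#include <limits>
#include <algorithm>

inline int32_t sadd_signed(int32_t self, int32_t other)
{
    if (other < 0)
        self = std::max(std::numeric_limits<int32_t>::min() - other, self);
    else
        self = std::min(std::numeric_limits<int32_t>::max() - other, self);
    return self + other;  // now guaranteed not to overflow
}

inline uint32_t sadd_unsigned(uint32_t self, uint32_t other)
{
    uint32_t room = std::numeric_limits<uint32_t>::max() - self;  // how much we can still add
    return self + std::min(room, other);
}
// Usage: sadd_signed(INT32_MAX, 1) == INT32_MAX; sadd_unsigned(0xFFFFFFFFu, 5) == 0xFFFFFFFFu.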
0xFF : 0; - __m256i cond_low = batch::load_aligned(&buffer[0]); - __m256i cond_hi = batch::load_aligned(&buffer[32]); - - __m256i true_low, true_hi; - detail::split_avx512(true_br, true_low, true_hi); - - __m256i false_low, false_hi; - detail::split_avx512(false_br, false_low, false_hi); - - __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2{}); - __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2{}); - return detail::merge_avx(res_low, res_hi); - } - case 2: { - __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); - __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); - - __m256i true_low, true_hi; - detail::split_avx512(true_br, true_low, true_hi); - - __m256i false_low, false_hi; - detail::split_avx512(false_br, false_low, false_hi); - - __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2{}); - __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2{}); - return detail::merge_avx(res_low, res_hi); - } - case 4: return _mm512_mask_blend_epi32(cond, false_br, true_br); - case 8: return _mm512_mask_blend_epi64(cond, false_br, true_br); - default: assert(false && "unsupported arch/type combination"); return {}; - }; - } - template::value, void>::type> - batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { - return select(batch_bool{Values...}, true_br, false_br, avx512f{}); - } + __m256i false_low, false_hi; + detail::split_avx512(false_br, false_low, false_hi); + __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); + __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); + return detail::merge_avx(res_low, res_hi); + } + case 4: + return _mm512_mask_blend_epi32(cond, false_br, true_br); + case 8: + return _mm512_mask_blend_epi64(cond, false_br, true_br); + default: + assert(false && "unsupported arch/type combination"); + return {}; + }; + } - namespace detail - { - template - using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, - int>::type; + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
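// --- Illustrative aside (editor's note, not part of the patch) -------------
// The 8-bit select fallback above expands the 64-bit k-mask into a byte buffer
// (one 0xFF/0x00 byte per lane) before blending via the AVX2 kernel. The
// underlying semantics are a per-lane choice driven by the mask bits, sketched
// here on plain arrays with hypothetical names:
#include <cstdint>
#include <cstddef>

inline void select_by_mask(uint64_t mask, const uint8_t* if_true, const uint8_t* if_false,
                           uint8_t* out, std::size_t n /* n <= 64 */)
{
    for (std::size_t i = 0; i < n; ++i)
        out[i] = (mask & (uint64_t(1) << i)) ? if_true[i] : if_false[i];
}
// _mm512_mask_blend_epi32/epi64 perform exactly this per lane when a native blend exists.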
}, true_br, false_br, avx512f {}); + } - template - using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, - int>::type; - } + namespace detail + { + template + using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, + int>::type; - // set - template - batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) { - return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); - } + template + using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, + int>::type; + } - template - batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) { - return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { - return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); - } - template = 0> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, - T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, - T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) { + // set + template + inline batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept + { + return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + inline batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept + { + return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { #if defined(__clang__) || __GNUC__ - return __extension__ (__m512i)(__v32hi) - { - v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 - }; + return __extension__(__m512i)(__v32hi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; #else - return _mm512_set_epi16(v0, v1, 
v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif - } - template = 0> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, - T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, - T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) { + } + + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept + { #if defined(__clang__) || __GNUC__ - return __extension__ (__m512i)(__v32hu) - { - v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 - }; + return __extension__(__m512i)(__v32hu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 + }; #else - return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif - } - template = 0> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, - T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, - T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, - T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, - T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, - T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, - T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63 - ) { + } + + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { #if defined(__clang__) || __GNUC__ - return __extension__ (__m512i)(__v64qi) - { - v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 - }; + return __extension__(__m512i)(__v64qi) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; #else 
- return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif - } - template = 0> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, - T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, - T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, - T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, - T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, - T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, - T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63 - ) { + } + template = 0> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, + T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, + T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, + T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, + T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept + { #if defined(__clang__) || __GNUC__ - return __extension__ (__m512i)(__v64qu) - { - v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 - }; + return __extension__(__m512i)(__v64qu) { + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 + }; #else - return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); + return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif - } - template - batch_bool set(batch_bool const&, requires_arch, Values... 
values) { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - using register_type = typename batch_bool::register_type; - register_type r = 0; - unsigned shift = 0; - (void)std::initializer_list{(r|=register_type(values?1:0) << (shift++))...}; - return r; - } + } - // sqrt - template batch sqrt(batch const& val, requires_arch) { - return _mm512_sqrt_ps(val); - } - template batch sqrt(batch const& val, requires_arch) { - return _mm512_sqrt_pd(val); - } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; + return r; + } - // ssub - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm512_sub_ps(self, other); // no saturated arithmetic on floating point numbers - } - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm512_sub_pd(self, other); // no saturated arithmetic on floating point numbers - } - template::value, void>::type> - batch ssub(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - return sadd(self, -other); - } - else { - const auto diff = min(self, other); - return self - diff; - } - } + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm512_sqrt_pd(val); + } - // store - template - void store(batch_bool const& self, bool* mem, requires_arch) { - using register_type = typename batch_bool::register_type; - constexpr auto size = batch_bool::size; - for(std::size_t i = 0; i < size; ++i) - mem[i] = self.data & (register_type(1) << i); - } + // ssub + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_ps(self, other); // no saturated arithmetic on floating point numbers + } + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_pd(self, other); // no saturated arithmetic on floating point numbers + } + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } - // store_aligned - template::value, void>::type> - void store_aligned(T *mem, batch const& self, requires_arch) { - return _mm512_store_si512((__m512i *)mem, self); - } - template::value, void>::type> - void store_aligned(T *mem, batch_bool const& self, requires_arch) { - return _mm512_store_si512((__m512i *)mem, self); - } - template void store_aligned(float *mem, batch const& self, requires_arch) { - return _mm512_store_ps(mem, self); - } - template void store_aligned(double *mem, batch const& self, requires_arch) { - return _mm512_store_pd(mem, self); - } + // store + template + inline void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = self.data & (register_type(1) << i); + } - // store_unaligned - template::value, void>::type> - 
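// --- Illustrative aside (editor's note, not part of the patch) -------------
// set(batch_bool, ...) above packs a variadic list of bools into the k-mask
// register, one bit per lane, using an initializer_list to force left-to-right
// evaluation in C++11/14. A freestanding sketch of the same packing:
#include <cstdint>
#include <initializer_list>

template <class... Bools>
inline uint64_t pack_mask(Bools... values)
{
    uint64_t mask = 0;
    unsigned shift = 0;
    (void)std::initializer_list<int> { ((mask |= uint64_t(values ? 1 : 0) << shift++), 0)... };
    return mask;
}
// Usage: pack_mask(true, false, true, true) == 0b1101 (lane 0 maps to the lowest bit).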
void store_unaligned(T *mem, batch const& self, requires_arch) { - return _mm512_storeu_si512((__m512i *)mem, self); - } - template::value, void>::type> - void store_unaligned(T *mem, batch_bool const& self, requires_arch) { - return _mm512_storeu_si512((__m512i *)mem, self); - } - template void store_unaligned(float *mem, batch const& self, requires_arch) { - return _mm512_storeu_ps(mem, self); - } - template void store_unaligned(double *mem, batch const& self, requires_arch) { - return _mm512_storeu_pd(mem, self); - } + // store_aligned + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_store_si512((__m512i*)mem, self); + } + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_ps(mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_store_pd(mem, self); + } - // sub - template::value, void>::type> - batch sub(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return detail::fwd_to_avx([](__m256i s, __m256i o) { return sub(batch(s), batch(o)); }, self, other); - case 2: return detail::fwd_to_avx([](__m256i s, __m256i o) { return sub(batch(s), batch(o)); }, self, other); - case 4: return _mm512_sub_epi32(self, other); - case 8: return _mm512_sub_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm512_sub_ps(self, other); - } - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm512_sub_pd(self, other); - } + // store_unaligned + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm512_storeu_si512((__m512i*)mem, self); + } + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_ps(mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm512_storeu_pd(mem, self); + } - // to_float - template - batch to_float(batch const& self, requires_arch) { - return _mm512_cvtepi32_ps(self); - } - template - batch to_float(batch const& self, requires_arch) { - // FIXME: call _mm_cvtepi64_pd - alignas(A::alignment()) int64_t buffer[batch::size]; - self.store_aligned(&buffer[0]); - return {(double)buffer[0], (double)buffer[1], (double)buffer[2], (double)buffer[3], - (double)buffer[4], (double)buffer[5], (double)buffer[6], (double)buffer[7]}; - } + // sub + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + case 2: + return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept + { return sub(batch(s), batch(o)); }, + self, other); + case 4: + return _mm512_sub_epi32(self, other); + case 8: + return 
_mm512_sub_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_ps(self, other); + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_sub_pd(self, other); + } - // to_int - template - batch to_int(batch const& self, requires_arch) { - return _mm512_cvttps_epi32(self); - } + // to_float + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + return _mm512_cvtepi32_ps(self); + } + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvtepi64_pd + alignas(A::alignment()) int64_t buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { (double)buffer[0], (double)buffer[1], (double)buffer[2], (double)buffer[3], + (double)buffer[4], (double)buffer[5], (double)buffer[6], (double)buffer[7] }; + } - template - batch to_int(batch const& self, requires_arch) { - // FIXME: call _mm_cvttpd_epi64 - alignas(A::alignment()) double buffer[batch::size]; - self.store_aligned(&buffer[0]); - return {(int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3], - (int64_t)buffer[4], (int64_t)buffer[5], (int64_t)buffer[6], (int64_t)buffer[7]}; - } + // to_int + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + return _mm512_cvttps_epi32(self); + } - // trunc - template batch trunc(batch const& self, requires_arch) { - return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); - } - template batch trunc(batch const& self, requires_arch) { - return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); - } + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvttpd_epi64 + alignas(A::alignment()) double buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { (int64_t)buffer[0], (int64_t)buffer[1], (int64_t)buffer[2], (int64_t)buffer[3], + (int64_t)buffer[4], (int64_t)buffer[5], (int64_t)buffer[6], (int64_t)buffer[7] }; + } - // zip_hi - template::value, void>::type> - batch zip_hi(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_unpackhi_epi8(self, other); - case 2: return _mm512_unpackhi_epi16(self, other); - case 4: return _mm512_unpackhi_epi32(self, other); - case 8: return _mm512_unpackhi_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm512_unpackhi_ps(self, other); - } - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm512_unpackhi_pd(self, other); - } + // trunc + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } + template + inline batch trunc(batch const& self, requires_arch) noexcept + { + return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); + } - // zip_lo - template::value, void>::type> - batch zip_lo(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm512_unpacklo_epi8(self, other); - case 2: return _mm512_unpacklo_epi16(self, other); - case 4: return _mm512_unpacklo_epi32(self, other); - case 8: return 
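// --- Illustrative aside (editor's note, not part of the patch) -------------
// to_float for 64-bit integers above (and to_int for double) has no single
// AVX512F intrinsic (the 64-bit conversions belong to the AVX512DQ extension,
// hence the FIXME), so the kernel stores the batch to an aligned buffer and
// converts lane by lane. A scalar sketch of that store-and-convert fallback:
#include <cstdint>
#include <cstddef>

inline void int64_to_double(const int64_t* in, double* out, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
        out[i] = static_cast<double>(in[i]);  // per-lane conversion, same as the buffer loop
}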
_mm512_unpacklo_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm512_unpacklo_ps(self, other); - } - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm512_unpacklo_pd(self, other); - } + // zip_hi + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_unpackhi_epi8(self, other); + case 2: + return _mm512_unpackhi_epi16(self, other); + case 4: + return _mm512_unpackhi_epi32(self, other); + case 8: + return _mm512_unpackhi_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_unpackhi_ps(self, other); + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_unpackhi_pd(self, other); + } - } + // zip_lo + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm512_unpacklo_epi8(self, other); + case 2: + return _mm512_unpacklo_epi16(self, other); + case 4: + return _mm512_unpacklo_epi32(self, other); + case 8: + return _mm512_unpacklo_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_unpacklo_ps(self, other); + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_unpacklo_pd(self, other); + } + + } } diff --git a/third_party/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/arch/xsimd_constants.hpp index a9ff97feb..1ae77e8c7 100644 --- a/third_party/xsimd/arch/xsimd_constants.hpp +++ b/third_party/xsimd/arch/xsimd_constants.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP @@ -19,350 +19,366 @@ namespace xsimd { -namespace constants { + namespace constants + { #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ - inline T NAME() noexcept \ + inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ - inline float NAME() noexcept \ + inline float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ - inline double NAME() noexcept \ + inline double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ - inline T NAME() noexcept \ + inline T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ - inline float NAME() noexcept \ + inline float NAME() noexcept \ { \ - return bit_cast((uint32_t)SINGLE); \ + return bit_cast((uint32_t)SINGLE); \ } \ template <> \ - inline double NAME() noexcept \ + inline double NAME() noexcept \ { \ - return bit_cast((uint64_t)DOUBLE); \ + return bit_cast((uint64_t)DOUBLE); \ } - XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) - XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) - XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) - XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) - XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) - XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) - XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) - XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) - XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) - XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) - XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) - XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) - XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) - XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) - XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) - XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) - XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) - XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) - XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) - XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) - XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
- XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) - XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) - XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) - XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) - XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) - XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) - XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) - XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) - XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) - XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) - XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) - XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) - XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) - XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) - XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) - XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) - XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) - XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) - XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) - XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) - XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) - XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) - XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) - XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) - XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) - XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) - XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) + XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) + XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) + XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) + XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) + XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) + XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) + XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) + XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) + XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) + XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) + XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) + XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) + XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) + XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) + XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) + XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) + XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) + XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) + XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) + XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
+ XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) + XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) + XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) + XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) + XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) + XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) + XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) + XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) + XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) + XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) + XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) + XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) + XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) + XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) + XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) + XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) + XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) + XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) + XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) + XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX - template - constexpr T allbits() noexcept; + template + constexpr T allbits() noexcept; - template - constexpr as_integer_t mask1frexp() noexcept; + template + constexpr as_integer_t mask1frexp() noexcept; - template - constexpr as_integer_t mask2frexp() noexcept; + template + constexpr as_integer_t mask2frexp() noexcept; - template - constexpr as_integer_t maxexponent() noexcept; + template + constexpr as_integer_t maxexponent() noexcept; - template - constexpr as_integer_t maxexponentm1() noexcept; + template + constexpr as_integer_t maxexponentm1() noexcept; - template - constexpr int32_t nmb() noexcept; + template + constexpr int32_t nmb() noexcept; - template - constexpr T zero() noexcept; + template + constexpr T zero() noexcept; - template - constexpr T minvalue() noexcept; + template + constexpr T minvalue() noexcept; - template - constexpr T maxvalue() noexcept; + template + constexpr T maxvalue() noexcept; - /************************** - * allbits implementation * - **************************/ + /************************** + * allbits implementation * + **************************/ - namespace detail - { - template ::value> - struct allbits_impl + namespace detail { - static constexpr T get_value() noexcept + template ::value> + struct allbits_impl + { + static constexpr T get_value() noexcept + { + return T(~0); + } + }; + + template + struct allbits_impl { - return T(~0); - } - }; + static constexpr T get_value() noexcept + { + return nan(); + } + }; + } template - struct allbits_impl + inline constexpr T allbits() noexcept { - static constexpr T get_value() noexcept - { - return 
nan(); - } - }; - } - - template - constexpr T allbits() noexcept - { - return T(detail::allbits_impl::get_value()); - } - - /***************************** - * mask1frexp implementation * - *****************************/ - - template - constexpr as_integer_t mask1frexp() noexcept - { - return as_integer_t(mask1frexp()); - } + return T(detail::allbits_impl::get_value()); + } - template <> - constexpr int32_t mask1frexp() noexcept - { - return 0x7f800000; - } + /***************************** + * mask1frexp implementation * + *****************************/ - template <> - constexpr int64_t mask1frexp() noexcept - { - return 0x7ff0000000000000; - } - - /***************************** - * mask2frexp implementation * - *****************************/ - - template - constexpr as_integer_t mask2frexp() noexcept - { - return as_integer_t(mask2frexp()); - } + template + inline constexpr as_integer_t mask1frexp() noexcept + { + return as_integer_t(mask1frexp()); + } - template <> - constexpr int32_t mask2frexp() noexcept - { - return 0x3f000000; - } + template <> + inline constexpr int32_t mask1frexp() noexcept + { + return 0x7f800000; + } - template <> - constexpr int64_t mask2frexp() noexcept - { - return 0x3fe0000000000000; - } + template <> + inline constexpr int64_t mask1frexp() noexcept + { + return 0x7ff0000000000000; + } - /****************************** - * maxexponent implementation * - ******************************/ + /***************************** + * mask2frexp implementation * + *****************************/ - template - constexpr as_integer_t maxexponent() noexcept - { - return as_integer_t(maxexponent()); - } + template + inline constexpr as_integer_t mask2frexp() noexcept + { + return as_integer_t(mask2frexp()); + } - template <> - constexpr int32_t maxexponent() noexcept - { - return 127; - } + template <> + inline constexpr int32_t mask2frexp() noexcept + { + return 0x3f000000; + } - template <> - constexpr int64_t maxexponent() noexcept - { - return 1023; - } + template <> + inline constexpr int64_t mask2frexp() noexcept + { + return 0x3fe0000000000000; + } - /****************************** - * maxexponent implementation * - ******************************/ + /****************************** + * maxexponent implementation * + ******************************/ - template - constexpr as_integer_t maxexponentm1() noexcept - { - return as_integer_t(maxexponentm1()); - } + template + inline constexpr as_integer_t maxexponent() noexcept + { + return as_integer_t(maxexponent()); + } - template <> - constexpr int32_t maxexponentm1() noexcept - { - return 126; - } + template <> + inline constexpr int32_t maxexponent() noexcept + { + return 127; + } - template <> - constexpr int64_t maxexponentm1() noexcept - { - return 1022; - } + template <> + inline constexpr int64_t maxexponent() noexcept + { + return 1023; + } - /********************** - * nmb implementation * - **********************/ + /****************************** + * maxexponent implementation * + ******************************/ - template - constexpr int32_t nmb() noexcept - { - return nmb(); - } + template + inline constexpr as_integer_t maxexponentm1() noexcept + { + return as_integer_t(maxexponentm1()); + } - template <> - constexpr int32_t nmb() noexcept - { - return 23; - } + template <> + inline constexpr int32_t maxexponentm1() noexcept + { + return 126; + } - template <> - constexpr int32_t nmb() noexcept - { - return 52; - } + template <> + inline constexpr int64_t maxexponentm1() noexcept + { + return 1022; + } - 
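For reference, the hex-encoded constants and the frexp masks above map directly onto the IEEE-754 bit layout. The following standalone sketch (not part of this patch) illustrates that mapping for binary32; it assumes C++20 std::bit_cast as a stand-in for xsimd's internal bit_cast helper, and all names in it are illustrative only.

#include <bit>
#include <cassert>
#include <cstdint>
#include <limits>

int main()
{
    // 0x7f800000 (the value returned by mask1frexp<float>()) selects the
    // exponent field of a binary32; with a zero mantissa it is +infinity.
    constexpr std::uint32_t exponent_mask = 0x7f800000u;
    assert(std::bit_cast<float>(exponent_mask) == std::numeric_limits<float>::infinity());

    // nmb<float>() == 23 stored mantissa bits and maxexponent<float>() == 127
    // (the exponent bias) recover the unbiased exponent of a finite float.
    constexpr int mantissa_bits = 23;
    constexpr int bias = 127;
    const float x = 6.0f; // 1.5 * 2^2
    const std::uint32_t bits = std::bit_cast<std::uint32_t>(x);
    const int exponent = static_cast<int>((bits & exponent_mask) >> mantissa_bits) - bias;
    assert(exponent == 2);
    return 0;
}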
/*********************** - * zero implementation * - ***********************/ + /********************** + * nmb implementation * + **********************/ - template - constexpr T zero() noexcept - { - return T(typename T::value_type(0)); - } + template + inline constexpr int32_t nmb() noexcept + { + return nmb(); + } - /*************************** - * minvalue implementation * - ***************************/ + template <> + inline constexpr int32_t nmb() noexcept + { + return 23; + } - namespace detail - { - template - struct minvalue_impl + template <> + inline constexpr int32_t nmb() noexcept { - static constexpr T get_value() noexcept - { - return std::numeric_limits::min(); - } - }; + return 52; + } + + /*********************** + * zero implementation * + ***********************/ template - struct minvalue_common + inline constexpr T zero() noexcept { - static constexpr T get_value() noexcept - { - return std::numeric_limits::min(); - } - }; + return T(typename T::value_type(0)); + } - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; - template <> - struct minvalue_impl : minvalue_common {}; + /*************************** + * minvalue implementation * + ***************************/ - template <> - struct minvalue_impl + namespace detail { - static float get_value() noexcept + template + struct minvalue_impl + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template + struct minvalue_common + { + static constexpr T get_value() noexcept + { + return std::numeric_limits::min(); + } + }; + + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common + { + }; + template <> + struct minvalue_impl : minvalue_common { - return bit_cast((uint32_t)0xff7fffff); - } - }; + }; + template <> + struct minvalue_impl : minvalue_common + { + }; - template <> - struct minvalue_impl - { - static double get_value() noexcept + template <> + struct minvalue_impl { - return bit_cast((uint64_t)0xffefffffffffffff); - } - }; - } + static float get_value() noexcept + { + return bit_cast((uint32_t)0xff7fffff); + } + }; + + template <> + struct minvalue_impl + { + static double get_value() noexcept + { + return bit_cast((uint64_t)0xffefffffffffffff); + } + }; + } - template - constexpr T minvalue() noexcept - { - return T(detail::minvalue_impl::get_value()); - } + template + inline constexpr T minvalue() noexcept + { + return T(detail::minvalue_impl::get_value()); + } - /*************************** - * maxvalue implementation * - ***************************/ + /*************************** + * maxvalue implementation * + ***************************/ - template - constexpr T maxvalue() noexcept - { - return T(std::numeric_limits::max()); + template + inline constexpr T maxvalue() noexcept + { + return T(std::numeric_limits::max()); + } } -} } #endif - diff --git a/third_party/xsimd/arch/xsimd_fma3.hpp 
b/third_party/xsimd/arch/xsimd_fma3.hpp index e4646e90d..fa86c22a5 100644 --- a/third_party/xsimd/arch/xsimd_fma3.hpp +++ b/third_party/xsimd/arch/xsimd_fma3.hpp @@ -1,63 +1,79 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_FMA3_HPP #define XSIMD_FMA3_HPP -#include "../types/xsimd_sse_register.hpp" +#include "../types/xsimd_fma3_register.hpp" + +namespace xsimd +{ + + namespace kernel + { + using namespace types; + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fnmadd_ps(x, y, z); + } + + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fnmadd_pd(x, y, z); + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fnmsub_ps(x, y, z); + } + + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fnmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fmadd_ps(x, y, z); + } + + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fmadd_pd(x, y, z); + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fmsub_ps(x, y, z); + } + + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm_fmsub_pd(x, y, z); + } - -namespace xsimd { - - namespace kernel { - using namespace types; - // fnma - template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fnmadd_ps(x, y, z); - } - - template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fnmadd_pd(x, y, z); - } - - // fnms - template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fnmsub_ps(x, y, z); - } - - template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fnmsub_pd(x, y, z); - } - - // fma - template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fmadd_ps(x, y, z); - } - - template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fmadd_pd(x, y, z); - } - - // fms - template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fmsub_ps(x, y, z); } - template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm_fmsub_pd(x, y, z); - } - - } - } #endif - diff 
--git a/third_party/xsimd/arch/xsimd_fma4.hpp b/third_party/xsimd/arch/xsimd_fma4.hpp index 814bcd145..21df72416 100644 --- a/third_party/xsimd/arch/xsimd_fma4.hpp +++ b/third_party/xsimd/arch/xsimd_fma4.hpp @@ -1,63 +1,79 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_FMA3_HPP #define XSIMD_FMA3_HPP #include "../types/xsimd_sse_register.hpp" - -namespace xsimd { - - namespace kernel { - using namespace types; - - // fnma - template batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_nmacc_ps(x, y, z); - } - - template batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_nmacc_pd(x, y, z); - } - - // fnms - template batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_nmsub_ps(x, y, z); - } - - template batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_nmsub_pd(x, y, z); +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_ps(x, y, z); + } + + template + inline batch fnma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmacc_pd(x, y, z); + } + + // fnms + template + inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_ps(x, y, z); + } + + template + inline batch fnms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_nmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_ps(x, y, z); + } + + template + inline batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_macc_pd(x, y, z); + } + + // fms + template + inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_ps(x, y, z); + } + + template + inline batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) noexcept + { + return _mm_msub_pd(x, y, z); + } } - // fma - template batch fma(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_macc_ps(x, y, z); - } - - template batch fma(simd_register const& x, simd_register const& y, simd_register const& z, 
requires_arch) { - return _mm_macc_pd(x, y, z); - } - - // fms - template batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_msub_ps(x, y, z); - } - - template batch fms(simd_register const& x, simd_register const& y, simd_register const& z, requires_arch) { - return _mm_msub_pd(x, y, z); - } - } - } #endif - diff --git a/third_party/xsimd/arch/xsimd_fma5.hpp b/third_party/xsimd/arch/xsimd_fma5.hpp index 786949d25..51e767240 100644 --- a/third_party/xsimd/arch/xsimd_fma5.hpp +++ b/third_party/xsimd/arch/xsimd_fma5.hpp @@ -1,64 +1,80 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_FMA5_HPP #define XSIMD_FMA5_HPP #include "../types/xsimd_fma5_register.hpp" +namespace xsimd +{ + + namespace kernel + { + using namespace types; + + // fnma + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fnmadd_ps(x, y, z); + } + + template + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fnmadd_pd(x, y, z); + } + + // fnms + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fnmsub_ps(x, y, z); + } + + template + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fnmsub_pd(x, y, z); + } + + // fma + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fmadd_ps(x, y, z); + } + + template + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fmadd_pd(x, y, z); + } + + // fms + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fmsub_ps(x, y, z); + } + + template + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return _mm256_fmsub_pd(x, y, z); + } -namespace xsimd { - - namespace kernel { - using namespace types; - - // fnma - template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fnmadd_ps(x, y, z); - } - - template batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fnmadd_pd(x, y, z); - } - - // fnms - template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fnmsub_ps(x, y, z); - } - - template batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fnmsub_pd(x, y, z); - } - - // fma - template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { - return 
_mm256_fmadd_ps(x, y, z); - } - - template batch fma(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fmadd_pd(x, y, z); - } - - // fms - template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fmsub_ps(x, y, z); } - template batch fms(batch const& x, batch const& y, batch const& z, requires_arch) { - return _mm256_fmsub_pd(x, y, z); - } - - - } - } #endif diff --git a/third_party/xsimd/arch/xsimd_generic.hpp b/third_party/xsimd/arch/xsimd_generic.hpp index 338413d36..6403cfb0f 100644 --- a/third_party/xsimd/arch/xsimd_generic.hpp +++ b/third_party/xsimd/arch/xsimd_generic.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_GENERIC_HPP #define XSIMD_GENERIC_HPP @@ -21,4 +21,3 @@ #include "./generic/xsimd_generic_trigo.hpp" #endif - diff --git a/third_party/xsimd/arch/xsimd_generic_fwd.hpp b/third_party/xsimd/arch/xsimd_generic_fwd.hpp index 8326066e1..d66ae7d6b 100644 --- a/third_party/xsimd/arch/xsimd_generic_fwd.hpp +++ b/third_party/xsimd/arch/xsimd_generic_fwd.hpp @@ -1,35 +1,37 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_GENERIC_FWD_HPP #define XSIMD_GENERIC_FWD_HPP #include -namespace xsimd { +namespace xsimd +{ - namespace kernel { - // forward declaration - template::value, void>::type> - batch abs(batch const& self, requires_arch); - template::value, void>::type> - batch bitwise_lshift(batch const& self, batch const& other, requires_arch); - template::value, void>::type> - batch bitwise_rshift(batch const& self, batch const& other, requires_arch); - template batch_bool gt(batch const& self, batch const& other, requires_arch); - template::value, void>::type> - batch mul(batch const& self, batch const& other, requires_arch); + namespace kernel + { + // forward declaration + template ::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept; + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept; - } + } } #endif - diff --git a/third_party/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/arch/xsimd_isa.hpp index c3f868740..e9c847cf7 100644 --- a/third_party/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/arch/xsimd_isa.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_ISA_HPP #define XSIMD_ISA_HPP @@ -72,4 +72,3 @@ #include "./xsimd_generic.hpp" #endif - diff --git a/third_party/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/arch/xsimd_neon.hpp index e3b02018a..42e256506 100644 --- a/third_party/xsimd/arch/xsimd_neon.hpp +++ b/third_party/xsimd/arch/xsimd_neon.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_NEON_HPP #define XSIMD_NEON_HPP @@ -23,63 +23,72 @@ // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ - inline RT OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ - inline RT OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ - inline RT OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ - inline RT OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ - inline RT OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept { return ::OP##_u8(a, b); } \ + inline RT OP##_s8(int8x16_t a, int8x16_t b) noexcept { return ::OP##_s8(a, b); } \ + inline RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept { return ::OP##_u16(a, b); } \ + inline RT OP##_s16(int16x8_t a, int16x8_t b) noexcept { return ::OP##_s16(a, b); } \ + inline RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept { return ::OP##_u32(a, b); } \ + inline RT OP##_s32(int32x4_t a, int32x4_t b) noexcept { return ::OP##_s32(a, b); } \ } -#define WRAP_BINARY_INT(OP, RT) \ - WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ - inline RT OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept { return ::OP##_u64(a, b); } \ + inline RT OP##_s64(int64x2_t a, int64x2_t b) noexcept { return ::OP##_s64(a, b); } \ } -#define WRAP_BINARY_FLOAT(OP, RT) \ - namespace wrap { \ - inline RT OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_f32(float32x4_t a, float32x4_t b) noexcept { return ::OP##_f32(a, b); } \ } -#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint8x16_t OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ - inline int8x16_t OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ - inline uint16x8_t OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ - inline int16x8_t OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ - inline uint32x4_t OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ - inline int32x4_t OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint8x16_t OP##_u8(uint8x16_t a) noexcept { return ::OP##_u8(a); } \ + inline int8x16_t OP##_s8(int8x16_t a) noexcept { return ::OP##_s8(a); } \ + inline uint16x8_t OP##_u16(uint16x8_t a) noexcept { return ::OP##_u16(a); } \ + inline int16x8_t OP##_s16(int16x8_t a) noexcept { return ::OP##_s16(a); } \ + inline uint32x4_t OP##_u32(uint32x4_t a) noexcept { return ::OP##_u32(a); } \ + inline int32x4_t OP##_s32(int32x4_t a) noexcept { return ::OP##_s32(a); } \ } -#define WRAP_UNARY_INT(OP) \ - WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint64x2_t OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ - inline int64x2_t OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ +#define 
WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint64x2_t OP##_u64(uint64x2_t a) noexcept { return ::OP##_u64(a); } \ + inline int64x2_t OP##_s64(int64x2_t a) noexcept { return ::OP##_s64(a); } \ } -#define WRAP_UNARY_FLOAT(OP) \ - namespace wrap { \ - inline float32x4_t OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap \ + { \ + inline float32x4_t OP##_f32(float32x4_t a) noexcept { return ::OP##_f32(a); } \ } // Dummy identity caster to ease coding -inline uint8x16_t vreinterpretq_u8_u8 (uint8x16_t arg) { return arg; } -inline int8x16_t vreinterpretq_s8_s8 (int8x16_t arg) { return arg; } -inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) { return arg; } -inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) { return arg; } -inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) { return arg; } -inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) { return arg; } -inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) { return arg; } -inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) { return arg; } -inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) { return arg; } +inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } +inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } +inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } +inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } +inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } +inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } +inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } +inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } +inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } namespace xsimd { + template + struct batch_bool_constant; + namespace kernel { using namespace types; @@ -95,7 +104,7 @@ namespace xsimd const container_type m_func; template - return_type apply(U rhs) const + return_type apply(U rhs) const noexcept { using func_type = return_type (*)(U); auto func = xsimd::detail::get(m_func); @@ -105,11 +114,11 @@ namespace xsimd struct binary { - using container_type = std::tuple (*)(T, T) ...>; + using container_type = std::tuple (*)(T, T)...>; const container_type m_func; template - return_type apply(U lhs, U rhs) const + return_type apply(U lhs, U rhs) const noexcept { using func_type = return_type (*)(U, U); auto func = xsimd::detail::get(m_func); @@ -124,23 +133,22 @@ namespace xsimd template using identity_return_type = T; - + template struct neon_dispatcher_impl : neon_dispatcher_base { }; - using neon_dispatcher = neon_dispatcher_impl; + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; using excluding_int64_dispatcher = neon_dispatcher_impl; + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + float32x4_t>; /************************** * comparison dispatchers * @@ -196,7 +204,7 @@ namespace xsimd { using type = uint64x2_t; }; - + template <> struct comp_return_type_impl { @@ -212,9 +220,9 @@ namespace xsimd }; using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl; + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + float32x4_t>; /************************************** * enabling / disabling metafunctions * @@ -228,25 +236,20 @@ namespace xsimd int>::type; template - using enable_sized_signed_t = typename 
std::enable_if::value && - std::is_signed::value && - sizeof(T) == S, int>::type; + using enable_sized_signed_t = typename std::enable_if::value && std::is_signed::value && sizeof(T) == S, int>::type; template - using enable_sized_unsigned_t = typename std::enable_if::value && - !std::is_signed::value && - sizeof(T) == S, int>::type; + using enable_sized_unsigned_t = typename std::enable_if::value && !std::is_signed::value && sizeof(T) == S, int>::type; template - using enable_sized_integral_t = typename std::enable_if::value && - sizeof(T) == S, int>::type; + using enable_sized_integral_t = typename std::enable_if::value && sizeof(T) == S, int>::type; template using enable_sized_t = typename std::enable_if::type; template using exclude_int64_neon_t - = typename std::enable_if<(std::is_integral::value && sizeof(T) != 8) || std::is_same::value, int>::type; + = typename std::enable_if<(std::is_integral::value && sizeof(T) != 8) || std::is_same::value, int>::type; } /************* @@ -254,55 +257,55 @@ namespace xsimd *************/ template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u8(uint8_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s8(int8_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u16(uint16_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s16(int16_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u32(uint32_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s32(int32_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u64(uint64_t(val)); } template = 0> - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s64(int64_t(val)); } template - batch broadcast(float val, requires_arch) + inline batch broadcast(float val, requires_arch) noexcept { return vdupq_n_f32(val); } @@ -312,31 +315,31 @@ namespace xsimd *******/ template = 0> - batch set(batch const&, requires_arch, Args... args) + inline batch set(batch const&, requires_arch, Args... args) noexcept { - return xsimd::types::detail::neon_vector_type{args...}; + return xsimd::types::detail::neon_vector_type { args... }; } template = 0> - batch_bool set(batch_bool const&, requires_arch, Args... args) + inline batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? -1LL : 0LL)...}; + return register_type { static_cast(args ? -1LL : 0LL)... }; } template - batch set(batch const&, requires_arch, float f0, float f1, float f2, float f3) + inline batch set(batch const&, requires_arch, float f0, float f1, float f2, float f3) noexcept { - return float32x4_t{f0, f1, f2, f3}; + return float32x4_t { f0, f1, f2, f3 }; } template - batch_bool set(batch_bool const&, requires_arch, Args... args) + inline batch_bool set(batch_bool const&, requires_arch, Args... 
args) noexcept { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? -1LL : 0LL)...}; + return register_type { static_cast(args ? -1LL : 0LL)... }; } /************* @@ -344,55 +347,55 @@ namespace xsimd *************/ template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u8(arg, vdupq_n_u8(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_s8(reinterpret_cast(arg.data), vdupq_n_s8(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u16(arg, vdupq_n_u16(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_s16(reinterpret_cast(arg.data), vdupq_n_s16(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u32(arg, vdupq_n_u32(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_s32(reinterpret_cast(arg.data), vdupq_n_s32(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u64(arg, vdupq_n_u64(1)); } template = 0> - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_s64(reinterpret_cast(arg.data), vdupq_n_s64(1)); } template - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f)))); } @@ -402,58 +405,58 @@ namespace xsimd ********/ template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_u8(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_s8(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_u16(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_s16(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_u32(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_s32(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_u64(src); } template = 0> - batch load_aligned(T const* src, convert, requires_arch) + inline batch load_aligned(T const* src, convert, requires_arch) noexcept { return vld1q_s64(src); } 
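The load overloads above all follow the same pattern: one generic entry point per operation, with SFINAE on the element size and signedness selecting the matching vld1q_* intrinsic. A minimal standalone sketch of that dispatch idiom follows (simplified names, not xsimd's actual templates; an ARM target with <arm_neon.h> is assumed).

#include <arm_neon.h>
#include <cstdint>
#include <type_traits>

template <class T, std::size_t S>
using enable_sized_unsigned_t =
    typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;

template <class T, std::size_t S>
using enable_sized_signed_t =
    typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;

// 4-byte unsigned elements resolve to the uint32x4_t intrinsic.
template <class T, enable_sized_unsigned_t<T, 4> = 0>
uint32x4_t load_q(T const* src) noexcept
{
    return vld1q_u32(reinterpret_cast<uint32_t const*>(src));
}

// 4-byte signed elements resolve to the int32x4_t intrinsic.
template <class T, enable_sized_signed_t<T, 4> = 0>
int32x4_t load_q(T const* src) noexcept
{
    return vld1q_s32(reinterpret_cast<int32_t const*>(src));
}

void demo(uint32_t const* u, int32_t const* s) noexcept
{
    auto uv = load_q(u); // picks the vld1q_u32 overload
    auto sv = load_q(s); // picks the vld1q_s32 overload
    (void)uv;
    (void)sv;
}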
template - batch load_aligned(float const* src, convert, requires_arch) + inline batch load_aligned(float const* src, convert, requires_arch) noexcept { return vld1q_f32(src); } template - batch load_unaligned(T const* src, convert, requires_arch) + inline batch load_unaligned(T const* src, convert, requires_arch) noexcept { - return load_aligned(src, convert(), A{}); + return load_aligned(src, convert(), A {}); } /********* @@ -461,63 +464,63 @@ namespace xsimd *********/ template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u8(dst, src); } template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s8(dst, src); } - + template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u16(dst, src); } - + template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s16(dst, src); } - + template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u32(dst, src); } - + template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s32(dst, src); } - + template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u64(dst, src); } template = 0> - void store_aligned(T* dst, batch const& src, requires_arch) + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s64(dst, src); } template - void store_aligned(float* dst, batch const& src, requires_arch) + inline void store_aligned(float* dst, batch const& src, requires_arch) noexcept { vst1q_f32(dst, src); } template - void store_unaligned(T* dst, batch const& src, requires_arch) + inline void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { - store_aligned(dst, src, A{}); + store_aligned(dst, src, A {}); } /**************** @@ -525,20 +528,20 @@ namespace xsimd ****************/ template - batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) + inline batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; const float* buf = reinterpret_cast(mem); float32x4x2_t tmp = vld2q_f32(buf); real_batch real = tmp.val[0], imag = tmp.val[1]; - return batch, A>{real, imag}; + return batch, A> { real, imag }; } template - batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) + inline batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) noexcept { - return load_complex_aligned(mem, cvt, A{}); + return load_complex_aligned(mem, cvt, A {}); } /***************** @@ -546,7 +549,7 @@ namespace xsimd *****************/ template - void store_complex_aligned(std::complex* dst, batch ,A> const& src, requires_arch) + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { float32x4x2_t tmp; tmp.val[0] = src.real(); @@ -556,9 +559,9 @@ namespace xsimd } template - void store_complex_unaligned(std::complex* dst, batch ,A> const& src, 
requires_arch) + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { - store_complex_aligned(dst, src, A{}); + store_complex_aligned(dst, src, A {}); } /******* @@ -566,55 +569,55 @@ namespace xsimd *******/ template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); } template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s8(rhs); } template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); } template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s16(rhs); } template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); } template = 0> - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s32(rhs); } - template = 0> - batch neg(batch const& rhs, requires_arch) + template = 0> + inline batch neg(batch const& rhs, requires_arch) noexcept { - return batch({-rhs.get(0), -rhs.get(1)}); + return batch({ -rhs.get(0), -rhs.get(1) }); } - template = 0> - batch neg(batch const& rhs, requires_arch) + template = 0> + inline batch neg(batch const& rhs, requires_arch) noexcept { - return batch({-rhs.get(0), -rhs.get(1)}); + return batch({ -rhs.get(0), -rhs.get(1) }); } - + template - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_f32(rhs); } @@ -627,11 +630,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) template = 0> - batch add(batch const& lhs, batch const& rhs, requires_arch) + inline batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { + const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, wrap::vaddq_f32) @@ -646,11 +648,10 @@ namespace xsimd WRAP_BINARY_INT(vqaddq, detail::identity_return_type) template = 0> - batch sadd(batch const& lhs, batch const& rhs, requires_arch) + inline batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { + const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64, wrap::vaddq_f32) @@ -666,11 +667,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) template = 0> - batch sub(batch const& lhs, batch const& rhs, requires_arch) + inline batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { + const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, 
wrap::vsubq_s16, wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64, wrap::vsubq_f32) @@ -685,11 +685,10 @@ namespace xsimd WRAP_BINARY_INT(vqsubq, detail::identity_return_type) template = 0> - batch ssub(batch const& lhs, batch const& rhs, requires_arch) + inline batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { + const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64, wrap::vsubq_f32) @@ -697,7 +696,6 @@ namespace xsimd return dispatcher.apply(register_type(lhs), register_type(rhs)); } - /******* * mul * *******/ @@ -706,11 +704,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) template = 0> - batch mul(batch const& lhs, batch const& rhs, requires_arch) + inline batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { + const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) }; @@ -722,21 +719,21 @@ namespace xsimd *******/ #if defined(XSIMD_FAST_INTEGER_DIVISION) - template = 0> - batch div(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); } - template = 0> - batch div(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); } #endif template - batch div(batch const& lhs, batch const& rhs, requires_arch) + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html // get an initial estimate of 1/b. 
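Note: the hunk above only keeps the opening comment of the NEON float division routine ("get an initial estimate of 1/b"); the refinement steps fall outside the diff context. For orientation, the standard technique (the one the Ne10 link describes) looks roughly like the sketch below — vrecpeq_f32 gives a coarse estimate of 1/b and each vrecpsq_f32/vmulq_f32 pair performs one Newton-Raphson step. This is illustrative only, not the exact elided body of the patch.

#include <arm_neon.h>

// Illustrative only: emulate a / b on 32-bit NEON, which has no vdivq_f32.
static inline float32x4_t div_f32_sketch(float32x4_t a, float32x4_t b)
{
    // Coarse (~8-bit) estimate of 1/b.
    float32x4_t reciprocal = vrecpeq_f32(b);
    // Two Newton-Raphson refinements: r <- r * (2 - b * r).
    reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
    // a / b ~= a * (1 / b).
    return vmulq_f32(a, reciprocal);
}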
@@ -760,11 +757,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) template = 0> - batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) }; @@ -772,27 +768,26 @@ namespace xsimd } template = 0> - batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; using dispatcher_type = detail::neon_comp_dispatcher_impl::binary; - const dispatcher_type dispatcher = - { + const dispatcher_type dispatcher = { std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); } template = 0> - batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return batch_bool({lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1)}); + return batch_bool({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); } template = 0> - batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return batch_bool({lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1)}); + return batch_bool({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); } /****** @@ -803,11 +798,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) template = 0> - batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) }; @@ -815,9 +809,9 @@ namespace xsimd } template = 0> - batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return batch_bool({lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1)}); + return batch_bool({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) }); } /****** @@ -828,11 +822,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) template = 0> - batch_bool le(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16, wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32) }; @@ -840,9 +833,9 @@ namespace xsimd } template = 0> - batch_bool le(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) 
noexcept { - return batch_bool({lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1)}); + return batch_bool({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) }); } /****** @@ -853,11 +846,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) template = 0> - batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16, wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32) }; @@ -865,9 +857,9 @@ namespace xsimd } template = 0> - batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return batch_bool({lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1)}); + return batch_bool({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) }); } /****** @@ -878,11 +870,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) template = 0> - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16, wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32) }; @@ -890,9 +881,9 @@ namespace xsimd } template = 0> - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return batch_bool({lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1)}); + return batch_bool({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) }); } /*************** @@ -903,7 +894,7 @@ namespace xsimd namespace detail { - inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) + inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); @@ -912,8 +903,7 @@ namespace xsimd template V bitwise_and_neon(V const& lhs, V const& rhs) { - const neon_dispatcher::binary dispatcher = - { + const neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16, wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64, bitwise_and_f32) @@ -923,14 +913,14 @@ namespace xsimd } template = 0> - batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); } template = 0> - batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); @@ -944,17 +934,16 @@ namespace xsimd namespace detail { - inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) + 
inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } template - V bitwise_or_neon(V const& lhs, V const& rhs) + inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept { - const neon_dispatcher::binary dispatcher = - { + const neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16, wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64, bitwise_or_f32) @@ -964,14 +953,14 @@ namespace xsimd } template = 0> - batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); } template = 0> - batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); @@ -985,17 +974,16 @@ namespace xsimd namespace detail { - inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) + inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } template - V bitwise_xor_neon(V const& lhs, V const& rhs) + inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept { - const neon_dispatcher::binary dispatcher = - { + const neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16, wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64, bitwise_xor_f32) @@ -1005,14 +993,14 @@ namespace xsimd } template = 0> - batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); } template = 0> - batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); @@ -1023,9 +1011,9 @@ namespace xsimd *******/ template - batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return bitwise_xor(lhs, rhs, A{}); + return bitwise_xor(lhs, rhs, A {}); } /*************** @@ -1036,26 +1024,25 @@ namespace xsimd namespace detail { - inline int64x2_t bitwise_not_s64(int64x2_t arg) + inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept { return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg))); } - inline uint64x2_t bitwise_not_u64(uint64x2_t arg) + inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg))); } - inline float32x4_t bitwise_not_f32(float32x4_t arg) + inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept { return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg))); } 
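Aside: the *_neon helpers in these hunks (bitwise_and_neon and bitwise_or_neon above, bitwise_not_neon just below) all rely on the same dispatch idiom — a tuple of wrapper function pointers, one per NEON register type, from which the entry whose signature matches the argument is selected at compile time. The following stand-alone sketch loosely mirrors that idiom with illustrative names; it is not xsimd's actual detail:: helper.

#include <tuple>
#include <cstdint>

// Two concrete "kernels" standing in for the wrap::v*q_* intrinsic wrappers.
static inline std::int32_t add_i32(std::int32_t a, std::int32_t b) { return a + b; }
static inline float add_f32(float a, float b) { return a + b; }

// A tuple of function pointers; apply() picks the one whose signature
// matches the deduced argument type, so no runtime branching is needed.
template <class... Fs>
struct binary_dispatcher
{
    std::tuple<Fs...> funcs;

    template <class U>
    U apply(U lhs, U rhs) const
    {
        auto f = std::get<U (*)(U, U)>(funcs); // selection happens at compile time
        return f(lhs, rhs);
    }
};

// Usage: d.apply(1, 2) routes to add_i32, d.apply(1.0f, 2.0f) to add_f32.
static const binary_dispatcher<std::int32_t (*)(std::int32_t, std::int32_t),
                               float (*)(float, float)>
    d { std::make_tuple(&add_i32, &add_f32) };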
template - V bitwise_not_neon(V const& arg) + inline V bitwise_not_neon(V const& arg) noexcept { - const neon_dispatcher::unary dispatcher = - { + const neon_dispatcher::unary dispatcher = { std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16, wrap::vmvnq_u32, wrap::vmvnq_s32, bitwise_not_u64, bitwise_not_s64, @@ -1066,14 +1053,14 @@ namespace xsimd } template = 0> - batch bitwise_not(batch const& arg, requires_arch) + inline batch bitwise_not(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return detail::bitwise_not_neon(register_type(arg)); } template = 0> - batch_bool bitwise_not(batch_bool const& arg, requires_arch) + inline batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return detail::bitwise_not_neon(register_type(arg)); @@ -1087,16 +1074,15 @@ namespace xsimd namespace detail { - inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) + inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } template - V bitwise_andnot_neon(V const& lhs, V const& rhs) + inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept { - const detail::neon_dispatcher::binary dispatcher = - { + const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16, wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64, bitwise_andnot_f32) @@ -1106,14 +1092,14 @@ namespace xsimd } template = 0> - batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); } template = 0> - batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); @@ -1127,11 +1113,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) template = 0> - batch min(batch const& lhs, batch const& rhs, requires_arch) + inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { + const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) }; @@ -1139,7 +1124,7 @@ namespace xsimd } template = 0> - batch min(batch const& lhs, batch const& rhs, requires_arch) + inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) }; } @@ -1152,11 +1137,10 @@ namespace xsimd WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) template = 0> - batch max(batch const& lhs, batch const& rhs, requires_arch) + inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { 
+ const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) }; @@ -1164,7 +1148,7 @@ namespace xsimd } template = 0> - batch max(batch const& lhs, batch const& rhs, requires_arch) + inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) }; } @@ -1173,37 +1157,37 @@ namespace xsimd * abs * *******/ - namespace wrap { - inline int8x16_t vabsq_s8 (int8x16_t a) { return ::vabsq_s8 (a); } - inline int16x8_t vabsq_s16(int16x8_t a) { return ::vabsq_s16(a); } - inline int32x4_t vabsq_s32(int32x4_t a) { return ::vabsq_s32(a); } + namespace wrap + { + inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); } + inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); } + inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); } } WRAP_UNARY_FLOAT(vabsq) namespace detail { - inline uint8x16_t abs_u8(uint8x16_t arg) + inline uint8x16_t abs_u8(uint8x16_t arg) noexcept { return arg; } - inline uint16x8_t abs_u16(uint16x8_t arg) + inline uint16x8_t abs_u16(uint16x8_t arg) noexcept { return arg; } - inline uint32x4_t abs_u32(uint32x4_t arg) + inline uint32x4_t abs_u32(uint32x4_t arg) noexcept { return arg; } } template = 0> - batch abs(batch const& arg, requires_arch) + inline batch abs(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::unary dispatcher = - { + const detail::excluding_int64_dispatcher::unary dispatcher = { std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) }; @@ -1215,7 +1199,7 @@ namespace xsimd ********/ template - batch sqrt(batch const& arg, requires_arch) + inline batch sqrt(batch const& arg, requires_arch) noexcept { batch sqrt_reciprocal = vrsqrteq_f32(arg); // one iter @@ -1231,13 +1215,13 @@ namespace xsimd #ifdef __ARM_FEATURE_FMA template - batch fma(batch const& x, batch const& y, batch const& z, requires_arch) + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f32(z, x, y); } template - batch fms(batch const& x, batch const& y, batch const& z, requires_arch) + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f32(-z, x, y); } @@ -1250,7 +1234,7 @@ namespace xsimd namespace detail { template - T sum_batch(V const& arg) + inline T sum_batch(V const& arg) noexcept { T res = T(0); for (std::size_t i = 0; i < batch::size; ++i) @@ -1262,35 +1246,45 @@ namespace xsimd } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg)); - return detail::sum_batch(tmp); + tmp = vpadd_u8(tmp, tmp); + tmp = vpadd_u8(tmp, tmp); + tmp = vpadd_u8(tmp, tmp); + return vget_lane_u8(tmp, 0); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg)); - return detail::sum_batch(tmp); + tmp = vpadd_s8(tmp, tmp); + tmp = vpadd_s8(tmp, tmp); + tmp = vpadd_s8(tmp, tmp); + return vget_lane_s8(tmp, 0); } template = 
0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg)); - return detail::sum_batch(tmp); + tmp = vpadd_u16(tmp, tmp); + tmp = vpadd_u16(tmp, tmp); + return vget_lane_u16(tmp, 0); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg)); - return detail::sum_batch(tmp); + tmp = vpadd_s16(tmp, tmp); + tmp = vpadd_s16(tmp, tmp); + return vget_lane_s16(tmp, 0); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg)); tmp = vpadd_u32(tmp, tmp); @@ -1298,7 +1292,7 @@ namespace xsimd } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg)); tmp = vpadd_s32(tmp, tmp); @@ -1306,13 +1300,13 @@ namespace xsimd } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return arg.get(0) + arg.get(1); } template - float hadd(batch const& arg, requires_arch) + inline float hadd(batch const& arg, requires_arch) noexcept { float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg)); tmp = vpadd_f32(tmp, tmp); @@ -1324,7 +1318,7 @@ namespace xsimd *********/ template - batch haddp(const batch* row, requires_arch) + inline batch haddp(const batch* row, requires_arch) noexcept { // row = (a,b,c,d) float32x2_t tmp1, tmp2, tmp3; @@ -1348,16 +1342,17 @@ namespace xsimd * select * **********/ - namespace wrap { - inline uint8x16_t vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } - inline int8x16_t vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } - inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } - inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } - inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } - inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } - inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } - inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } - inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } + namespace wrap + { + inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); } + inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); } + inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); } + inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); } + inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); } + inline 
int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); } + inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); } + inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); } + inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); } } namespace detail @@ -1369,7 +1364,7 @@ namespace xsimd const container_type m_func; template - U apply(comp_return_type cond, U lhs, U rhs) const + U apply(comp_return_type cond, U lhs, U rhs) const noexcept { using func_type = U (*)(comp_return_type, U, U); auto func = xsimd::detail::get(m_func); @@ -1378,19 +1373,18 @@ namespace xsimd }; using neon_select_dispatcher = neon_select_dispatcher_impl; + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; } template = 0> - batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) + inline batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { using bool_register_type = typename batch_bool::register_type; using register_type = typename batch::register_type; - const detail::neon_select_dispatcher dispatcher = - { + const detail::neon_select_dispatcher dispatcher = { std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16, wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64, wrap::vbslq_f32) @@ -1399,9 +1393,9 @@ namespace xsimd } template = 0> - batch select(batch_bool_constant, b...> const&, batch const& true_br, batch const& false_br, requires_arch) + inline batch select(batch_bool_constant, b...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return select(batch_bool{b...}, true_br, false_br, neon{}); + return select(batch_bool { b... 
}, true_br, false_br, neon {}); } /********** @@ -1409,61 +1403,61 @@ namespace xsimd **********/ template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); } template - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); @@ -1474,61 +1468,61 @@ namespace xsimd **********/ template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { int16x4x2_t tmp = 
vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); } template - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); @@ -1541,14 +1535,14 @@ namespace xsimd namespace detail { template - batch extract_pair(batch const&, batch const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) + inline batch extract_pair(batch const&, batch const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept { assert(false && "extract_pair out of bounds"); - return batch{}; + return batch {}; } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1561,7 +1555,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1574,7 +1568,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1587,7 +1581,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1600,7 +1594,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1613,7 +1607,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1626,7 +1620,7 @@ namespace xsimd } template = 
0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1639,7 +1633,7 @@ namespace xsimd } template = 0> - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1652,7 +1646,7 @@ namespace xsimd } template - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -1665,7 +1659,7 @@ namespace xsimd } template - batch extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) + inline batch extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept { if (n == 0) { @@ -1679,10 +1673,10 @@ namespace xsimd } template - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; - assert(0<= n && n< size && "index in bounds"); + assert(0 <= n && n < size && "index in bounds"); return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence()); } @@ -1693,14 +1687,14 @@ namespace xsimd namespace detail { template - batch bitwise_lshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) + inline batch bitwise_lshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept { assert(false && "bitwise_lshift out of bounds"); - return batch{}; + return batch {}; } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1713,7 +1707,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1726,7 +1720,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1739,7 +1733,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1752,7 +1746,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1765,7 +1759,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1778,7 +1772,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch 
bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1791,7 +1785,7 @@ namespace xsimd } template = 0> - batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1804,7 +1798,7 @@ namespace xsimd } template - batch bitwise_lshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) + inline batch bitwise_lshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept { if (n == 0) { @@ -1816,59 +1810,59 @@ namespace xsimd } } } - + template - batch bitwise_lshift(batch const& lhs, int n, requires_arch) + inline batch bitwise_lshift(batch const& lhs, int n, requires_arch) noexcept { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0<= n && n< size && "index in bounds"); + assert(0 <= n && n < size && "index in bounds"); return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence()); } template = 0> - batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u8(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s8(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u16(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s16(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u32(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s32(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u64(lhs, rhs); } template = 0> - batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s64(lhs, rhs); } @@ -1880,14 +1874,14 @@ namespace xsimd namespace detail { template - batch bitwise_rshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) + inline batch bitwise_rshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept { assert(false && "bitwise_rshift out of bounds"); - return batch{}; + return batch {}; } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1900,7 +1894,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) 
noexcept { if (n == I) { @@ -1913,7 +1907,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1926,7 +1920,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1939,7 +1933,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1952,7 +1946,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1965,7 +1959,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1978,7 +1972,7 @@ namespace xsimd } template = 0> - batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) + inline batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { @@ -1991,7 +1985,7 @@ namespace xsimd } template - batch bitwise_rshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) + inline batch bitwise_rshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept { if (n == 0) { @@ -2003,47 +1997,47 @@ namespace xsimd } } } - + template - batch bitwise_rshift(batch const& lhs, int n, requires_arch) + inline batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0<= n && n< size && "index in bounds"); + assert(0 <= n && n < size && "index in bounds"); return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence()); } - + template = 0> - batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u8(lhs, vnegq_s8(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s8(lhs, vnegq_s8(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u16(lhs, vnegq_s16(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s16(lhs, vnegq_s16(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u32(lhs, vnegq_s32(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s32(lhs, 
vnegq_s32(rhs)); } @@ -2055,7 +2049,7 @@ namespace xsimd *******/ template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { uint8x8_t tmp = vand_u8(vget_low_u8(arg), vget_high_u8(arg)); tmp = vpmin_u8(tmp, tmp); @@ -2065,7 +2059,7 @@ namespace xsimd } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { uint16x4_t tmp = vand_u16(vget_low_u16(arg), vget_high_u16(arg)); tmp = vpmin_u16(tmp, tmp); @@ -2074,14 +2068,14 @@ namespace xsimd } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { uint32x2_t tmp = vand_u32(vget_low_u32(arg), vget_high_u32(arg)); return vget_lane_u32(vpmin_u32(tmp, tmp), 0) != 0; } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); return vget_lane_u64(tmp, 0) != 0; @@ -2092,7 +2086,7 @@ namespace xsimd *******/ template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { uint8x8_t tmp = vorr_u8(vget_low_u8(arg), vget_high_u8(arg)); tmp = vpmax_u8(tmp, tmp); @@ -2102,7 +2096,7 @@ namespace xsimd } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { uint16x4_t tmp = vorr_u16(vget_low_u16(arg), vget_high_u16(arg)); tmp = vpmax_u16(tmp, tmp); @@ -2111,14 +2105,14 @@ namespace xsimd } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { uint32x2_t tmp = vorr_u32(vget_low_u32(arg), vget_high_u32(arg)); return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { uint64x1_t tmp = vorr_u64(vget_low_u64(arg), vget_high_u64(arg)); return bool(vget_lane_u64(tmp, 0)); @@ -2128,18 +2122,19 @@ namespace xsimd * bitwise_cast * ****************/ - #define WRAP_CAST(SUFFIX, TYPE) \ - namespace wrap { \ - inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return ::vreinterpretq_##SUFFIX##_u8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return ::vreinterpretq_##SUFFIX##_s8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return ::vreinterpretq_##SUFFIX##_u16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return ::vreinterpretq_##SUFFIX##_s16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return ::vreinterpretq_##SUFFIX##_u32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return ::vreinterpretq_##SUFFIX##_s32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return ::vreinterpretq_##SUFFIX##_u64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) { return ::vreinterpretq_##SUFFIX##_s64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return ::vreinterpretq_##SUFFIX##_f32(a); } \ - } +#define WRAP_CAST(SUFFIX, TYPE) \ + namespace wrap \ + { \ + inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept { return ::vreinterpretq_##SUFFIX##_u8(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept { return ::vreinterpretq_##SUFFIX##_s8(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept { return 
::vreinterpretq_##SUFFIX##_u16(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept { return ::vreinterpretq_##SUFFIX##_s16(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept { return ::vreinterpretq_##SUFFIX##_u32(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept { return ::vreinterpretq_##SUFFIX##_s32(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept { return ::vreinterpretq_##SUFFIX##_u64(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept { return ::vreinterpretq_##SUFFIX##_s64(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept { return ::vreinterpretq_##SUFFIX##_f32(a); } \ + } WRAP_CAST(u8, uint8x16_t) WRAP_CAST(s8, int8x16_t) @@ -2151,7 +2146,7 @@ namespace xsimd WRAP_CAST(s64, int64x2_t) WRAP_CAST(f32, float32x4_t) - #undef WRAP_CAST +#undef WRAP_CAST namespace detail { @@ -2162,7 +2157,7 @@ namespace xsimd container_type m_func; template - R apply(U rhs) const + R apply(U rhs) const noexcept { using func_type = R (*)(U); auto func = xsimd::detail::get(m_func); @@ -2171,14 +2166,16 @@ namespace xsimd }; template - const bitwise_caster_impl make_bitwise_caster_impl(R (*...arg)(T)) + inline const bitwise_caster_impl make_bitwise_caster_impl(R (*... arg)(T)) noexcept { - return {std::make_tuple(arg...)}; + return { std::make_tuple(arg...) }; } template - struct type_list {}; - + struct type_list + { + }; + template struct bitwise_caster; @@ -2189,7 +2186,7 @@ namespace xsimd container_type m_caster; template - V apply(U rhs) const + V apply(U rhs) const noexcept { using caster_type = bitwise_caster_impl; auto caster = xsimd::detail::get(m_caster); @@ -2199,46 +2196,46 @@ namespace xsimd template using bitwise_caster_t = bitwise_caster, type_list>; - + using neon_bitwise_caster = bitwise_caster_t; + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; } template - batch bitwise_cast(batch const& arg, batch const&, requires_arch) + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { const detail::neon_bitwise_caster caster = { std::make_tuple( - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, - wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, - wrap::vreinterpretq_u8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, - wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64, - wrap::vreinterpretq_s8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, - wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, - wrap::vreinterpretq_u16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, - wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, - wrap::vreinterpretq_s16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, - wrap::vreinterpretq_u32_u32, 
wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, - wrap::vreinterpretq_u32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, - wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, - wrap::vreinterpretq_s32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, - wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, - wrap::vreinterpretq_u64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, - wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, - wrap::vreinterpretq_s64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16, - wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, - wrap::vreinterpretq_f32_f32)) + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, + wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, + wrap::vreinterpretq_u8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, + wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64, + wrap::vreinterpretq_s8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, + wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, + wrap::vreinterpretq_u16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, + wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, + wrap::vreinterpretq_s16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, + wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, + wrap::vreinterpretq_u32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, + wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, + wrap::vreinterpretq_s32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, + wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, + wrap::vreinterpretq_u64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, + 
wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, + wrap::vreinterpretq_s64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16, + wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, + wrap::vreinterpretq_f32_f32)) }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; @@ -2250,14 +2247,14 @@ namespace xsimd *************/ template - batch_bool bool_cast(batch_bool const& arg, requires_arch) + inline batch_bool bool_cast(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(arg); } template - batch_bool bool_cast(batch_bool const& arg, requires_arch) + inline batch_bool bool_cast(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(arg); @@ -2268,7 +2265,7 @@ namespace xsimd **********/ template - batch to_int(const batch& x, requires_arch) + inline batch to_int(const batch& x, requires_arch) noexcept { return vcvtq_s32_f32(x); } @@ -2278,30 +2275,17 @@ namespace xsimd ************/ template - batch to_float(const batch& x, requires_arch) + inline batch to_float(const batch& x, requires_arch) noexcept { return vcvtq_f32_s32(x); } - /************* - * fast_cast * - *************/ - - namespace detail - { - template - batch fast_cast(batch const& in, batch const& out, requires_arch) - { - return bitwise_cast(in, out, A{}); - } - } - /********* * isnan * *********/ template - batch_bool isnan(batch const& arg, requires_arch) + inline batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } diff --git a/third_party/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/arch/xsimd_neon64.hpp index f63254adf..50f172688 100644 --- a/third_party/xsimd/arch/xsimd_neon64.hpp +++ b/third_party/xsimd/arch/xsimd_neon64.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_NEON64_HPP #define XSIMD_NEON64_HPP @@ -21,6 +21,9 @@ namespace xsimd { + template + struct batch_bool_constant; + namespace kernel { using namespace types; @@ -29,27 +32,27 @@ namespace xsimd *******/ template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { return vminvq_u8(arg); } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { return vminvq_u16(arg); } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { return vminvq_u32(arg); } template = 0> - bool all(batch_bool const& arg, requires_arch) + inline bool all(batch_bool const& arg, requires_arch) noexcept { - return all(batch_bool(vreinterpretq_u32_u64(arg)), neon64{}); + return all(batch_bool(vreinterpretq_u32_u64(arg)), neon64 {}); } /******* @@ -57,27 +60,27 @@ namespace xsimd *******/ template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { return vmaxvq_u8(arg); } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { return vmaxvq_u16(arg); } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { return vmaxvq_u32(arg); } template = 0> - bool any(batch_bool const& arg, requires_arch) + inline bool any(batch_bool const& arg, requires_arch) noexcept { - return any(batch_bool(vreinterpretq_u32_u64(arg)), neon64{}); + return any(batch_bool(vreinterpretq_u32_u64(arg)), neon64 {}); } /************* @@ -86,13 +89,13 @@ namespace xsimd // Required to avoid ambiguous call template - batch broadcast(T val, requires_arch) + inline batch broadcast(T val, requires_arch) noexcept { - return broadcast(val, neon{}); + return broadcast(val, neon {}); } template - batch broadcast(double val, requires_arch) + inline batch broadcast(double val, requires_arch) noexcept { return vdupq_n_f64(val); } @@ -102,18 +105,18 @@ namespace xsimd *******/ template - batch set(batch const&, requires_arch, double d0, double d1) + inline batch set(batch const&, requires_arch, double d0, double d1) noexcept { - return float64x2_t{d0, d1}; + return float64x2_t { d0, d1 }; } template - batch_bool set(batch_bool const&, requires_arch, bool b0, bool b1) + inline batch_bool set(batch_bool const&, requires_arch, bool b0, bool b1) noexcept { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(b0 ? -1LL : 0LL), - static_cast(b1 ? -1LL : 0LL)}; + return register_type { static_cast(b0 ? -1LL : 0LL), + static_cast(b1 ? 
-1LL : 0LL) }; } /************* @@ -121,7 +124,7 @@ namespace xsimd *************/ template - batch from_bool(batch_bool const& arg, requires_arch) + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.)))); } @@ -131,15 +134,15 @@ namespace xsimd ********/ template - batch load_aligned(double const* src, convert, requires_arch) + inline batch load_aligned(double const* src, convert, requires_arch) noexcept { return vld1q_f64(src); } template - batch load_unaligned(double const* src, convert, requires_arch) + inline batch load_unaligned(double const* src, convert, requires_arch) noexcept { - return load_aligned(src, convert(), A{}); + return load_aligned(src, convert(), A {}); } /********* @@ -147,15 +150,15 @@ namespace xsimd *********/ template - void store_aligned(double* dst, batch const& src, requires_arch) + inline void store_aligned(double* dst, batch const& src, requires_arch) noexcept { vst1q_f64(dst, src); } template - void store_unaligned(double* dst, batch const& src, requires_arch) + inline void store_unaligned(double* dst, batch const& src, requires_arch) noexcept { - return store_aligned(dst, src, A{}); + return store_aligned(dst, src, A {}); } /**************** @@ -163,20 +166,20 @@ namespace xsimd ****************/ template - batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) + inline batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; const double* buf = reinterpret_cast(mem); float64x2x2_t tmp = vld2q_f64(buf); real_batch real = tmp.val[0], imag = tmp.val[1]; - return batch, A>{real, imag}; + return batch, A> { real, imag }; } template - batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) + inline batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) noexcept { - return load_complex_aligned(mem, cvt, A{}); + return load_complex_aligned(mem, cvt, A {}); } /***************** @@ -184,7 +187,7 @@ namespace xsimd *****************/ template - void store_complex_aligned(std::complex* dst, batch ,A> const& src, requires_arch) + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { float64x2x2_t tmp; tmp.val[0] = src.real(); @@ -194,29 +197,29 @@ namespace xsimd } template - void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { - store_complex_aligned(dst, src, A{}); + store_complex_aligned(dst, src, A {}); } /******* * neg * *******/ - template = 0> - batch neg(batch const& rhs, requires_arch) + template = 0> + inline batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); } - template = 0> - batch neg(batch const& rhs, requires_arch) + template = 0> + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s64(rhs); } template - batch neg(batch const& rhs, requires_arch) + inline batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_f64(rhs); } @@ -226,7 +229,7 @@ namespace xsimd *******/ template - batch add(batch const& lhs, batch const& rhs, requires_arch) + inline batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vaddq_f64(lhs, rhs); } @@ -236,9 +239,9 @@ namespace xsimd ********/ template - batch sadd(batch 
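
load_complex_aligned above leans on vld2q_f64, which deinterleaves {re0, im0, re1, im1} from memory into two registers in one instruction (vst2q_f64 does the reverse on the store side). A hedged sketch of that split, independent of xsimd's batch types (AArch64 only):

#include <arm_neon.h>
#include <complex>

// Deinterleave two std::complex<double> values: tmp.val[0] receives the even
// (real) elements, tmp.val[1] the odd (imaginary) ones.
inline void split_complex(const std::complex<double>* mem,
                          float64x2_t& real, float64x2_t& imag)
{
    const double* buf = reinterpret_cast<const double*>(mem);
    float64x2x2_t tmp = vld2q_f64(buf);
    real = tmp.val[0];
    imag = tmp.val[1];
}
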
const& lhs, batch const& rhs, requires_arch) + inline batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return add(lhs, rhs, neon64{}); + return add(lhs, rhs, neon64 {}); } /******* @@ -246,7 +249,7 @@ namespace xsimd *******/ template - batch sub(batch const& lhs, batch const& rhs, requires_arch) + inline batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vsubq_f64(lhs, rhs); } @@ -256,9 +259,9 @@ namespace xsimd ********/ template - batch ssub(batch const& lhs, batch const& rhs, requires_arch) + inline batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return sub(lhs, rhs, neon64{}); + return sub(lhs, rhs, neon64 {}); } /******* @@ -266,7 +269,7 @@ namespace xsimd *******/ template - batch mul(batch const& lhs, batch const& rhs, requires_arch) + inline batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vmulq_f64(lhs, rhs); } @@ -276,20 +279,20 @@ namespace xsimd *******/ #if defined(XSIMD_FAST_INTEGER_DIVISION) - template = 0> - batch div(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs)); } - template = 0> - batch div(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs)); } #endif template - batch div(batch const& lhs, batch const& rhs, requires_arch) + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vdivq_f64(lhs, rhs); } @@ -298,38 +301,38 @@ namespace xsimd * eq * ******/ - template = 0> - batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } - template = 0> - batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_s64(lhs, rhs); } template - batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_f64(lhs, rhs); } - template = 0> - batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + template = 0> + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } - template = 0> - batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + template = 0> + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } template - batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } @@ -338,20 +341,20 @@ namespace xsimd * lt * ******/ - template = 0> - batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_u64(lhs, rhs); } - template = 0> - batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_s64(lhs, rhs); } template - 
batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_f64(lhs, rhs); } @@ -359,21 +362,21 @@ namespace xsimd /****** * le * ******/ - - template = 0> - batch_bool le(batch const& lhs, batch const& rhs, requires_arch) + + template = 0> + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_u64(lhs, rhs); } - template = 0> - batch_bool le(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_s64(lhs, rhs); } template - batch_bool le(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_f64(lhs, rhs); } @@ -382,20 +385,20 @@ namespace xsimd * gt * ******/ - template = 0> - batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_u64(lhs, rhs); } - template = 0> - batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_s64(lhs, rhs); } template - batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_f64(lhs, rhs); } @@ -404,20 +407,20 @@ namespace xsimd * ge * ******/ - template = 0> - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_u64(lhs, rhs); } - template = 0> - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + template = 0> + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_s64(lhs, rhs); } template - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_f64(lhs, rhs); } @@ -427,14 +430,14 @@ namespace xsimd ***************/ template - batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template - batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vandq_u64(lhs, rhs); } @@ -444,14 +447,14 @@ namespace xsimd **************/ template - batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template - batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vorrq_u64(lhs, rhs); } @@ -461,14 +464,14 @@ namespace xsimd ***************/ template - batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { return 
vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template - batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return veorq_u64(lhs, rhs); } @@ -478,9 +481,9 @@ namespace xsimd *******/ template - batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return bitwise_xor(lhs, rhs, A{}); + return bitwise_xor(lhs, rhs, A {}); } /*************** @@ -488,13 +491,13 @@ namespace xsimd ***************/ template - batch bitwise_not(batch const& rhs, requires_arch) + inline batch bitwise_not(batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); } template - batch_bool bitwise_not(batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_not(batch_bool const& rhs, requires_arch) noexcept { return detail::bitwise_not_u64(rhs); } @@ -504,14 +507,14 @@ namespace xsimd ******************/ template - batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } - + template - batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) + inline batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vbicq_u64(lhs, rhs); } @@ -521,7 +524,7 @@ namespace xsimd *******/ template - batch min(batch const& lhs, batch const& rhs, requires_arch) + inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vminq_f64(lhs, rhs); } @@ -531,7 +534,7 @@ namespace xsimd *******/ template - batch max(batch const& lhs, batch const& rhs, requires_arch) + inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vmaxq_f64(lhs, rhs); } @@ -540,20 +543,20 @@ namespace xsimd * abs * *******/ - template = 0> - batch abs(batch const& rhs, requires_arch) + template = 0> + inline batch abs(batch const& rhs, requires_arch) noexcept { return rhs; } - template = 0> - batch abs(batch const& rhs, requires_arch) + template = 0> + inline batch abs(batch const& rhs, requires_arch) noexcept { return vabsq_s64(rhs); } template - batch abs(batch const& rhs, requires_arch) + inline batch abs(batch const& rhs, requires_arch) noexcept { return vabsq_f64(rhs); } @@ -563,7 +566,7 @@ namespace xsimd ********/ template - batch sqrt(batch const& rhs, requires_arch) + inline batch sqrt(batch const& rhs, requires_arch) noexcept { return vsqrtq_f64(rhs); } @@ -571,16 +574,16 @@ namespace xsimd /******************** * Fused operations * ********************/ - + #ifdef __ARM_FEATURE_FMA template - batch fma(batch const& x, batch const& y, batch const& z, requires_arch) + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f64(z, x, y); } template - batch fms(batch const& x, batch const& y, batch const& z, requires_arch) + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f64(-z, x, y); } @@ -591,55 +594,55 @@ namespace xsimd ********/ template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch 
const& arg, requires_arch) noexcept { return vaddvq_u8(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_s8(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_u16(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_s16(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_u32(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_s32(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_u64(arg); } template = 0> - typename batch::value_type hadd(batch const& arg, requires_arch) + inline typename batch::value_type hadd(batch const& arg, requires_arch) noexcept { return vaddvq_s64(arg); } template - double hadd(batch const& arg, requires_arch) + inline double hadd(batch const& arg, requires_arch) noexcept { return vaddvq_f64(arg); } @@ -649,7 +652,7 @@ namespace xsimd *********/ template - batch haddp(const batch* row, requires_arch) + inline batch haddp(const batch* row, requires_arch) noexcept { return vpaddq_f64(row[0], row[1]); } @@ -659,37 +662,37 @@ namespace xsimd **********/ template - batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) + inline batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { return vbslq_f64(cond, a, b); } template - batch select(batch_bool_constant, b...> const&, - batch const& true_br, - batch const& false_br, - requires_arch) + inline batch select(batch_bool_constant, b...> const&, + batch const& true_br, + batch const& false_br, + requires_arch) noexcept { - return select(batch_bool{b...}, true_br, false_br, neon64{}); + return select(batch_bool { b... 
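
The hadd kernels above are one-liners because AArch64 provides across-vector add reductions (vaddvq_*). Illustrative stand-alone equivalents, not taken from the patch:

#include <arm_neon.h>
#include <cstdint>

inline double sum_f64x2(float64x2_t v)     { return vaddvq_f64(v); }  // v[0] + v[1]
inline std::int32_t sum_s32x4(int32x4_t v) { return vaddvq_s32(v); }  // v[0]+v[1]+v[2]+v[3]
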
}, true_br, false_br, neon64 {}); } /********** * zip_lo * **********/ template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_u64(lhs, rhs); } template = 0> - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_s64(lhs, rhs); } template - batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_f64(lhs, rhs); } @@ -699,19 +702,19 @@ namespace xsimd **********/ template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_u64(lhs, rhs); } template = 0> - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_s64(lhs, rhs); } template - batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_f64(lhs, rhs); } @@ -723,8 +726,8 @@ namespace xsimd namespace detail { template - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, - ::xsimd::detail::index_sequence) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, + ::xsimd::detail::index_sequence) noexcept { if (n == I) { @@ -738,37 +741,37 @@ namespace xsimd } template - batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) + inline batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; - assert(0<= n && n< size && "index in bounds"); + assert(0 <= n && n < size && "index in bounds"); return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence()); } /****************** * bitwise_rshift * ******************/ - + template = 0> - batch bitwise_rshift(batch const& lhs, int n, requires_arch) + inline batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { - return bitwise_rshift(lhs, n, neon{}); + return bitwise_rshift(lhs, n, neon {}); } template = 0> - batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch, A> const& rhs, requires_arch) noexcept { return vshlq_u64(lhs, vnegq_s64(rhs)); } template = 0> - batch bitwise_rshift(batch const& lhs, int n, requires_arch) + inline batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { - return bitwise_rshift(lhs, n, neon{}); + return bitwise_rshift(lhs, n, neon {}); } template = 0> - batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s64(lhs, vnegq_s64(rhs)); } @@ -777,11 +780,12 @@ namespace xsimd * bitwise_cast * ****************/ - #define WRAP_CAST(SUFFIX, TYPE) \ - namespace wrap { \ - inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) { return ::vreinterpretq_f64_##SUFFIX(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return ::vreinterpretq_##SUFFIX##_f64(a); } \ - } +#define WRAP_CAST(SUFFIX, TYPE) \ + namespace wrap \ + { \ + inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept { return 
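
extract_pair above has to funnel a runtime index into vextq_*, whose lane count must be an immediate; the index_sequence recursion simply generates one branch per possible constant. For the two-lane double case the dispatch boils down to something like this sketch (names are illustrative, not xsimd's):

#include <arm_neon.h>
#include <cassert>
#include <cstddef>

// Concatenate {lo, hi} and take two consecutive doubles starting at index n.
inline float64x2_t extract_pair_f64(float64x2_t lo, float64x2_t hi, std::size_t n)
{
    assert(n < 2 && "index in bounds");
    switch (n)
    {
    case 0:
        return lo;                    // vextq_f64(lo, hi, 0) is the identity
    default:
        return vextq_f64(lo, hi, 1);  // {lo[1], hi[0]}
    }
}
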
::vreinterpretq_f64_##SUFFIX(a); } \ + inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept { return ::vreinterpretq_##SUFFIX##_f64(a); } \ + } WRAP_CAST(u8, uint8x16_t) WRAP_CAST(s8, int8x16_t) @@ -793,10 +797,10 @@ namespace xsimd WRAP_CAST(s64, int64x2_t) WRAP_CAST(f32, float32x4_t) - #undef WRAP_CAST +#undef WRAP_CAST template - batch bitwise_cast(batch const& arg, batch const&, requires_arch) + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { using caster_type = detail::bitwise_caster_impl; const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16, + std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16, wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64, wrap::vreinterpretq_f64_f32) }; @@ -832,7 +836,7 @@ namespace xsimd } template - batch bitwise_cast(batch const& arg, batch const&, requires_arch) + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { using caster_type = detail::bitwise_caster_neon64; const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64, + std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64, wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64, wrap::vreinterpretq_f32_f64) }; @@ -851,7 +855,7 @@ namespace xsimd } template - batch bitwise_cast(batch const& arg, batch const&, requires_arch) + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return arg; } @@ -861,14 +865,14 @@ namespace xsimd *************/ template - batch_bool bool_cast(batch_bool const& arg, requires_arch) + inline batch_bool bool_cast(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(arg); } template - batch_bool bool_cast(batch_bool const& arg, requires_arch) + inline batch_bool bool_cast(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(arg); @@ -879,7 +883,7 @@ namespace xsimd **********/ template - batch to_int(const batch& x, requires_arch) + inline batch to_int(const batch& x, requires_arch) noexcept { return vcvtq_s64_f64(x); } @@ -889,7 +893,7 @@ namespace xsimd ************/ template - batch to_float(batch const& x, requires_arch) + inline batch to_float(batch const& x, requires_arch) noexcept { return vcvtq_f64_s64(x); } @@ -899,7 +903,7 @@ namespace xsimd *********/ template - batch_bool isnan(batch const& arg, requires_arch) + inline batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } @@ -907,4 +911,3 @@ namespace xsimd } #endif - diff --git a/third_party/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/arch/xsimd_scalar.hpp index 42adfc994..c9e7e65e7 100644 --- a/third_party/xsimd/arch/xsimd_scalar.hpp +++ b/third_party/xsimd/arch/xsimd_scalar.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed 
under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_SCALAR_HPP #define XSIMD_SCALAR_HPP @@ -17,8 +17,16 @@ #include #include +#ifdef XSIMD_ENABLE_XTL_COMPLEX +#include "xtl/xcomplex.hpp" +#endif + namespace xsimd { + template + class batch; + template + class batch_bool; using std::abs; @@ -26,8 +34,8 @@ namespace xsimd using std::acosh; using std::asin; using std::asinh; - using std::atan2; using std::atan; + using std::atan2; using std::atanh; using std::cbrt; using std::ceil; @@ -36,30 +44,28 @@ namespace xsimd using std::cosh; using std::erf; using std::erfc; - using std::exp2; using std::exp; + using std::exp2; using std::expm1; using std::fabs; using std::fdim; + using std::floor; using std::fmax; using std::fmin; - using std::floor; using std::fmod; using std::hypot; - using std::lgamma; using std::ldexp; + using std::lgamma; + using std::log; using std::log10; using std::log1p; using std::log2; - using std::log; using std::modf; using std::nearbyint; using std::nextafter; using std::proj; using std::remainder; using std::rint; - using std::rint; - using std::round; using std::round; using std::sin; using std::sinh; @@ -76,43 +82,43 @@ namespace xsimd #else // Windows defines catch all templates template - typename std::enable_if::value, bool>::type - isfinite(T var) + inline typename std::enable_if::value, bool>::type + isfinite(T var) noexcept { return std::isfinite(var); } template - typename std::enable_if::value, bool>::type - isfinite(T var) + inline typename std::enable_if::value, bool>::type + isfinite(T var) noexcept { return isfinite(double(var)); } template - typename std::enable_if::value, bool>::type - isinf(T var) + inline typename std::enable_if::value, bool>::type + isinf(T var) noexcept { return std::isinf(var); } template - typename std::enable_if::value, bool>::type - isinf(T var) + inline typename std::enable_if::value, bool>::type + isinf(T var) noexcept { return isinf(double(var)); } template - typename std::enable_if::value, bool>::type - isnan(T var) + inline typename std::enable_if::value, bool>::type + isnan(T var) noexcept { return std::isnan(var); } template - typename std::enable_if::value, bool>::type - isnan(T var) + inline typename std::enable_if::value, bool>::type + isnan(T var) noexcept { return isnan(double(var)); } @@ -120,13 +126,13 @@ namespace xsimd #ifdef XSIMD_ENABLE_NUMPY_COMPLEX template - bool isnan(std::complex var) + inline bool isnan(std::complex var) noexcept { return std::isnan(std::real(var)) || std::isnan(std::imag(var)); } template - bool isinf(std::complex var) + inline bool isinf(std::complex var) noexcept { return std::isinf(std::real(var)) || std::isinf(std::imag(var)); } @@ -134,77 +140,75 @@ namespace xsimd #ifdef XSIMD_ENABLE_XTL_COMPLEX using xtl::abs; - using xtl::norm; - using xtl::proj; + using xtl::acos; + using xtl::acosh; + using xtl::asin; + using xtl::asinh; + using xtl::atan; + using xtl::atanh; + using xtl::cos; + using xtl::cosh; using xtl::exp; using xtl::log; 
using xtl::log10; + using xtl::norm; using xtl::pow; - using xtl::sqrt; + using xtl::proj; using xtl::sin; - using xtl::cos; - using xtl::tan; - using xtl::asin; - using xtl::acos; - using xtl::atan; using xtl::sinh; - using xtl::cosh; + using xtl::sqrt; + using xtl::tan; using xtl::tanh; - using xtl::asinh; - using xtl::acosh; - using xtl::atanh; #endif template ::value>::type> - inline bool is_flint(const T& x) + inline bool is_flint(const T& x) noexcept { return std::isnan(x - x) ? std::numeric_limits::quiet_NaN() : x - std::trunc(x); } template ::value>::type> - inline bool is_odd(const T& x) + inline bool is_odd(const T& x) noexcept { return is_even(x - 1.); } template ::value>::type> - inline bool is_even(const T& x) + inline bool is_even(const T& x) noexcept { return is_flint(x * T(0.5)); } template ::value>::type> - inline T exp10(const T& x) + inline T exp10(const T& x) noexcept { // FIXME: faster alternatives exist return std::pow(T(10), x); } - namespace detail { template - inline C expm1_complex_scalar_impl(const C& val) + inline C expm1_complex_scalar_impl(const C& val) noexcept { using T = typename C::value_type; T isin = std::sin(val.imag()); T rem1 = std::expm1(val.real()); T re = rem1 + T(1.); T si = std::sin(val.imag() * T(0.5)); - return std::complex(rem1 - T(2.) * re *si * si, re * isin); + return std::complex(rem1 - T(2.) * re * si * si, re * isin); } } - template - inline std::complex expm1(const std::complex& val) + inline std::complex expm1(const std::complex& val) noexcept { return detail::expm1_complex_scalar_impl(val); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline xtl::xcomplex expm1(const xtl::xcomplex& val) + inline xtl::xcomplex expm1(const xtl::xcomplex& val) noexcept { return detail::expm1_complex_scalar_impl(val); } @@ -213,7 +217,7 @@ namespace xsimd namespace detail { template - inline C log1p_complex_scalar_impl(const C& val) + inline C log1p_complex_scalar_impl(const C& val) noexcept { using T = typename C::value_type; C u = C(1.) 
+ val; @@ -222,19 +226,19 @@ namespace xsimd } template - inline std::complex log1p(const std::complex& val) + inline std::complex log1p(const std::complex& val) noexcept { return detail::log1p_complex_scalar_impl(val); } template - std::complex log2(const std::complex& val) + inline std::complex log2(const std::complex& val) noexcept { return log(val) / std::log(T(2)); } - template::value>::type> - T sadd(const T& lhs, const T& rhs) + template ::value>::type> + inline T sadd(const T& lhs, const T& rhs) noexcept { if (std::numeric_limits::is_signed) { @@ -246,7 +250,8 @@ namespace xsimd { return std::numeric_limits::lowest(); } - else { + else + { return lhs + rhs; } } @@ -260,12 +265,11 @@ namespace xsimd { return lhs + rhs; } - } } - template::value>::type> - T ssub(const T& lhs, const T& rhs) + template ::value>::type> + inline T ssub(const T& lhs, const T& rhs) noexcept { if (std::numeric_limits::is_signed) { @@ -281,57 +285,61 @@ namespace xsimd { return lhs - rhs; } - } } - namespace detail { - template struct value_type_or_type_helper { - using type = T; - }; - template struct value_type_or_type_helper> { - using type = T; - }; - - template - using value_type_or_type = typename value_type_or_type_helper::type; - - template - inline typename std::enable_if::value, T0>::type - ipow(const T0& x, const T1& n) - { - static_assert(std::is_integral::value, "second argument must be an integer"); - T0 a = x; - T1 b = n; - bool const recip = b < 0; - T0 r(static_cast>(1)); - while (1) + namespace detail + { + template + struct value_type_or_type_helper { - if (b & 1) - { - r *= a; - } - b /= 2; - if (b == 0) + using type = T; + }; + template + struct value_type_or_type_helper> + { + using type = T; + }; + + template + using value_type_or_type = typename value_type_or_type_helper::type; + + template + inline typename std::enable_if::value, T0>::type + ipow(const T0& x, const T1& n) noexcept + { + static_assert(std::is_integral::value, "second argument must be an integer"); + T0 a = x; + T1 b = n; + bool const recip = b < 0; + T0 r(static_cast>(1)); + while (1) { - break; + if (b & 1) + { + r *= a; + } + b /= 2; + if (b == 0) + { + break; + } + a *= a; } - a *= a; + return recip ? 1 / r : r; } - return recip ? 
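
The scalar sadd/ssub above clamp instead of wrapping on overflow. A hypothetical usage snippet, assuming the scalar overloads are reachable through the umbrella header xsimd/xsimd.hpp:

#include <cstdint>
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    // 120 + 100 overflows int8_t; the saturated add clamps to 127 instead.
    std::cout << int(xsimd::sadd(std::int8_t(120), std::int8_t(100))) << '\n';   // 127
    // -120 - 100 underflows; the saturated sub clamps to -128.
    std::cout << int(xsimd::ssub(std::int8_t(-120), std::int8_t(100))) << '\n';  // -128
}
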
1 / r : r; - } } template inline typename std::enable_if::value, T0>::type - pow(const T0& x, const T1& n) + pow(const T0& x, const T1& n) noexcept { - return detail::ipow(x, n); + return detail::ipow(x, n); } template inline auto - pow(const T0& t0, const T1& t1) + pow(const T0& t0, const T1& t1) noexcept -> typename std::enable_if::value && std::is_floating_point::value, decltype(std::pow(t0, t1))>::type { return std::pow(t0, t1); @@ -339,53 +347,54 @@ namespace xsimd template inline typename std::enable_if::value, std::complex>::type - pow(const std::complex& t0, const T1& t1) + pow(const std::complex& t0, const T1& t1) noexcept { return detail::ipow(t0, t1); } template inline typename std::enable_if::value, std::complex>::type - pow(const std::complex& t0, const T1& t1) + pow(const std::complex& t0, const T1& t1) noexcept { - return std::pow(t0, t1); + return std::pow(t0, t1); } template inline auto - pow(const T0& t0, const std::complex& t1) + pow(const T0& t0, const std::complex& t1) noexcept -> typename std::enable_if::value, decltype(std::pow(t0, t1))>::type { - return std::pow(t0, t1); + return std::pow(t0, t1); } template - inline auto bitofsign(T const& x) -> decltype(std::signbit(x)) + inline auto bitofsign(T const& x) noexcept -> decltype(std::signbit(x)) { return std::signbit(x); } template - inline auto signbit(T const& v) -> decltype(bitofsign(v)) + inline auto signbit(T const& v) noexcept -> decltype(bitofsign(v)) { return bitofsign(v); } - inline double sign(bool const &v) + inline double sign(bool const& v) noexcept { return v; } template ::value>::type> - inline T sign(const T& v) + inline T sign(const T& v) noexcept { - return v < T(0) ? T(-1.) : v == T(0) ? T(0.) : T(1.); + return v < T(0) ? T(-1.) : v == T(0) ? T(0.) + : T(1.); } namespace detail { template - inline C sign_complex_scalar_impl(const C& v) + inline C sign_complex_scalar_impl(const C& v) noexcept { using value_type = typename C::value_type; if (v.real()) @@ -400,14 +409,14 @@ namespace xsimd } template - inline std::complex sign(const std::complex& v) + inline std::complex sign(const std::complex& v) noexcept { return detail::sign_complex_scalar_impl(v); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline xtl::xcomplex sign(const xtl::xcomplex& v) + inline xtl::xcomplex sign(const xtl::xcomplex& v) noexcept { return detail::sign_complex_scalar_impl(v); } @@ -415,25 +424,24 @@ namespace xsimd #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline xtl::xcomplex log2(const xtl::xcomplex& val) + inline xtl::xcomplex log2(const xtl::xcomplex& val) noexcept { return log(val) / log(T(2)); } #endif - #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline xtl::xcomplex log1p(const xtl::xcomplex& val) + inline xtl::xcomplex log1p(const xtl::xcomplex& val) noexcept { return detail::log1p_complex_scalar_impl(val); } #endif template - inline auto min(T0 const &self, T1 const &other) -> - typename std::enable_if::value && std::is_scalar::value, - typename std::decay other ? other : self)>::type>::type + inline auto min(T0 const& self, T1 const& other) noexcept + -> typename std::enable_if::value && std::is_scalar::value, + typename std::decay other ? other : self)>::type>::type { return self > other ? other : self; } @@ -441,15 +449,15 @@ namespace xsimd // numpy defines minimum operator on complex using lexical comparison template inline std::complex::type> - min(std::complex const &self, std::complex const &other) + min(std::complex const& self, std::complex const& other) noexcept { return (self.real() < other.real()) ? 
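
pow with an integral exponent dispatches to detail::ipow above, which is plain exponentiation by squaring with a final reciprocal for negative exponents. A minimal scalar sketch of the same loop (illustrative, double base and integer exponent only):

inline double ipow_sketch(double x, long long n)
{
    const bool recip = n < 0;
    unsigned long long e = recip ? 0ULL - static_cast<unsigned long long>(n)
                                 : static_cast<unsigned long long>(n);
    double r = 1.0, a = x;
    while (e != 0)
    {
        if (e & 1)
            r *= a;  // fold in the current squared factor when the exponent bit is set
        a *= a;      // square the base for the next bit
        e >>= 1;
    }
    return recip ? 1.0 / r : r;
}

Following the dispatch shown above, a call such as pow(2.0, -2) takes this integral-exponent path and yields 0.25.
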
(self) : (self.real() == other.real() ? (self.imag() < other.imag() ? self : other) : other); } - template - inline auto max(T0 const &self, T1 const &other) -> - typename std::enable_if::value && std::is_scalar::value, - typename std::decay other ? other : self)>::type>::type + template + inline auto max(T0 const& self, T1 const& other) noexcept + -> typename std::enable_if::value && std::is_scalar::value, + typename std::decay other ? other : self)>::type>::type { return self < other ? other : self; } @@ -457,13 +465,13 @@ namespace xsimd // numpy defines maximum operator on complex using lexical comparison template inline std::complex::type> - max(std::complex const &self, std::complex const &other) + max(std::complex const& self, std::complex const& other) noexcept { return (self.real() > other.real()) ? (self) : (self.real() == other.real() ? (self.imag() > other.imag() ? self : other) : other); } template - inline typename std::enable_if::value, T>::type fma(const T& a, const T& b, const T& c) + inline typename std::enable_if::value, T>::type fma(const T& a, const T& b, const T& c) noexcept { return std::fma(a, b, c); } @@ -471,22 +479,22 @@ namespace xsimd namespace detail { template - inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) + inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept { - return {fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())), - fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag()))}; + return { fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())), + fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) }; } } template - inline std::complex fma(const std::complex& a, const std::complex& b, const std::complex& c) + inline std::complex fma(const std::complex& a, const std::complex& b, const std::complex& c) noexcept { return detail::fma_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline xtl::xcomplex fma(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) + inline xtl::xcomplex fma(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) noexcept { return detail::fma_complex_scalar_impl(a, b, c); } @@ -494,14 +502,15 @@ namespace xsimd namespace detail { -#define XSIMD_HASSINCOS_TRAIT(func) \ - template \ - struct has##func \ - { \ - template static auto get(T* ptr) -> decltype(func(std::declval(), std::declval(), std::declval()), std::true_type{});\ - static std::false_type get(...); \ - static constexpr bool value = decltype(get((S*)nullptr))::value; \ - } +#define XSIMD_HASSINCOS_TRAIT(func) \ + template \ + struct has##func \ + { \ + template \ + static auto get(T* ptr) -> decltype(func(std::declval(), std::declval(), std::declval()), std::true_type {}); \ + static std::false_type get(...); \ + static constexpr bool value = decltype(get((S*)nullptr))::value; \ + } #define XSIMD_HASSINCOS(func, T) has##func::value @@ -512,48 +521,48 @@ namespace xsimd struct generic_sincosf { - template + template typename std::enable_if::type - operator()(float val, T &s, T &c) + operator()(float val, T& s, T& c) { sincosf(val, &s, &c); } - template + template typename std::enable_if::type - operator()(float val, T &s, T &c) + operator()(float val, T& s, T& c) { __sincosf(val, &s, &c); } - template + template typename std::enable_if::type - operator()(float val, T &s, T &c) + operator()(float val, T& s, T& c) { s = std::sin(val); c = std::cos(val); } }; - struct generic_sincos + struct generic_sincos { - template + template typename 
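
XSIMD_HASSINCOS_TRAIT above is the classic expression-SFINAE detection idiom: the overload taking T* is viable only if a free sincosf(val, &s, &c) call is well formed, otherwise the variadic fallback wins and the kernel computes sin and cos separately. A stand-alone sketch of the same trait (hypothetical name, not the macro expansion itself):

#include <type_traits>
#include <utility>

template <class S>
struct has_sincosf_sketch
{
    template <class T>
    static auto get(T*) -> decltype(sincosf(std::declval<T>(), std::declval<T*>(), std::declval<T*>()),
                                    std::true_type {});
    static std::false_type get(...);
    static constexpr bool value = decltype(get(static_cast<S*>(nullptr)))::value;
};

// has_sincosf_sketch<float>::value is true only on C libraries that declare a
// free sincosf (e.g. glibc with _GNU_SOURCE); elsewhere the fallback wins.
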
std::enable_if::type - operator()(double val, T &s, T &c) + operator()(double val, T& s, T& c) { sincos(val, &s, &c); } - template + template typename std::enable_if::type - operator()(double val, T &s, T &c) + operator()(double val, T& s, T& c) { __sincos(val, &s, &c); } - template + template typename std::enable_if::type - operator()(double val, T &s, T &c) + operator()(double val, T& s, T& c) { s = std::sin(val); c = std::cos(val); @@ -564,18 +573,18 @@ namespace xsimd #undef XSIMD_HASSINCOS } - inline void sincos(float val, float&s, float& c) + inline void sincos(float val, float& s, float& c) noexcept { - detail::generic_sincosf{}(val, s, c); + detail::generic_sincosf {}(val, s, c); } - inline void sincos(double val, double&s, double& c) + inline void sincos(double val, double& s, double& c) noexcept { - detail::generic_sincos{}(val, s, c); + detail::generic_sincos {}(val, s, c); } template - inline void sincos(const std::complex& val, std::complex& s, std::complex& c) + inline void sincos(const std::complex& val, std::complex& s, std::complex& c) noexcept { s = std::sin(val); c = std::cos(val); @@ -583,7 +592,7 @@ namespace xsimd #ifdef XSIMD_ENABLE_XTL_COMPLEX template - inline void sincos(const xtl::xcomplex& val, xtl::xcomplex& s, xtl::xcomplex& c) + inline void sincos(const xtl::xcomplex& val, xtl::xcomplex& s, xtl::xcomplex& c) noexcept { s = sin(val); c = cos(val); @@ -591,19 +600,18 @@ namespace xsimd #endif template - inline T frexp(T const& val, int& exp) + inline T frexp(T const& val, int& exp) noexcept { return std::frexp(val, &exp); } template - inline decltype(abs(std::declval())) norm(const T& val) + inline decltype(abs(std::declval())) norm(const T& val) noexcept { auto tmp = abs(val); return tmp * tmp; } - } #endif diff --git a/third_party/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/arch/xsimd_sse2.hpp index 755956d67..d5306f1aa 100644 --- a/third_party/xsimd/arch/xsimd_sse2.hpp +++ b/third_party/xsimd/arch/xsimd_sse2.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_SSE2_HPP #define XSIMD_SSE2_HPP @@ -18,428 +18,632 @@ #include "../types/xsimd_sse2_register.hpp" -namespace xsimd { +namespace xsimd +{ + template + struct batch_bool_constant; - namespace kernel { - using namespace types; + namespace kernel + { + using namespace types; - // abs - template batch abs(batch const& self, requires_arch) { - __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 - return _mm_andnot_pd(sign_mask, self); - } - template batch abs(batch const& self, requires_arch) { - __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 - return _mm_andnot_ps(sign_mask, self); - } + // abs + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm_andnot_pd(sign_mask, self); + } + template + inline batch abs(batch const& self, requires_arch) noexcept + { + __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm_andnot_ps(sign_mask, self); + } - // add - template::value, void>::type> - batch add(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_add_epi8(self, other); - case 2: return _mm_add_epi16(self, other); - case 4: return _mm_add_epi32(self, other); - case 8: return _mm_add_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } + // add + template ::value, void>::type> + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_add_epi8(self, other); + case 2: + return _mm_add_epi16(self, other); + case 4: + return _mm_add_epi32(self, other); + case 8: + return _mm_add_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm_add_ps(self, other); - } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_ps(self, other); + } - template batch add(batch const& self, batch const& other, requires_arch) { - return _mm_add_pd(self, other); - } + template + inline batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_pd(self, other); + } - // all - template bool all(batch_bool const& self, requires_arch) { - return _mm_movemask_ps(self) == 0x0F; - } - template bool all(batch_bool const& self, requires_arch) { - return _mm_movemask_pd(self) == 0x03; - } - template::value, void>::type> - bool all(batch_bool const& self, requires_arch) { - return _mm_movemask_epi8(self) == 0xFFFF; - } + // all + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self) == 0x0F; + } + template + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) == 0x03; + } + template ::value, void>::type> + inline bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) == 0xFFFF; + } - // any - template bool any(batch_bool const& self, requires_arch) { - return _mm_movemask_ps(self) != 0; - } - template bool any(batch_bool const& self, requires_arch) { - return _mm_movemask_pd(self) != 0; - } - template::value, void>::type> - bool any(batch_bool const& self, requires_arch) { - return _mm_movemask_epi8(self) != 0; - } + // any + template + inline bool any(batch_bool const& self, 
requires_arch) noexcept + { + return _mm_movemask_ps(self) != 0; + } + template + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) != 0; + } + template ::value, void>::type> + inline bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) != 0; + } - // bitwise_and - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm_and_ps(self, other); - } - template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_and_ps(self, other); - } - template::value, void>::type> - batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm_and_si128(self, other); - } - template::value, void>::type> - batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_and_si128(self, other); - } + // bitwise_and + template + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } - template batch bitwise_and(batch const& self, batch const& other, requires_arch) { - return _mm_and_pd(self, other); - } + template + batch inline bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } - template batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_and_pd(self, other); - } + template + inline batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } - // bitwise_andnot - template batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm_andnot_ps(self, other); - } + // bitwise_andnot + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_ps(self, other); + } - template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_andnot_ps(self, other); - } - template::value, void>::type> - batch bitwise_andnot(batch const& self, batch const& other, requires_arch) { - return _mm_andnot_si128(self, other); - } - template::value, void>::type> - batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_andnot_si128(self, other); - } + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_si128(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_si128(self, other); + } - template batch bitwise_andnot(batch const& self, batch const& other, 
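
The SSE2 all/any kernels above work because _mm_movemask_ps/_mm_movemask_pd pack one sign bit per lane into an integer, so 0x0F (four floats) or 0x03 (two doubles) means every lane of the mask is set. A hypothetical end-user view of the same thing through the public API, assuming an SSE2 build:

#include "xsimd/xsimd.hpp"

// True if any lane of v is negative; the comparison yields a batch_bool and
// xsimd::any lowers to the movemask test shown above on SSE2.
inline bool has_negative(xsimd::batch<float> v)
{
    return xsimd::any(v < xsimd::batch<float>(0.0f));
}
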
requires_arch) { - return _mm_andnot_pd(self, other); - } - - template batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_andnot_pd(self, other); - } + template + inline batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_pd(self, other); + } - // bitwise_lshift - template::value, void>::type> - batch bitwise_lshift(batch const& self, int32_t other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); - case 2: return _mm_slli_epi16(self, other); - case 4: return _mm_slli_epi32(self, other); - case 8: return _mm_slli_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } + template + inline batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_pd(self, other); + } - // bitwise_not - template batch bitwise_not(batch const& self, requires_arch) { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template batch_bool bitwise_not(batch_bool const& self, requires_arch) { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template::value, void>::type> - batch bitwise_not(batch const& self, requires_arch) { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); - } - template::value, void>::type> - batch_bool bitwise_not(batch_bool const& self, requires_arch) { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); - } - template - batch bitwise_not(batch const &self, requires_arch) { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); - } - template - batch_bool bitwise_not(batch_bool const &self, requires_arch) { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); - } + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); + case 2: + return _mm_slli_epi16(self, other); + case 4: + return _mm_slli_epi32(self, other); + case 8: + return _mm_slli_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - // bitwise_or - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm_or_ps(self, other); - } - template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_or_ps(self, other); - } - template::value, void>::type> - batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm_or_si128(self, other); - } - template::value, void>::type> - batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_or_si128(self, other); - } + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return 
_mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } - template batch bitwise_or(batch const& self, batch const& other, requires_arch) { - return _mm_or_pd(self, other); - } + // bitwise_or + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } - template batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_or_pd(self, other); - } + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } - // bitwise_rshift - template::value, void>::type> - batch bitwise_rshift(batch const& self, int32_t other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: { - __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); - __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); - __m128i res = _mm_srai_epi16(self, other); - return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); - } - case 2: return _mm_srai_epi16(self, other); - case 4: return _mm_srai_epi32(self, other); - case 8: { - // from https://github.com/samyvilar/vect/blob/master/vect_128.h - return _mm_or_si128( - _mm_srli_epi64(self, other), - _mm_slli_epi64( - _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), - 64 - other)); - } - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - else { - switch(sizeof(T)) { - case 1: return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); - case 2: return _mm_srli_epi16(self, other); - case 4: return _mm_srli_epi32(self, other); - case 8: return _mm_srli_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } - // bitwise_xor - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm_xor_ps(self, other); - } - template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_xor_ps(self, other); - } - template::value, void>::type> - batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm_xor_si128(self, other); - } - template batch bitwise_xor(batch const& self, batch const& other, requires_arch) { - return _mm_xor_pd(self, other); - } - template batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_xor_pd(self, other); - } - template::value, void>::type> - batch 
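
SSE2 has no 64-bit arithmetic right shift, so the signed case-8 branch above builds one: shift logically, then OR in sign bits produced by duplicating the high 32-bit halves and arithmetic-shifting them. A scalar model of that recipe (assumes 0 < n < 64; illustrative, not the vector code itself):

#include <cstdint>

inline std::int64_t sra64_model(std::int64_t v, int n)
{
    const std::uint64_t logical = static_cast<std::uint64_t>(v) >> n;  // _mm_srli_epi64
    const std::uint64_t sign = v < 0 ? ~std::uint64_t(0) : 0;          // sign replicated, as the _mm_srai_epi32-by-32 step does per half
    return static_cast<std::int64_t>(logical | (sign << (64 - n)));    // _mm_slli_epi64 by (64 - n), then OR
}
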
bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_xor_si128(self, other); - } + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + { + __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); + __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); + __m128i res = _mm_srai_epi16(self, other); + return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); + } + case 2: + return _mm_srai_epi16(self, other); + case 4: + return _mm_srai_epi32(self, other); + case 8: + { + // from https://github.com/samyvilar/vect/blob/master/vect_128.h + return _mm_or_si128( + _mm_srli_epi64(self, other), + _mm_slli_epi64( + _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), + 64 - other)); + } + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); + case 2: + return _mm_srli_epi16(self, other); + case 4: + return _mm_srli_epi32(self, other); + case 8: + return _mm_srli_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } - // bitwise_cast - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castsi128_ps(self); - } - template::type>::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return batch(self.data); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castps_si128(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castsi128_pd(self); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castps_pd(self); - } - template - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castpd_ps(self); - } - template::value, void>::type> - batch bitwise_cast(batch const& self, batch const &, requires_arch) { - return _mm_castpd_si128(self); - } + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } - // bool_cast - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm_castps_si128(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return 
_mm_castsi128_ps(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm_castpd_si128(self); - } - template batch_bool bool_cast(batch_bool const& self, requires_arch) { - return _mm_castsi128_pd(self); - } + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_ps(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_si128(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_pd(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_pd(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_si128(self); + } - // broadcast - template batch broadcast(float val, requires_arch) { - return _mm_set1_ps(val); - } - template::value, void>::type> - batch broadcast(T val, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_set1_epi8(val); - case 2: return _mm_set1_epi16(val); - case 4: return _mm_set1_epi32(val); - case 8: return _mm_set1_epi64x(val); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch broadcast(double val, requires_arch) { - return _mm_set1_pd(val); - } + // bool_cast + template + batch_bool inline bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm_castps_si128(self); + } + template + batch_bool inline bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm_castsi128_ps(self); + } + template + batch_bool inline bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm_castpd_si128(self); + } + template + batch_bool inline bool_cast(batch_bool const& self, requires_arch) noexcept + { + return _mm_castsi128_pd(self); + } - // store_complex - namespace detail - { - // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned - // complex_low - template batch complex_low(batch, A> const& self, requires_arch) { - return _mm_unpacklo_ps(self.real(), self.imag()); - } - // complex_high - template batch complex_high(batch, A> const& self, requires_arch) { - return _mm_unpackhi_ps(self.real(), self.imag()); - } - template batch complex_low(batch, A> const& self, requires_arch) { - return _mm_unpacklo_pd(self.real(), self.imag()); - } - template batch complex_high(batch, A> const& self, requires_arch) { - return _mm_unpackhi_pd(self.real(), self.imag()); - } - } + // broadcast + template + batch inline broadcast(float val, requires_arch) noexcept + { + return _mm_set1_ps(val); + } + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_set1_epi8(val); + case 2: + return _mm_set1_epi16(val); + case 4: + return _mm_set1_epi32(val); + case 8: + return _mm_set1_epi64x(val); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + 
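// Standalone sketch (not part of this patch): the integer broadcast kernel above
// picks the _mm_set1_* intrinsic purely from sizeof(T). A minimal check of the
// 32-bit case, assuming only <emmintrin.h> and an SSE2-capable host:
#include <emmintrin.h>
#include <cassert>
#include <cstdint>

int main()
{
    int32_t value = -42;
    __m128i v = _mm_set1_epi32(value); // replicate the scalar into all four lanes
    alignas(16) int32_t lanes[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(lanes), v);
    for (int32_t lane : lanes)
        assert(lane == value);
    return 0;
}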
template + inline batch broadcast(double val, requires_arch) noexcept + { + return _mm_set1_pd(val); + } - // div - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm_div_ps(self, other); - } - template batch div(batch const& self, batch const& other, requires_arch) { - return _mm_div_pd(self, other); - } + // store_complex + namespace detail + { + // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_ps(self.real(), self.imag()); + } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_ps(self.real(), self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_pd(self.real(), self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_pd(self.real(), self.imag()); + } + } - // convert - namespace detail { - template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm_cvtepi32_ps(self); - } - template batch fast_cast(batch const& self, batch const&, requires_arch) { - return _mm_cvttps_epi32(self); - } - } + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_pd(self, other); + } - // eq - template batch_bool eq(batch const& self, batch const& other, requires_arch) { - return _mm_cmpeq_ps(self, other); - } - template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); - } - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_cmpeq_epi8(self, other); - case 2: return _mm_cmpeq_epi16(self, other); - case 4: return _mm_cmpeq_epi32(self, other); - case 8: { - __m128i tmp1 = _mm_cmpeq_epi32(self, other); - __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); - __m128i tmp3 = _mm_and_si128(tmp1, tmp2); - __m128i tmp4 = _mm_srai_epi32(tmp3, 31); - return _mm_shuffle_epi32(tmp4, 0xF5); - } - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template::value, void>::type> - batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return eq(batch(self.data), batch(other.data)); - } - template batch_bool eq(batch const& self, batch const& other, requires_arch) { - return _mm_cmpeq_pd(self, other); - } - template batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); - } + // convert + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvtepi32_ps(self); + } + + template + inline batch fast_cast(batch const& v, batch const&, requires_arch) noexcept + { + // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse + __m128i msk_lo = _mm_set1_epi32(0xFFFF); + __m128 cnst65536f = _mm_set1_ps(65536.0f); + + __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 
lowest significant bits of self */ + __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */ + __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */ + __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */ + v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */ + return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvttps_epi32(self); + } - // ge - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm_cmpge_ps(self, other); - } - template batch_bool ge(batch const& self, batch const& other, requires_arch) { - return _mm_cmpge_pd(self, other); - } - // gt + } - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm_cmpgt_ps(self, other); - } - template::value, void>::type> - batch_bool gt(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_cmpgt_epi8(self, other); - case 2: return _mm_cmpgt_epi16(self, other); - case 4: return _mm_cmpgt_epi32(self, other); - default: return gt(self, other, generic{}); - } - } - else { - return gt(self, other, generic{}); - } - } + // eq + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_ps(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); + } + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_cmpeq_epi8(self, other); + case 2: + return _mm_cmpeq_epi16(self, other); + case 4: + return _mm_cmpeq_epi32(self, other); + case 8: + { + __m128i tmp1 = _mm_cmpeq_epi32(self, other); + __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); + __m128i tmp3 = _mm_and_si128(tmp1, tmp2); + __m128i tmp4 = _mm_srai_epi32(tmp3, 31); + return _mm_shuffle_epi32(tmp4, 0xF5); + } + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template ::value, void>::type> + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return eq(batch(self.data), batch(other.data)); + } + template + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_pd(self, other); + } + template + inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); + } - template batch_bool gt(batch const& self, batch const& other, requires_arch) { - return _mm_cmpgt_pd(self, other); - } - - // hadd - template float hadd(batch const& self, requires_arch) { - __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); - __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); - return _mm_cvtss_f32(tmp1); - } - // TODO: move this in xsimd_generic - namespace detail - { - template::value, void>::type> - T hadd_default(batch const& self, requires_arch) { - alignas(A::alignment()) T buffer[batch::size]; - self.store_aligned(buffer); - T res = 0; - for (T val : buffer) + // ge + template + inline batch_bool ge(batch const& self, batch const& other, 
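// Illustrative scalar sketch (not from xsimd): the uint32 -> float fast_cast above
// converts the low and high 16-bit halves separately (both are exact in float) and
// recombines them as hi * 65536 + lo, so only the final add rounds. The same
// decomposition in scalar code, assuming round-to-nearest:
#include <cassert>
#include <cstdint>

static float uint32_to_float_split(uint32_t v)
{
    float lo = static_cast<float>(v & 0xFFFFu); // exact: fits in 16 bits
    float hi = static_cast<float>(v >> 16);     // exact: fits in 16 bits
    return hi * 65536.0f + lo;                  // single rounding in the add
}

int main()
{
    uint32_t samples[] = { 0u, 1u, 65535u, 65536u, 0x80000000u, 0xFFFFFFFFu };
    for (uint32_t v : samples)
        assert(uint32_to_float_split(v) == static_cast<float>(v));
    return 0;
}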
requires_arch) noexcept { - res += val; + return _mm_cmpge_ps(self, other); } - return res; - } - } - template::value, void>::type> - T hadd(batch const& self, requires_arch) { - switch(sizeof(T)) { - case 4: { + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_pd(self, other); + } + + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_ps(self, other); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_cmpgt_epi8(self, other); + case 2: + return _mm_cmpgt_epi16(self, other); + case 4: + return _mm_cmpgt_epi32(self, other); + default: + return gt(self, other, generic {}); + } + } + else + { + return gt(self, other, generic {}); + } + } + + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_pd(self, other); + } + + // hadd + template + inline float hadd(batch const& self, requires_arch) noexcept + { + __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); + __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); + return _mm_cvtss_f32(tmp1); + } + // TODO: move this in xsimd_generic + namespace detail + { + template ::value, void>::type> + inline T hadd_default(batch const& self, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + self.store_aligned(buffer); + T res = 0; + for (T val : buffer) + { + res += val; + } + return res; + } + } + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 4: + { __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); __m128i tmp2 = _mm_add_epi32(self, tmp1); __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); return _mm_cvtsi128_si32(tmp4); - } - case 8: { + } + case 8: + { __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); __m128i tmp2 = _mm_add_epi64(self, tmp1); #if defined(__x86_64__) @@ -451,479 +655,656 @@ namespace xsimd { std::memcpy(&i, &m, sizeof(i)); return i; #endif + } + default: + return detail::hadd_default(self, A {}); + } + } + template + inline double hadd(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); } - default: return detail::hadd_default(self, A{}); - } - } - template - double hadd(batch const &self, requires_arch) { - return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); - } - - // haddp - template batch haddp(batch const* row, requires_arch) { - __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); - __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); - __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(row[2], row[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); - } - template - batch haddp(batch const *row, requires_arch) { - return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), - _mm_unpackhi_pd(row[0], row[1])); - } - - // isnan - template batch_bool isnan(batch const& self, requires_arch) { - return _mm_cmpunord_ps(self, self); - } - template batch_bool isnan(batch const& self, requires_arch) { - return _mm_cmpunord_pd(self, self); - } - - // load_aligned - template batch load_aligned(float const* mem, convert, 
requires_arch) { - return _mm_load_ps(mem); - } - template::value, void>::type> - batch load_aligned(T const* mem, convert, requires_arch) { - return _mm_load_si128((__m128i const*)mem); - } - template batch load_aligned(double const* mem, convert, requires_arch) { - return _mm_load_pd(mem); - } - // load_unaligned - template batch load_unaligned(float const* mem, convert, requires_arch){ - return _mm_loadu_ps(mem); - } - template::value, void>::type> - batch load_unaligned(T const* mem, convert, requires_arch) { - return _mm_loadu_si128((__m128i const*)mem); - } - template batch load_unaligned(double const* mem, convert, requires_arch){ - return _mm_loadu_pd(mem); - } + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); + __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); + __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(row[2], row[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), + _mm_unpackhi_pd(row[0], row[1])); + } - // load_complex - namespace detail - { - // Redefine these methods in the SSE-based archs if required - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - return {_mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))}; - } - template batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) { - return {_mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1))}; + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_ps(self, self); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_pd(self, self); } - } - // le - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm_cmple_ps(self, other); - } - template batch_bool le(batch const& self, batch const& other, requires_arch) { - return _mm_cmple_pd(self, other); - } + // load_aligned + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_load_ps(mem); + } + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_load_si128((__m128i const*)mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_load_pd(mem); + } - // lt - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm_cmplt_ps(self, other); - } - template::value, void>::type> - batch_bool lt(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_cmplt_epi8(self, other); - case 2: return _mm_cmplt_epi16(self, other); - case 4: return _mm_cmplt_epi32(self, other); - case 8: { - __m128i tmp1 = _mm_sub_epi64(self, other); - __m128i tmp2 = _mm_xor_si128(self, other); - __m128i tmp3 = _mm_andnot_si128(other, self); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - default: assert(false && "unsupported arch/op combination"); 
return {}; - } - } - else { - switch(sizeof(T)) { - case 1: return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); - case 2: return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); - case 4: return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); - case 8: { - auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); - auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); - __m128i tmp1 = _mm_sub_epi64(xself, xother); - __m128i tmp2 = _mm_xor_si128(xself, xother); - __m128i tmp3 = _mm_andnot_si128(xother, xself); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - } - template batch_bool lt(batch const& self, batch const& other, requires_arch) { - return _mm_cmplt_pd(self, other); - } + // load_unaligned + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_ps(mem); + } + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_si128((__m128i const*)mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_pd(mem); + } - // max - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm_max_ps(self, other); - } - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - return select(self > other, self, other); - } - template batch max(batch const& self, batch const& other, requires_arch) { - return _mm_max_pd(self, other); - } + // load_complex + namespace detail + { + // Redefine these methods in the SSE-based archs if required + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + } + } - // min - template batch min(batch const& self, batch const& other, requires_arch) { - return _mm_min_ps(self, other); - } - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - return select(self <= other, self, other); - } - template batch min(batch const& self, batch const& other, requires_arch) { - return _mm_min_pd(self, other); - } + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_ps(self, other); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_pd(self, other); + } - // mul - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm_mul_ps(self, other); - } - template batch mul(batch const& self, batch const& other, requires_arch) { - return _mm_mul_pd(self, other); - } 
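// Illustrative scalar sketch (not part of this patch): the unsigned lt kernels in
// this hunk XOR each operand with the minimum value of the matching signed type
// (a lone sign bit), so the signed compare intrinsics yield the unsigned ordering.
// The identity for 32-bit lanes, assuming two's-complement representation:
#include <cassert>
#include <cstdint>

static bool unsigned_lt_via_signed(uint32_t a, uint32_t b)
{
    int32_t sa = static_cast<int32_t>(a ^ 0x80000000u); // flip the sign bit
    int32_t sb = static_cast<int32_t>(b ^ 0x80000000u);
    return sa < sb; // matches a < b on the original unsigned values
}

int main()
{
    uint32_t samples[] = { 0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu };
    for (uint32_t a : samples)
        for (uint32_t b : samples)
            assert(unsigned_lt_via_signed(a, b) == (a < b));
    return 0;
}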
+ // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_ps(self, other); + } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_cmplt_epi8(self, other); + case 2: + return _mm_cmplt_epi16(self, other); + case 4: + return _mm_cmplt_epi32(self, other); + case 8: + { + __m128i tmp1 = _mm_sub_epi64(self, other); + __m128i tmp2 = _mm_xor_si128(self, other); + __m128i tmp3 = _mm_andnot_si128(other, self); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); + case 2: + return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); + case 4: + return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); + case 8: + { + auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); + auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); + __m128i tmp1 = _mm_sub_epi64(xself, xother); + __m128i tmp2 = _mm_xor_si128(xself, xother); + __m128i tmp3 = _mm_andnot_si128(xother, xself); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } - // neg - template::value, void>::type> - batch neg(batch const& self, requires_arch) { - return 0 - self; - } - template batch neg(batch const& self, requires_arch) { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); - } + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_pd(self, other); + } + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_ps(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_pd(self, other); + } - template - batch neg(batch const &self, requires_arch) { - return _mm_xor_pd( - self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); - } + // min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_ps(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_pd(self, other); + } - // neq - template batch_bool neq(batch const& self, batch const& other, 
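// Illustrative scalar sketch (not from xsimd): SSE2 has no 64-bit signed compare,
// so the case-8 branch above reads the result from the sign bit of
// (a & ~b) | (~(a ^ b) & (a - b)): when the signs differ, a & ~b decides; when they
// agree, a - b cannot overflow and its sign decides. The subtraction is done in
// unsigned arithmetic here so the wrap-around stays well defined:
#include <cassert>
#include <cstdint>

static bool signed_lt_via_sign_bit(int64_t a, int64_t b)
{
    uint64_t ua = static_cast<uint64_t>(a);
    uint64_t ub = static_cast<uint64_t>(b);
    uint64_t bits = (ua & ~ub) | (~(ua ^ ub) & (ua - ub));
    return (bits >> 63) != 0; // sign bit of the combined expression
}

int main()
{
    int64_t samples[] = { INT64_MIN, -2, -1, 0, 1, 2, INT64_MAX };
    for (int64_t a : samples)
        for (int64_t b : samples)
            assert(signed_lt_via_sign_bit(a, b) == (a < b));
    return 0;
}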
requires_arch) { - return _mm_cmpneq_ps(self, other); - } - template::value, void>::type> - batch_bool neq(batch const& self, batch const& other, requires_arch) { - return ~(self == other); - } - template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_cmpneq_ps(self, other); - } - template::value, void>::type> - batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return ~(self == other); - } + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_pd(self, other); + } + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + } - template batch_bool neq(batch const& self, batch const& other, requires_arch) { - return _mm_cmpneq_pd(self, other); - } - template batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) { - return _mm_cmpneq_pd(self, other); - } + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd( + self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); + } - // select - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); - } + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_ps(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_cmpneq_ps(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_pd(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_cmpneq_pd(self, other); + } - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); - } - template::value, void>::type> - batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { - return select(batch_bool{Values...}, true_br, false_br, sse2{}); - } - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); - } + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); + } - // sqrt - template batch sqrt(batch const& val, requires_arch) { - return _mm_sqrt_ps(val); - } - 
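// Standalone check (not part of this patch): the SSE2 select kernels above blend
// without branches as (cond & true_br) | (~cond & false_br), relying on cond being
// a full-width lane mask (all ones or all zeros per lane). The same pattern on
// 32-bit integer lanes:
#include <emmintrin.h>
#include <cassert>
#include <cstdint>

int main()
{
    __m128i true_br = _mm_setr_epi32(1, 2, 3, 4);
    __m128i false_br = _mm_setr_epi32(-1, -2, -3, -4);
    __m128i cond = _mm_setr_epi32(-1, 0, -1, 0); // keep lanes 0 and 2 from true_br
    __m128i blended = _mm_or_si128(_mm_and_si128(cond, true_br),
                                   _mm_andnot_si128(cond, false_br)); // ~cond & false_br
    alignas(16) int32_t out[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(out), blended);
    assert(out[0] == 1 && out[1] == -2 && out[2] == 3 && out[3] == -4);
    return 0;
}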
template batch sqrt(batch const& val, requires_arch) { - return _mm_sqrt_pd(val); - } - // sadd - template batch sadd(batch const& self, batch const& other, requires_arch) { - return _mm_add_ps(self, other); // no saturated arithmetic on floating point numbers - } - // TODO: move this in xsimd_generic - namespace detail - { - template::value, void>::type> - batch sadd_default(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - auto mask = (other >> (8 * sizeof(T) - 1)); - auto self_pos_branch = min(std::numeric_limits::max() - other, self); - auto self_neg_branch = max(std::numeric_limits::min() - other, self); - return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); - } - else { - const auto diffmax = std::numeric_limits::max() - self; - const auto mindiff = min(diffmax, other); - return self + mindiff; - } - } - } + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, sse2 {}); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); + } + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_pd(val); + } + // sadd + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_ps(self, other); // no saturated arithmetic on floating point numbers + } + // TODO: move this in xsimd_generic + namespace detail + { + template ::value, void>::type> + inline batch sadd_default(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = (other >> (8 * sizeof(T) - 1)); + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + } - template::value, void>::type> - batch sadd(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_adds_epi8(self, other); - case 2: return _mm_adds_epi16(self, other); - default: return detail::sadd_default(self, other, A{}); + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_adds_epi8(self, other); + case 2: + return _mm_adds_epi16(self, other); + default: + return detail::sadd_default(self, other, A {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_adds_epu8(self, other); + case 2: + return _mm_adds_epu16(self, other); + default: + return detail::sadd_default(self, other, A {}); + } + } } - } - else { - switch(sizeof(T)) { - case 1: 
return _mm_adds_epu8(self, other); - case 2: return _mm_adds_epu16(self, other); - default: return detail::sadd_default(self, other, A{}); + template + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_pd(self, other); // no saturated arithmetic on floating point numbers } - } - } - template batch sadd(batch const& self, batch const& other, requires_arch) { - return _mm_add_pd(self, other); // no saturated arithmetic on floating point numbers - } - // set - template - batch set(batch const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_ps(values...); - } + // set + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_ps(values...); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1) noexcept + { + return _mm_set_epi64x(v1, v0); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm_setr_epi32(v0, v1, v2, v3); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + inline batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1) { - return _mm_set_epi64x(v1, v0); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) { - return _mm_setr_epi32(v0, v1, v2, v3); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { - return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); - } - template::value, void>::type> - batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); - } + template + inline batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_pd(values...); + } - template - batch set(batch const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_pd(values...); - } + template ::value, void>::type> + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } - template::value, void>::type> - batch_bool set(batch_bool const&, requires_arch, Values... values) { - return set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data; - } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } - template - batch_bool set(batch_bool const&, requires_arch, Values... 
values) { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_ps(set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data); - } + template + inline batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } - template - batch_bool set(batch_bool const&, requires_arch, Values... values) { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_pd(set(batch(), A{}, static_cast(values ? -1LL : 0LL )...).data); - } + // ssub + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_ps(self, other); // no saturated arithmetic on floating point numbers + } + // TODO: move this in xsimd_generic + namespace detail + { + template ::value, void>::type> + inline batch ssub_default(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + return sadd(self, -other); + } + else + { + const auto diff = min(self, other); + return self - diff; + } + } + } - // ssub - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm_sub_ps(self, other); // no saturated arithmetic on floating point numbers - } - // TODO: move this in xsimd_generic - namespace detail - { - template::value, void>::type> - batch ssub_default(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - return sadd(self, -other); - } - else { - const auto diff = min(self, other); - return self - diff; - } - } - } + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_subs_epi8(self, other); + case 2: + return _mm_subs_epi16(self, other); + default: + return detail::ssub_default(self, other, A {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_subs_epu8(self, other); + case 2: + return _mm_subs_epu16(self, other); + default: + return detail::ssub_default(self, other, A {}); + } + } + } + template + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_pd(self, other); // no saturated arithmetic on floating point numbers + } - template::value, void>::type> - batch ssub(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_subs_epi8(self, other); - case 2: return _mm_subs_epi16(self, other); - default: return detail::ssub_default(self, other, A{}); + // store_aligned + template + inline void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_ps(mem, self); } - } - else { - switch(sizeof(T)) { - case 1: return _mm_subs_epu8(self, other); - case 2: return _mm_subs_epu16(self, other); - default: return detail::ssub_default(self, other, A{}); + template ::value, void>::type> + inline void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template ::value, void>::type> + inline void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template + inline void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return 
_mm_store_pd(mem, self); } - } - } - template batch ssub(batch const& self, batch const& other, requires_arch) { - return _mm_sub_pd(self, other); // no saturated arithmetic on floating point numbers - } - - // store_aligned - template void store_aligned(float *mem, batch const& self, requires_arch) { - return _mm_store_ps(mem, self); - } - template::value, void>::type> - void store_aligned(T *mem, batch const& self, requires_arch) { - return _mm_store_si128((__m128i *)mem, self); - } - template::value, void>::type> - void store_aligned(T *mem, batch_bool const& self, requires_arch) { - return _mm_store_si128((__m128i *)mem, self); - } - template void store_aligned(double *mem, batch const& self, requires_arch) { - return _mm_store_pd(mem, self); - } - // store_unaligned - template void store_unaligned(float *mem, batch const& self, requires_arch) { - return _mm_storeu_ps(mem, self); - } - template::value, void>::type> - void store_unaligned(T *mem, batch const& self, requires_arch) { - return _mm_storeu_si128((__m128i *)mem, self); - } - template::value, void>::type> - void store_unaligned(T *mem, batch_bool const& self, requires_arch) { - return _mm_storeu_si128((__m128i *)mem, self); - } - template void store_unaligned(double *mem, batch const& self, requires_arch) { - return _mm_storeu_pd(mem, self); - } + // store_unaligned + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_ps(mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_pd(mem, self); + } - // sub - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm_sub_ps(self, other); - } - template::value, void>::type> - batch sub(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_sub_epi8(self, other); - case 2: return _mm_sub_epi16(self, other); - case 4: return _mm_sub_epi32(self, other); - case 8: return _mm_sub_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch sub(batch const& self, batch const& other, requires_arch) { - return _mm_sub_pd(self, other); - } + // sub + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_ps(self, other); + } + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_sub_epi8(self, other); + case 2: + return _mm_sub_epi16(self, other); + case 4: + return _mm_sub_epi32(self, other); + case 8: + return _mm_sub_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_pd(self, other); + } - // to_float - template - batch to_float(batch const& self, requires_arch) { - return _mm_cvtepi32_ps(self); - } - template - batch to_float(batch const& self, requires_arch) { - // FIXME: call _mm_cvtepi64_pd - alignas(A::alignment()) int64_t 
buffer[batch::size]; - self.store_aligned(&buffer[0]); - return {(double)buffer[0], (double)buffer[1]}; - } + // to_float + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + return _mm_cvtepi32_ps(self); + } + template + inline batch to_float(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvtepi64_pd + alignas(A::alignment()) int64_t buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { (double)buffer[0], (double)buffer[1] }; + } - // to_int - template - batch to_int(batch const& self, requires_arch) { - return _mm_cvttps_epi32(self); - } + // to_int + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + return _mm_cvttps_epi32(self); + } - template - batch to_int(batch const& self, requires_arch) { - // FIXME: call _mm_cvttpd_epi64 - alignas(A::alignment()) double buffer[batch::size]; - self.store_aligned(&buffer[0]); - return {(int64_t)buffer[0], (int64_t)buffer[1]}; - } + template + inline batch to_int(batch const& self, requires_arch) noexcept + { + // FIXME: call _mm_cvttpd_epi64 + alignas(A::alignment()) double buffer[batch::size]; + self.store_aligned(&buffer[0]); + return { (int64_t)buffer[0], (int64_t)buffer[1] }; + } - // zip_hi - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm_unpackhi_ps(self, other); - } - template::value, void>::type> - batch zip_hi(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_unpackhi_epi8(self, other); - case 2: return _mm_unpackhi_epi16(self, other); - case 4: return _mm_unpackhi_epi32(self, other); - case 8: return _mm_unpackhi_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch zip_hi(batch const& self, batch const& other, requires_arch) { - return _mm_unpackhi_pd(self, other); - } + // zip_hi + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_ps(self, other); + } + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_unpackhi_epi8(self, other); + case 2: + return _mm_unpackhi_epi16(self, other); + case 4: + return _mm_unpackhi_epi32(self, other); + case 8: + return _mm_unpackhi_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_pd(self, other); + } - // zip_lo - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm_unpacklo_ps(self, other); - } - template::value, void>::type> - batch zip_lo(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_unpacklo_epi8(self, other); - case 2: return _mm_unpacklo_epi16(self, other); - case 4: return _mm_unpacklo_epi32(self, other); - case 8: return _mm_unpacklo_epi64(self, other); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } - template batch zip_lo(batch const& self, batch const& other, requires_arch) { - return _mm_unpacklo_pd(self, other); + // zip_lo + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_ps(self, other); + } + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, 
requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_unpacklo_epi8(self, other); + case 2: + return _mm_unpacklo_epi16(self, other); + case 4: + return _mm_unpacklo_epi32(self, other); + case 8: + return _mm_unpacklo_epi64(self, other); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_pd(self, other); + } } - } } #endif - - diff --git a/third_party/xsimd/arch/xsimd_sse3.hpp b/third_party/xsimd/arch/xsimd_sse3.hpp index 5c521e9c0..bf7a8df74 100644 --- a/third_party/xsimd/arch/xsimd_sse3.hpp +++ b/third_party/xsimd/arch/xsimd_sse3.hpp @@ -1,56 +1,64 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_SSE3_HPP #define XSIMD_SSE3_HPP -#include #include "../types/xsimd_sse3_register.hpp" +#include -namespace xsimd { +namespace xsimd +{ - namespace kernel { - using namespace types; + namespace kernel + { + using namespace types; - // load_unaligned - template::value, void>::type> - batch load_unaligned(T const* mem, convert, requires_arch) { - return _mm_lddqu_si128((__m128i const*)mem); - } + // load_unaligned + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_lddqu_si128((__m128i const*)mem); + } - // hadd - template float hadd(batch const& self, requires_arch) { - __m128 tmp0 = _mm_hadd_ps(self, self); - __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); - return _mm_cvtss_f32(tmp1); - } - template - double hadd(batch const &self, requires_arch) { - __m128d tmp0 = _mm_hadd_pd(self, self); - return _mm_cvtsd_f64(tmp0); - } + // hadd + template + inline float hadd(batch const& self, requires_arch) noexcept + { + __m128 tmp0 = _mm_hadd_ps(self, self); + __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); + return _mm_cvtss_f32(tmp1); + } + template + inline double hadd(batch const& self, requires_arch) noexcept + { + __m128d tmp0 = _mm_hadd_pd(self, self); + return _mm_cvtsd_f64(tmp0); + } - // haddp - template batch haddp(batch const* row, requires_arch) { - return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]), - _mm_hadd_ps(row[2], row[3])); - } - template - batch haddp(batch const *row, requires_arch) { - return _mm_hadd_pd(row[0], row[1]); - } + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]), + _mm_hadd_ps(row[2], row[3])); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_hadd_pd(row[0], row[1]); + } - } + } } #endif - diff --git a/third_party/xsimd/arch/xsimd_sse4_1.hpp b/third_party/xsimd/arch/xsimd_sse4_1.hpp index dbd35ff3f..6d5fe6ae2 
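// Standalone check (not part of this patch): the SSE3 hadd above reduces a float
// batch with two _mm_hadd_ps passes; the first leaves pairwise sums in the lanes,
// the second leaves the full sum in every lane. Assuming <pmmintrin.h> and SSE3
// support on the host:
#include <pmmintrin.h>
#include <cassert>

int main()
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 pairs = _mm_hadd_ps(v, v);         // {1+2, 3+4, 1+2, 3+4}
    __m128 total = _mm_hadd_ps(pairs, pairs); // {10, 10, 10, 10}
    assert(_mm_cvtss_f32(total) == 10.0f);
    return 0;
}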
100644 --- a/third_party/xsimd/arch/xsimd_sse4_1.hpp +++ b/third_party/xsimd/arch/xsimd_sse4_1.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_SSE4_1_HPP #define XSIMD_SSE4_1_HPP @@ -16,171 +16,239 @@ #include "../types/xsimd_sse4_1_register.hpp" -namespace xsimd { - - namespace kernel { - using namespace types; - // any - template::value, void>::type> - bool any(batch const& self, requires_arch) { - return !_mm_testz_si128(self, self); - } - // ceil - template batch ceil(batch const& self, requires_arch) { - return _mm_ceil_ps(self); - } - template batch ceil(batch const& self, requires_arch) { - return _mm_ceil_pd(self); - } +namespace xsimd +{ + + namespace kernel + { + using namespace types; + // any + template ::value, void>::type> + inline bool any(batch const& self, requires_arch) noexcept + { + return !_mm_testz_si128(self, self); + } + // ceil + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm_ceil_ps(self); + } + template + inline batch ceil(batch const& self, requires_arch) noexcept + { + return _mm_ceil_pd(self); + } - // eq - template::value, void>::type> - batch_bool eq(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 8: return _mm_cmpeq_epi64(self, other); - default: return eq(self, other, ssse3{}); - } - } + // eq + template ::value, void>::type> + inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 8: + return _mm_cmpeq_epi64(self, other); + default: + return eq(self, other, ssse3 {}); + } + } - // floor - template batch floor(batch const& self, requires_arch) { - return _mm_floor_ps(self); - } - template batch floor(batch const& self, requires_arch) { - return _mm_floor_pd(self); - } + // floor + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm_floor_ps(self); + } + template + inline batch floor(batch const& self, requires_arch) noexcept + { + return _mm_floor_pd(self); + } - // max - template::value, void>::type> - batch max(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_max_epi8(self, other); - case 2: return _mm_max_epi16(self, other); - case 4: return _mm_max_epi32(self, other); - default: return max(self, other, ssse3{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm_max_epu8(self, other); - case 2: return _mm_max_epu16(self, other); - case 4: return _mm_max_epu32(self, other); - default: return max(self, other, ssse3{}); - } - } - } + // max + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + if 
(std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_max_epi8(self, other); + case 2: + return _mm_max_epi16(self, other); + case 4: + return _mm_max_epi32(self, other); + default: + return max(self, other, ssse3 {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_max_epu8(self, other); + case 2: + return _mm_max_epu16(self, other); + case 4: + return _mm_max_epu32(self, other); + default: + return max(self, other, ssse3 {}); + } + } + } - // min - template::value, void>::type> - batch min(batch const& self, batch const& other, requires_arch) { - if(std::is_signed::value) { - switch(sizeof(T)) { - case 1: return _mm_min_epi8(self, other); - case 2: return _mm_min_epi16(self, other); - case 4: return _mm_min_epi32(self, other); - default: return min(self, other, ssse3{}); - } - } - else { - switch(sizeof(T)) { - case 1: return _mm_min_epu8(self, other); - case 2: return _mm_min_epu16(self, other); - case 4: return _mm_min_epu32(self, other); - default: return min(self, other, ssse3{}); - } - } - } + // min + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + switch (sizeof(T)) + { + case 1: + return _mm_min_epi8(self, other); + case 2: + return _mm_min_epi16(self, other); + case 4: + return _mm_min_epi32(self, other); + default: + return min(self, other, ssse3 {}); + } + } + else + { + switch (sizeof(T)) + { + case 1: + return _mm_min_epu8(self, other); + case 2: + return _mm_min_epu16(self, other); + case 4: + return _mm_min_epu32(self, other); + default: + return min(self, other, ssse3 {}); + } + } + } - // mul - template::value, void>::type> - batch mul(batch const& self, batch const& other, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_or_si128( - _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)), - _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8) - ); - case 2: return _mm_mullo_epi16(self, other); - case 4: return _mm_mullo_epi32(self, other); - case 8: - return _mm_add_epi64( - _mm_mul_epu32(self, other), - _mm_slli_epi64( - _mm_add_epi64( - _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), - _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), - 32)); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } + // mul + template ::value, void>::type> + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_or_si128( + _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)), + _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8)); + case 2: + return _mm_mullo_epi16(self, other); + case 4: + return _mm_mullo_epi32(self, other); + case 8: + return _mm_add_epi64( + _mm_mul_epu32(self, other), + _mm_slli_epi64( + _mm_add_epi64( + _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), + _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), + 32)); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - // nearbyint - template batch nearbyint(batch const& self, requires_arch) { - return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT); - } - template batch nearbyint(batch const& self, requires_arch) { - return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT); - } + // nearbyint + template + 
inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT); + } + template + inline batch nearbyint(batch const& self, requires_arch) noexcept + { + return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT); + } - // select - namespace detail { - template - constexpr T interleave(T const &cond) { - return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | - (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA); - } - } + // select + namespace detail + { + template + inline constexpr T interleave(T const& cond) noexcept + { + return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA); + } + } - template::value, void>::type> - batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_blendv_epi8(false_br, true_br, cond); - } - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_blendv_ps(false_br, true_br, cond); - } - template batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) { - return _mm_blendv_pd(false_br, true_br, cond); - } + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_blendv_epi8(false_br, true_br, cond); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_blendv_ps(false_br, true_br, cond); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_blendv_pd(false_br, true_br, cond); + } - template::value, void>::type> - batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) { - constexpr int mask = batch_bool_constant, Values...>::mask(); - switch(sizeof(T)) { - case 2: return _mm_blend_epi16(false_br, true_br, mask); - case 4: { - constexpr int imask = detail::interleave(mask); - return _mm_blend_epi16(false_br, true_br, imask); - } - case 8: { - constexpr int imask = detail::interleave(mask); - constexpr int imask2 = detail::interleave(imask); - return _mm_blend_epi16(false_br, true_br, imask2); - } - default: return select(batch_bool_constant, Values...>(), true_br, false_br, ssse3{}); - } - } - template batch select(batch_bool_constant, Values...> const& , batch const& true_br, batch const& false_br, requires_arch) { - constexpr int mask = batch_bool_constant, Values...>::mask(); - return _mm_blend_ps(false_br, true_br, mask); - } - template batch select(batch_bool_constant, Values...> const& , batch const& true_br, batch const& false_br, requires_arch) { - constexpr int mask = batch_bool_constant, Values...>::mask(); - return _mm_blend_pd(false_br, true_br, mask); - } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr int mask = batch_bool_constant, Values...>::mask(); + switch (sizeof(T)) + { + case 2: + return _mm_blend_epi16(false_br, true_br, mask); + case 4: + { + constexpr int imask = detail::interleave(mask); + return 
_mm_blend_epi16(false_br, true_br, imask);
+            }
+            case 8:
+            {
+                constexpr int imask = detail::interleave(mask);
+                constexpr int imask2 = detail::interleave(imask);
+                return _mm_blend_epi16(false_br, true_br, imask2);
+            }
+            default:
+                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
+            }
+        }
+        template <class A, bool... Values>
+        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+            return _mm_blend_ps(false_br, true_br, mask);
+        }
+        template <class A, bool... Values>
+        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+        {
+            constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+            return _mm_blend_pd(false_br, true_br, mask);
+        }
+        // trunc
+        template <class A>
+        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
+        }
+        template <class A>
+        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
+        }

-    // trunc
-    template<class A> batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) {
-      return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
-    }
-    template<class A> batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) {
-      return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
    }
-
-  }
-  }
 #endif
-
diff --git a/third_party/xsimd/arch/xsimd_sse4_2.hpp b/third_party/xsimd/arch/xsimd_sse4_2.hpp
index 2746302fa..8f9b7a76e 100644
--- a/third_party/xsimd/arch/xsimd_sse4_2.hpp
+++ b/third_party/xsimd/arch/xsimd_sse4_2.hpp
@@ -1,13 +1,13 @@
 /***************************************************************************
-* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
-* Martin Renou *
-* Copyright (c) QuantStack *
-* Copyright (c) Serge Guelton *
-* *
-* Distributed under the terms of the BSD 3-Clause License. *
-* *
-* The full license is in the file LICENSE, distributed with this software. *
-****************************************************************************/
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/

 #ifndef XSIMD_SSE4_2_HPP
 #define XSIMD_SSE4_2_HPP

@@ -16,26 +16,29 @@
 #include "../types/xsimd_sse4_2_register.hpp"

-namespace xsimd {
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // lt
+        template <class A>
+        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
+        {
+            return _mm_cmpgt_epi64(other, self);
+        }
+        template <class A>
+        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
+        {
+            auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+            auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+            return _mm_cmpgt_epi64(xother, xself);
+        }
-  namespace kernel {
-    using namespace types;
-
-    // lt
-    template<class A>
-    batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) {
-      return _mm_cmpgt_epi64(other, self);
-    }
-    template<class A>
-    batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) {
-      auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
-      auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
-      return _mm_cmpgt_epi64(xother, xself);
    }
-  }
-  }
 #endif
-
diff --git a/third_party/xsimd/arch/xsimd_ssse3.hpp b/third_party/xsimd/arch/xsimd_ssse3.hpp
index 4bc2bcfa9..e830cbc73 100644
--- a/third_party/xsimd/arch/xsimd_ssse3.hpp
+++ b/third_party/xsimd/arch/xsimd_ssse3.hpp
@@ -1,13 +1,13 @@
 /***************************************************************************
-* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
-* Martin Renou *
-* Copyright (c) QuantStack *
-* Copyright (c) Serge Guelton *
-* *
-* Distributed under the terms of the BSD 3-Clause License. *
-* *
-* The full license is in the file LICENSE, distributed with this software. *
-****************************************************************************/
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software.
* + ****************************************************************************/ #ifndef XSIMD_SSSE3_HPP #define XSIMD_SSSE3_HPP @@ -16,70 +16,90 @@ #include #include "../types/xsimd_ssse3_register.hpp" +#include "../types/xsimd_utils.hpp" -namespace xsimd { +namespace xsimd +{ - namespace kernel { - using namespace types; + namespace kernel + { + using namespace types; - // abs - template::value && std::is_signed::value, void>::type> - batch abs(batch const& self, requires_arch) { - switch(sizeof(T)) { - case 1: return _mm_abs_epi8(self); - case 2: return _mm_abs_epi16(self); - case 4: return _mm_abs_epi32(self); - case 8: return _mm_abs_epi64(self); - default: assert(false && "unsupported arch/op combination"); return {}; - } - } + // abs + template ::value && std::is_signed::value, void>::type> + inline batch abs(batch const& self, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 1: + return _mm_abs_epi8(self); + case 2: + return _mm_abs_epi16(self); + case 4: + return _mm_abs_epi32(self); + case 8: + return _mm_abs_epi64(self); + default: + assert(false && "unsupported arch/op combination"); + return {}; + } + } - // extract_pair - namespace detail { + // extract_pair + namespace detail + { - template - batch extract_pair(batch const&, batch const& other, std::size_t, ::xsimd::detail::index_sequence<>) { - return other; - } + template + inline batch extract_pair(batch const&, batch const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept + { + return other; + } - template - batch extract_pair(batch const& self, batch const& other, std::size_t i, ::xsimd::detail::index_sequence) { - if(i == I) { - return _mm_alignr_epi8(self, other, sizeof(T) * I); + template + inline batch extract_pair(batch const& self, batch const& other, std::size_t i, ::xsimd::detail::index_sequence) noexcept + { + if (i == I) + { + return _mm_alignr_epi8(self, other, sizeof(T) * I); + } + else + return extract_pair(self, other, i, ::xsimd::detail::index_sequence()); + } } - else - return extract_pair(self, other, i, ::xsimd::detail::index_sequence()); - } - } - template::value, void>::type> - batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) { - constexpr std::size_t size = batch::size; - assert(0<= i && i< size && "index in bounds"); - return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence()); - } + template ::value, void>::type> + inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(0 <= i && i < size && "index in bounds"); + return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence()); + } - // hadd - template::value, void>::type> - T hadd(batch const& self, requires_arch) { - switch(sizeof(T)) { - case 2: { + // hadd + template ::value, void>::type> + inline T hadd(batch const& self, requires_arch) noexcept + { + switch (sizeof(T)) + { + case 2: + { __m128i tmp1 = _mm_hadd_epi16(self, self); __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1); __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2); return _mm_cvtsi128_si32(tmp3) & 0xFFFF; - } - case 4: { + } + case 4: + { __m128i tmp1 = _mm_hadd_epi32(self, self); __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1); return _mm_cvtsi128_si32(tmp2); - } - default: return hadd(self, sse3{}); - } + } + default: + return hadd(self, sse3 {}); + } + } } - } } #endif - diff --git a/third_party/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/config/xsimd_arch.hpp index 
f88067e4b..7053ec90e 100644 --- a/third_party/xsimd/config/xsimd_arch.hpp +++ b/third_party/xsimd/config/xsimd_arch.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_ARCH_HPP #define XSIMD_ARCH_HPP @@ -16,160 +16,233 @@ #include #include -#include "./xsimd_config.hpp" #include "../types/xsimd_all_registers.hpp" +#include "./xsimd_config.hpp" #include "./xsimd_cpuid.hpp" -namespace xsimd { +namespace xsimd +{ - namespace detail { - // Checks whether T appears in Tys. - template struct contains; + namespace detail + { + // Checks whether T appears in Tys. + template + struct contains; - template struct contains : std::false_type {}; + template + struct contains : std::false_type + { + }; - template - struct contains - : std::conditional::value, std::true_type, - contains>::type {}; + template + struct contains + : std::conditional::value, std::true_type, + contains>::type + { + }; - template struct is_sorted; + template + struct is_sorted; - template <> struct is_sorted<> : std::true_type {}; + template <> + struct is_sorted<> : std::true_type + { + }; - template struct is_sorted : std::true_type {}; + template + struct is_sorted : std::true_type + { + }; - template - struct is_sorted - : std::conditional<(A0::version() >= A1::version()), is_sorted, - std::false_type>::type {}; + template + struct is_sorted + : std::conditional<(A0::version() >= A1::version()), is_sorted, + std::false_type>::type + { + }; + + template + inline constexpr T max_of(T value) noexcept + { + return value; + } - } // namespace detail + template + inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept + { + return max_of((head0 > head1 ? head0 : head1), tail...); + } + + } // namespace detail - // An arch_list is a list of architectures, sorted by version number. - template struct arch_list { + // An arch_list is a list of architectures, sorted by version number. + template + struct arch_list + { #ifndef NDEBUG - static_assert(detail::is_sorted::value, - "architecture list must be sorted by version"); + static_assert(detail::is_sorted::value, + "architecture list must be sorted by version"); #endif - template using add = arch_list; + template + using add = arch_list; - template - using extend = arch_list; + template + using extend = arch_list; - template static constexpr bool contains() { - return detail::contains::value; - } + template + static constexpr bool contains() noexcept + { + return detail::contains::value; + } - template static void for_each(F &&f) { - (void)std::initializer_list{(f(Archs{}), true)...}; - } - }; + template + static void for_each(F&& f) noexcept + { + (void)std::initializer_list { (f(Archs {}), true)... 
}; + } - struct unavailable {}; + static constexpr std::size_t alignment() noexcept + { + // all alignments are a power of two + return detail::max_of(Archs::alignment()..., static_cast(0)); + } + }; - namespace detail { - // Pick the best architecture in arch_list L, which is the last - // because architectures are sorted by version. - template struct best; + struct unavailable + { + static constexpr bool supported() noexcept { return false; } + static constexpr bool available() noexcept { return false; } + static constexpr unsigned version() noexcept { return 0; } + static constexpr std::size_t alignment() noexcept { return 0; } + static constexpr bool requires_alignment() noexcept { return false; } + static constexpr char const* name() noexcept { return ""; } + }; - template <> struct best> { using type = unavailable; }; + namespace detail + { + // Pick the best architecture in arch_list L, which is the last + // because architectures are sorted by version. + template + struct best; - template struct best> { - using type = Arch; - }; + template <> + struct best> + { + using type = unavailable; + }; - // Filter archlists Archs, picking only supported archs and adding - // them to L. - template struct supported_helper; + template + struct best> + { + using type = Arch; + }; - template struct supported_helper> { using type = L; }; + // Filter archlists Archs, picking only supported archs and adding + // them to L. + template + struct supported_helper; - template - struct supported_helper> - : supported_helper< - typename std::conditional, L>::type, - arch_list> {}; + template + struct supported_helper> + { + using type = L; + }; - template - struct supported : supported_helper, Archs...> {}; + template + struct supported_helper> + : supported_helper< + typename std::conditional, L>::type, + arch_list> + { + }; - // Joins all arch_list Archs in a single arch_list. - template struct join; + template + struct supported : supported_helper, Archs...> + { + }; - template struct join { using type = Arch; }; + // Joins all arch_list Archs in a single arch_list. + template + struct join; - template - struct join, Args...> - : join, Args...> {}; - } // namespace detail + template + struct join + { + using type = Arch; + }; - struct unsupported {}; - using all_x86_architectures = arch_list; - using all_arm_architectures = arch_list; - using all_architectures = typename detail::join::type; + template + struct join, Args...> + : join, Args...> + { + }; + } // namespace detail - using supported_architectures = typename detail::supported::type; + struct unsupported + { + }; + using all_x86_architectures = arch_list; + using all_arm_architectures = arch_list; + using all_architectures = typename detail::join::type; - using x86_arch = typename detail::best::type>::type; - using arm_arch = typename detail::best::type>::type; - //using default_arch = typename detail::best>::type>::type; - using default_arch = typename std::conditional::value, - arm_arch, - x86_arch>::type; + using supported_architectures = typename detail::supported::type; + using x86_arch = typename detail::best::type>::type; + using arm_arch = typename detail::best::type>::type; + // using default_arch = typename detail::best>::type>::type; + using default_arch = typename std::conditional::value, + arm_arch, + x86_arch>::type; namespace detail { - template + template class dispatcher { const unsigned best_arch; F functor; - template - auto walk_archs(arch_list, Tys&&... 
args) -> decltype(functor(Arch{}, std::forward(args)...)) + template + auto walk_archs(arch_list, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward(args)...)) { - static_assert(Arch::supported(), "dispatching on supported architecture"); assert(Arch::available() && "At least one arch must be supported during dispatch"); - return functor(Arch{}, std::forward(args)...); + return functor(Arch {}, std::forward(args)...); } - template - auto walk_archs(arch_list, Tys&&... args) -> decltype(functor(Arch{}, std::forward(args)...)) + template + auto walk_archs(arch_list, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward(args)...)) { - static_assert(Arch::supported(), "dispatching on supported architecture"); - if(Arch::version() == best_arch) - return functor(Arch{}, std::forward(args)...); + if (Arch::version() <= best_arch) + return functor(Arch {}, std::forward(args)...); else - return walk_archs(arch_list{}, std::forward(args)...); + return walk_archs(arch_list {}, std::forward(args)...); } - public: - - dispatcher(F f) : best_arch(available_architectures().best), functor(f) + public: + dispatcher(F f) noexcept + : best_arch(available_architectures().best) + , functor(f) { } - template - auto operator()(Tys&&... args) -> decltype(functor(default_arch{}, std::forward(args)...)) + template + auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward(args)...)) { - return walk_archs(ArchList{}, std::forward(args)...); + return walk_archs(ArchList {}, std::forward(args)...); } }; } // Generic function dispatch, à la ifunc - template - inline detail::dispatcher dispatch(F&& f) + template + inline detail::dispatcher dispatch(F&& f) noexcept { - return {std::forward(f)}; + return { std::forward(f) }; } } // namespace xsimd #endif - diff --git a/third_party/xsimd/config/xsimd_config.hpp b/third_party/xsimd/config/xsimd_config.hpp index a65726365..81cb7e11c 100644 --- a/third_party/xsimd/config/xsimd_config.hpp +++ b/third_party/xsimd/config/xsimd_config.hpp @@ -1,20 +1,20 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_CONFIG_HPP #define XSIMD_CONFIG_HPP #define XSIMD_VERSION_MAJOR 8 #define XSIMD_VERSION_MINOR 0 -#define XSIMD_VERSION_PATCH 3 +#define XSIMD_VERSION_PATCH 5 /** * high level free functions @@ -106,7 +106,7 @@ */ #ifdef __FMA__ -#if defined(__SSE__) && ! defined(__AVX__) +#if defined(__SSE__) && !defined(__AVX__) #define XSIMD_WITH_FMA3 1 #else #define XSIMD_WITH_FMA3 0 @@ -139,19 +139,19 @@ * Set to 1 if AVX512F is available at compile-time, to 0 otherwise. 
*/ #ifdef __AVX512F__ - // AVX512 instructions are supported starting with gcc 6 - // see https://www.gnu.org/software/gcc/gcc-6/changes.html - // check clang first, newer clang always defines __GNUC__ = 4 - #if defined(__clang__) && __clang_major__ >= 6 - #define XSIMD_WITH_AVX512F 1 - #elif defined(__GNUC__) && __GNUC__ < 6 - #define XSIMD_WITH_AVX512F 0 - #else - #define XSIMD_WITH_AVX512F 1 - #if __GNUC__ == 6 - #define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1 - #endif - #endif +// AVX512 instructions are supported starting with gcc 6 +// see https://www.gnu.org/software/gcc/gcc-6/changes.html +// check clang first, newer clang always defines __GNUC__ = 4 +#if defined(__clang__) && __clang_major__ >= 6 +#define XSIMD_WITH_AVX512F 1 +#elif defined(__GNUC__) && __GNUC__ < 6 +#define XSIMD_WITH_AVX512F 0 +#else +#define XSIMD_WITH_AVX512F 1 +#if __GNUC__ == 6 +#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1 +#endif +#endif #else #define XSIMD_WITH_AVX512F 0 #endif @@ -197,25 +197,25 @@ * * Set to 1 if NEON is available at compile-time, to 0 otherwise. */ - #if __ARM_ARCH >= 7 - #define XSIMD_WITH_NEON 1 - #else - #define XSIMD_WITH_NEON 0 - #endif +#if __ARM_ARCH >= 7 +#define XSIMD_WITH_NEON 1 +#else +#define XSIMD_WITH_NEON 0 +#endif /** * @ingroup xsimd_config_macro * * Set to 1 if NEON64 is available at compile-time, to 0 otherwise. */ - #ifdef __aarch64__ - #define XSIMD_WITH_NEON64 1 - #else - #define XSIMD_WITH_NEON64 0 - #endif +#ifdef __aarch64__ +#define XSIMD_WITH_NEON64 1 #else - #define XSIMD_WITH_NEON 0 - #define XSIMD_WITH_NEON64 0 +#define XSIMD_WITH_NEON64 0 +#endif +#else +#define XSIMD_WITH_NEON 0 +#define XSIMD_WITH_NEON64 0 #endif // Workaround for MSVC compiler @@ -229,6 +229,8 @@ #if XSIMD_WITH_AVX2 #undef XSIMD_WITH_AVX #define XSIMD_WITH_AVX 1 +#undef XSIMD_WITH_FMA5 +#define XSIMD_WITH_FMA5 1 #endif #if XSIMD_WITH_AVX @@ -263,4 +265,8 @@ #endif +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3 && !XSIMD_WITH_FMA5 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 +#define XSIMD_NO_SUPPORTED_ARCHITECTURE +#endif + #endif diff --git a/third_party/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/config/xsimd_cpuid.hpp index 48b8bd452..869d3e7f8 100644 --- a/third_party/xsimd/config/xsimd_cpuid.hpp +++ b/third_party/xsimd/config/xsimd_cpuid.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP @@ -16,8 +16,8 @@ #include #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM)) -#include #include +#include #endif #if defined(_MSC_VER) @@ -45,6 +45,8 @@ namespace xsimd unsigned avx : 1; unsigned avx2 : 1; unsigned avx512f : 1; + unsigned avx512cd : 1; + unsigned avx512dq : 1; unsigned avx512bw : 1; unsigned neon : 1; unsigned neon64 : 1; @@ -52,109 +54,113 @@ namespace xsimd // version number of the best arch available unsigned best; - supported_arch() + supported_arch() noexcept { - memset(this, 0, sizeof(supported_arch)); + memset(this, 0, sizeof(supported_arch)); #if defined(__aarch64__) || defined(_M_ARM64) - neon = 1; - neon64 = 1; - best = neon64::version(); + neon = 1; + neon64 = 1; + best = neon64::version(); #elif defined(__ARM_NEON) || defined(_M_ARM) #if defined(__linux__) - neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON); + neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON); #else - // that's very conservative :-/ - neon = 0; + // that's very conservative :-/ + neon = 0; #endif - neon64 = 0; - best = neon::version() * neon; + neon64 = 0; + best = neon::version() * neon; #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - auto get_cpuid = [](int reg[4], int func_id) - { + auto get_cpuid = [](int reg[4], int func_id) noexcept + { #if defined(_MSC_VER) - __cpuidex(reg, func_id, 0); + __cpuidex(reg, func_id, 0); #elif defined(__INTEL_COMPILER) - __cpuid(reg, func_id); + __cpuid(reg, func_id); #elif defined(__GNUC__) || defined(__clang__) -#if defined( __i386__ ) && defined(__PIC__) - // %ebx may be the PIC register - __asm__("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), - "=d" (reg[3]) - : "a" (func_id), "c" (0) - ); +#if defined(__i386__) && defined(__PIC__) + // %ebx may be the PIC register + __asm__("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), + "=d"(reg[3]) + : "a"(func_id), "c"(0)); #else - __asm__("cpuid\n\t" - : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), - "=d" (reg[3]) - : "a" (func_id), "c" (0) - ); + __asm__("cpuid\n\t" + : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), + "=d"(reg[3]) + : "a"(func_id), "c"(0)); #endif #else #error "Unsupported configuration" #endif - }; + }; + + int regs[4]; + + get_cpuid(regs, 0x1); - int regs[4]; + sse2 = regs[2] >> 26 & 1; + best = std::max(best, sse2::version() * sse2); - get_cpuid(regs, 0x1); + sse3 = regs[2] >> 0 & 1; + best = std::max(best, sse3::version() * sse3); - sse2 = regs[2] >> 26 & 1; - best = std::max(best, sse2::version() * sse2); + // ssse3 = regs[2] >> 9 & 1; + // best = std::max(best, ssse3::version() * ssse3); - sse3 = regs[2] >> 0 & 1; - best = std::max(best, sse3::version() * sse3); + sse4_1 = regs[2] >> 19 & 1; + best = std::max(best, sse4_1::version() * sse4_1); - //ssse3 = regs[2] >> 9 & 1; - //best = std::max(best, ssse3::version() * ssse3); + sse4_2 = regs[2] >> 20 & 1; + best = std::max(best, sse4_2::version() * sse4_2); - sse4_1 = regs[2] >> 19 & 1; - best = std::max(best, sse4_1::version() * sse4_1); + // sse4a = regs[2] >> 6 & 1; + // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a); - sse4_2 = regs[2] >> 20 & 1; - best = std::max(best, sse4_2::version() * sse4_2); + // xop = regs[2] >> 11 & 1; + // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop); - //sse4a = regs[2] >> 6 & 1; 
- //best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a); + avx = regs[2] >> 28 & 1; + best = std::max(best, avx::version() * avx); - //xop = regs[2] >> 11 & 1; - //best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop); + fma3 = regs[2] >> 12 & 1; - avx = regs[2] >> 28 & 1; - best = std::max(best, avx::version() * avx); + get_cpuid(regs, 0x7); + avx2 = regs[1] >> 5 & 1; + best = std::max(best, avx2::version() * avx2); + best = std::max(best, fma5::version() * avx2 * fma3); - //fma3 = regs[2] >> 12 & 1; - //best = std::max(best, XSIMD_X86_FMA3_VERSION * fma3); + avx512f = regs[1] >> 16 & 1; + best = std::max(best, avx512f::version() * avx512f); - get_cpuid(regs, 0x7); - avx2 = regs[1] >> 5 & 1; - best = std::max(best, avx2::version() * avx2); + avx512cd = regs[1] >> 28 & 1; + best = std::max(best, avx512cd::version() * avx512cd * avx512f); - avx512f = regs[1] >> 16 & 1; - best = std::max(best, avx512f::version() * avx512f); + avx512dq = regs[1] >> 17 & 1; + best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f); - avx512bw = regs[1] >> 30 & 1; - best = std::max(best, avx512bw::version() * avx512bw); + avx512bw = regs[1] >> 30 & 1; + best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f); - //get_cpuid(regs, 0x80000001); - //fma4 = regs[2] >> 16 & 1; - //best = std::max(best, XSIMD_X86_AMD_FMA4_VERSION * fma4); + // get_cpuid(regs, 0x80000001); + // fma4 = regs[2] >> 16 & 1; + // best = std::max(best, XSIMD_X86_AMD_FMA4_VERSION * fma4); #endif } }; } - inline detail::supported_arch available_architectures() + inline detail::supported_arch available_architectures() noexcept { static detail::supported_arch supported; return supported; diff --git a/third_party/xsimd/math/xsimd_rem_pio2.hpp b/third_party/xsimd/math/xsimd_rem_pio2.hpp index 65bcc772e..518ed8f0d 100644 --- a/third_party/xsimd/math/xsimd_rem_pio2.hpp +++ b/third_party/xsimd/math/xsimd_rem_pio2.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #include #include @@ -36,7 +36,7 @@ namespace xsimd #define ONCE0 while (0) #endif -/* + /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. 
* @@ -52,11 +52,9 @@ namespace xsimd #define XSIMD_LITTLE_ENDIAN #endif #elif defined(_WIN32) - // We can safely assume that Windows is always little endian - #define XSIMD_LITTLE_ENDIAN -#elif defined(i386) || defined(i486) || \ - defined(intel) || defined(x86) || defined(i86pc) || \ - defined(__alpha) || defined(__osf__) +// We can safely assume that Windows is always little endian +#define XSIMD_LITTLE_ENDIAN +#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__) #define XSIMD_LITTLE_ENDIAN #endif @@ -68,53 +66,49 @@ namespace xsimd #define HIGH_WORD_IDX 0 #endif -#define GET_HIGH_WORD(i, d) \ - do \ - { \ - double f = (d); \ - std::memcpy(&(i), reinterpret_cast(&f) + \ - HIGH_WORD_IDX, \ - sizeof(std::uint32_t)); \ - } \ - ONCE0 \ -/**/ - -#define GET_LOW_WORD(i, d) \ - do \ - { \ - double f = (d); \ - std::memcpy(&(i), reinterpret_cast(&f) + \ - LOW_WORD_IDX, \ - sizeof(std::uint32_t)); \ - } \ - ONCE0 \ -/**/ - -#define SET_HIGH_WORD(d, v) \ - do \ - { \ - double f = (d); \ - std::uint32_t value = (v); \ - std::memcpy(reinterpret_cast(&f) + \ - HIGH_WORD_IDX, \ - &value, sizeof(std::uint32_t)); \ - (d) = f; \ - } \ - ONCE0 \ -/**/ - -#define SET_LOW_WORD(d, v) \ - do \ - { \ - double f = (d); \ - std::uint32_t value = (v); \ - std::memcpy(reinterpret_cast(&f) + \ - LOW_WORD_IDX, \ - &value, sizeof(std::uint32_t)); \ - (d) = f; \ - } \ - ONCE0 \ - /**/ +#define GET_HIGH_WORD(i, d) \ + do \ + { \ + double f = (d); \ + std::memcpy(&(i), reinterpret_cast(&f) + HIGH_WORD_IDX, \ + sizeof(std::uint32_t)); \ + } \ + ONCE0 \ + /**/ + +#define GET_LOW_WORD(i, d) \ + do \ + { \ + double f = (d); \ + std::memcpy(&(i), reinterpret_cast(&f) + LOW_WORD_IDX, \ + sizeof(std::uint32_t)); \ + } \ + ONCE0 \ + /**/ + +#define SET_HIGH_WORD(d, v) \ + do \ + { \ + double f = (d); \ + std::uint32_t value = (v); \ + std::memcpy(reinterpret_cast(&f) + HIGH_WORD_IDX, \ + &value, sizeof(std::uint32_t)); \ + (d) = f; \ + } \ + ONCE0 \ + /**/ + +#define SET_LOW_WORD(d, v) \ + do \ + { \ + double f = (d); \ + std::uint32_t value = (v); \ + std::memcpy(reinterpret_cast(&f) + LOW_WORD_IDX, \ + &value, sizeof(std::uint32_t)); \ + (d) = f; \ + } \ + ONCE0 \ + /**/ /* * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2) @@ -223,9 +217,9 @@ namespace xsimd * */ - inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) + inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept { - static const int32_t init_jk[] = {2, 3, 4, 6}; /* initial value for jk */ + static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */ static const double PIo2[] = { 1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */ @@ -239,7 +233,8 @@ namespace xsimd }; static const double - zero = 0.0, + zero + = 0.0, one = 1.0, two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */ @@ -455,44 +450,125 @@ namespace xsimd return n & 7; } - - inline std::int32_t __ieee754_rem_pio2(double x, double* y) + inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept { static const std::int32_t two_over_pi[] = { - 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, - 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A, - 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129, - 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, - 0x3991D6, 
0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8, - 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF, - 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, - 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08, - 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3, - 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, - 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B, + 0xA2F983, + 0x6E4E44, + 0x1529FC, + 0x2757D1, + 0xF534DD, + 0xC0DB62, + 0x95993C, + 0x439041, + 0xFE5163, + 0xABDEBB, + 0xC561B7, + 0x246E3A, + 0x424DD2, + 0xE00649, + 0x2EEA09, + 0xD1921C, + 0xFE1DEB, + 0x1CB129, + 0xA73EE8, + 0x8235F5, + 0x2EBB44, + 0x84E99C, + 0x7026B4, + 0x5F7E41, + 0x3991D6, + 0x398353, + 0x39F49C, + 0x845F8B, + 0xBDF928, + 0x3B1FF8, + 0x97FFDE, + 0x05980F, + 0xEF2F11, + 0x8B5A0A, + 0x6D1F6D, + 0x367ECF, + 0x27CB09, + 0xB74F46, + 0x3F669E, + 0x5FEA2D, + 0x7527BA, + 0xC7EBE5, + 0xF17B3D, + 0x0739F7, + 0x8A5292, + 0xEA6BFB, + 0x5FB11F, + 0x8D5D08, + 0x560330, + 0x46FC7B, + 0x6BABF0, + 0xCFBC20, + 0x9AF436, + 0x1DA9E3, + 0x91615E, + 0xE61B08, + 0x659985, + 0x5F14A0, + 0x68408D, + 0xFFD880, + 0x4D7327, + 0x310606, + 0x1556CA, + 0x73A8C9, + 0x60E27B, + 0xC08C6B, }; static const std::int32_t npio2_hw[] = { - 0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C, - 0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C, - 0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A, - 0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C, - 0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB, - 0x404858EB, 0x404921FB, + 0x3FF921FB, + 0x400921FB, + 0x4012D97C, + 0x401921FB, + 0x401F6A7A, + 0x4022D97C, + 0x4025FDBB, + 0x402921FB, + 0x402C463A, + 0x402F6A7A, + 0x4031475C, + 0x4032D97C, + 0x40346B9C, + 0x4035FDBB, + 0x40378FDB, + 0x403921FB, + 0x403AB41B, + 0x403C463A, + 0x403DD85A, + 0x403F6A7A, + 0x40407E4C, + 0x4041475C, + 0x4042106C, + 0x4042D97C, + 0x4043A28C, + 0x40446B9C, + 0x404534AC, + 0x4045FDBB, + 0x4046C6CB, + 0x40478FDB, + 0x404858EB, + 0x404921FB, }; /* - * invpio2: 53 bits of 2/pi - * pio2_1: first 33 bit of pi/2 - * pio2_1t: pi/2 - pio2_1 - * pio2_2: second 33 bit of pi/2 - * pio2_2t: pi/2 - (pio2_1+pio2_2) - * pio2_3: third 33 bit of pi/2 - * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3) - */ + * invpio2: 53 bits of 2/pi + * pio2_1: first 33 bit of pi/2 + * pio2_1t: pi/2 - pio2_1 + * pio2_2: second 33 bit of pi/2 + * pio2_2t: pi/2 - (pio2_1+pio2_2) + * pio2_3: third 33 bit of pi/2 + * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3) + */ static const double - zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ + zero + = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ @@ -600,8 +676,8 @@ namespace xsimd return n; } /* - * all other (large) arguments - */ + * all other (large) arguments + */ if (ix >= 0x7ff00000) { /* x is inf or NaN */ y[0] = y[1] = x - x; diff --git a/third_party/xsimd/memory/xsimd_aligned_allocator.hpp b/third_party/xsimd/memory/xsimd_aligned_allocator.hpp index c620861ff..d599c7c2b 100644 --- a/third_party/xsimd/memory/xsimd_aligned_allocator.hpp +++ b/third_party/xsimd/memory/xsimd_aligned_allocator.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf 
Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_ALIGNED_ALLOCATOR_HPP #define XSIMD_ALIGNED_ALLOCATOR_HPP @@ -39,11 +39,10 @@ namespace xsimd * @tparam T type of objects to allocate. * @tparam Align alignment in bytes. */ - template + template class aligned_allocator { public: - using value_type = T; using pointer = T*; using const_pointer = const T*; @@ -92,14 +91,12 @@ namespace xsimd bool operator!=(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept; - void* aligned_malloc(size_t size, size_t alignment); void aligned_free(void* ptr); template size_t get_alignment_offset(const T* p, size_t size, size_t block_size); - /************************************ * aligned_allocator implementation * ************************************/ @@ -258,7 +255,7 @@ namespace xsimd } /** - * @ingroup allocator_comparison + * @ingroup allocator_comparison * Compares two aligned memory allocator for inequality. Since allocators * are stateless, return \c true iff A1 != A2. * @param lhs aligned_allocator to compare. @@ -272,7 +269,6 @@ namespace xsimd return !(lhs == rhs); } - /**************************************** * aligned malloc / free implementation * ****************************************/ @@ -287,7 +283,7 @@ namespace xsimd #ifdef _WIN32 res = _aligned_malloc(size, alignment); #else - if(posix_memalign(&res, alignment, size) != 0) + if (posix_memalign(&res, alignment, size) != 0) { res = nullptr; } @@ -342,11 +338,10 @@ namespace xsimd } } - template + template using default_allocator = typename std::conditional, - std::allocator - >::type; + aligned_allocator, + std::allocator>::type; } #endif diff --git a/third_party/xsimd/memory/xsimd_alignment.hpp b/third_party/xsimd/memory/xsimd_alignment.hpp index 40f75c5e2..70c1117b5 100644 --- a/third_party/xsimd/memory/xsimd_alignment.hpp +++ b/third_party/xsimd/memory/xsimd_alignment.hpp @@ -1,34 +1,34 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_ALIGNMENT_HPP #define XSIMD_ALIGNMENT_HPP -#include "xsimd_aligned_allocator.hpp" #include "../types/xsimd_utils.hpp" +#include "xsimd_aligned_allocator.hpp" namespace xsimd { /** - * @struct aligned_mode - * @brief tag for load and store of aligned memory. - */ + * @struct aligned_mode + * @brief tag for load and store of aligned memory. + */ struct aligned_mode { }; /** - * @struct unaligned_mode - * @brief tag for load and store of unaligned memory. - */ + * @struct unaligned_mode + * @brief tag for load and store of unaligned memory. + */ struct unaligned_mode { }; diff --git a/third_party/xsimd/stl/algorithms.hpp b/third_party/xsimd/stl/algorithms.hpp index ad9b02889..8e163582c 100644 --- a/third_party/xsimd/stl/algorithms.hpp +++ b/third_party/xsimd/stl/algorithms.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_ALGORITHMS_HPP #define XSIMD_ALGORITHMS_HPP @@ -21,8 +21,8 @@ namespace xsimd { - template - void transform(I1 first, I2 last, O1 out_first, UF&& f) + template + void transform(I1 first, I2 last, O1 out_first, UF&& f) noexcept { using value_type = typename std::decay::type; using batch_type = batch; @@ -75,8 +75,8 @@ namespace xsimd } } - template - void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f) + template + void transform(I1 first_1, I2 last_1, I3 first_2, O1 out_first, UF&& f) noexcept { using value_type = typename std::decay::type; using batch_type = batch; @@ -93,24 +93,24 @@ namespace xsimd std::size_t out_align = xsimd::get_alignment_offset(ptr_out, size, simd_size); std::size_t align_end = align_begin_1 + ((size - align_begin_1) & ~(simd_size - 1)); - #define XSIMD_LOOP_MACRO(A1, A2, A3) \ - for (std::size_t i = 0; i < align_begin_1; ++i) \ - { \ - out_first[i] = f(first_1[i], first_2[i]); \ - } \ - \ - batch_type batch_1, batch_2; \ - for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \ - { \ - batch_1 = batch_type::A1(&first_1[i]); \ - batch_2 = batch_type::A2(&first_2[i]); \ - xsimd::A3(&out_first[i], f(batch_1, batch_2)); \ - } \ - \ - for (std::size_t i = align_end; i < size; ++i) \ - { \ - out_first[i] = f(first_1[i], first_2[i]); \ - } \ +#define XSIMD_LOOP_MACRO(A1, A2, A3) \ + for (std::size_t i = 0; i < align_begin_1; ++i) \ + { \ + out_first[i] = f(first_1[i], first_2[i]); \ + } \ + \ + batch_type batch_1, batch_2; \ + for (std::size_t i = align_begin_1; i < align_end; i += simd_size) \ + { \ + batch_1 = batch_type::A1(&first_1[i]); \ + batch_2 = batch_type::A2(&first_2[i]); \ + xsimd::A3(&out_first[i], f(batch_1, batch_2)); \ + } \ + \ + for (std::size_t i = align_end; i < size; ++i) \ + { \ + out_first[i] 
= f(first_1[i], first_2[i]); \ + } if (align_begin_1 == out_align && align_begin_1 == align_begin_2) { @@ -129,23 +129,21 @@ namespace xsimd XSIMD_LOOP_MACRO(load_aligned, load_unaligned, store_unaligned); } - #undef XSIMD_LOOP_MACRO +#undef XSIMD_LOOP_MACRO } - // TODO: Remove this once we drop C++11 support namespace detail { struct plus { template - auto operator()(X&& x, Y&& y) -> decltype(x + y) { return x + y; } + auto operator()(X&& x, Y&& y) noexcept -> decltype(x + y) { return x + y; } }; } - - template - Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus{}) + template + Init reduce(Iterator1 first, Iterator2 last, Init init, BinaryFunction&& binfun = detail::plus {}) noexcept { using value_type = typename std::decay::type; using batch_type = batch; @@ -153,9 +151,9 @@ namespace xsimd std::size_t size = static_cast(std::distance(first, last)); constexpr std::size_t simd_size = batch_type::size; - if(size < simd_size) + if (size < simd_size) { - while(first != last) + while (first != last) { init = binfun(init, *first++); } @@ -186,7 +184,8 @@ namespace xsimd // reduce across batch alignas(batch_type) std::array arr; xsimd::store_aligned(arr.data(), batch_init); - for (auto x : arr) init = binfun(init, x); + for (auto x : arr) + init = binfun(init, x); // reduce final unaligned part for (std::size_t i = align_end; i < size; ++i) diff --git a/third_party/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/types/xsimd_all_registers.hpp index 46f8ca6cb..5894ea1e1 100644 --- a/third_party/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/types/xsimd_all_registers.hpp @@ -1,29 +1,28 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ +#include "../types/xsimd_fma3_register.hpp" #include "../types/xsimd_sse2_register.hpp" #include "../types/xsimd_sse3_register.hpp" #include "../types/xsimd_sse4_1_register.hpp" #include "../types/xsimd_sse4_2_register.hpp" -#include "../types/xsimd_fma3_register.hpp" -#include "../types/xsimd_avx_register.hpp" #include "../types/xsimd_avx2_register.hpp" +#include "../types/xsimd_avx_register.hpp" #include "../types/xsimd_fma5_register.hpp" -#include "../types/xsimd_avx512f_register.hpp" +#include "../types/xsimd_avx512bw_register.hpp" #include "../types/xsimd_avx512cd_register.hpp" #include "../types/xsimd_avx512dq_register.hpp" -#include "../types/xsimd_avx512bw_register.hpp" +#include "../types/xsimd_avx512f_register.hpp" -#include "xsimd_neon_register.hpp" #include "xsimd_neon64_register.hpp" - +#include "xsimd_neon_register.hpp" diff --git a/third_party/xsimd/types/xsimd_api.hpp b/third_party/xsimd/types/xsimd_api.hpp index e2e1795e0..0cc372743 100644 --- a/third_party/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/types/xsimd_api.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_API_HPP #define XSIMD_API_HPP @@ -17,1750 +17,1877 @@ #include #include +#include "../arch/xsimd_isa.hpp" #include "../types/xsimd_batch.hpp" #include "../types/xsimd_traits.hpp" -#include "../arch/xsimd_isa.hpp" - -namespace xsimd { - -/** - * high level free functions - * - * @defgroup batch_arithmetic Arithmetic operators - * @defgroup batch_constant Constant batches - * @defgroup batch_data_transfer Memory operators - * @defgroup batch_math Basic math operators - * @defgroup batch_math_extra Extra math operators - * @defgroup batch_fp Floating point manipulation - * @defgroup batch_rounding Rounding operators - * @defgroup batch_conversion Conversion operators - * @defgroup batch_complex_op Complex operators - * @defgroup batch_logical Logical operators - * @defgroup batch_bitwise Bitwise operators - * @defgroup batch_reducers Reducers - * @defgroup batch_miscellaneous Miscellaneous - * @defgroup batch_trigo Trigonometry - * - * @defgroup batch_bool_logical Boolean logical operators - * @defgroup batch_bool_reducers Boolean reducers - */ - -/** - * @ingroup batch_math - * - * Computes the absolute values of each scalar in the batch \c x. - * @param x batch of integer or floating point values. - * @return the absolute values of \c x. - */ -template -batch abs(batch const& x) { - return kernel::abs(x, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the absolute values of each complex in the batch \c z. - * @param z batch of complex values. - * @return the absolute values of \c z. 
- */ -template -batch abs(batch, A> const& z) { - return kernel::abs(z, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes the sum of the batches \c x and \c y. - * @param x batch or scalar involved in the addition. - * @param y batch or scalar involved in the addition. - * @return the sum of \c x and \c y - */ -template -auto add(T const& x, Tp const& y) -> decltype(x + y){ - return x + y; -} - -/** - * @ingroup batch_trigo - * - * Computes the arc cosine of the batch \c x. - * @param x batch of floating point values. - * @return the arc cosine of \c x. - */ -template -batch acos(batch const& x) { - return kernel::acos(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the inverse hyperbolic cosine of the batch \c x. - * @param x batch of floating point values. - * @return the inverse hyperbolic cosine of \c x. - */ -template -batch acosh(batch const& x) { - return kernel::acosh(x, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the argument of the batch \c z. - * @param z batch of complex or real values. - * @return the argument of \c z. - */ -template -real_batch_type_t> arg(batch const& z) { - return kernel::arg(z, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the arc sine of the batch \c x. - * @param x batch of floating point values. - * @return the arc sine of \c x. - */ -template -batch asin(batch const& x) { - return kernel::asin(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the inverse hyperbolic sine of the batch \c x. - * @param x batch of floating point values. - * @return the inverse hyperbolic sine of \c x. - */ -template -batch asinh(batch const& x) { - return kernel::asinh(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the arc tangent of the batch \c x. - * @param x batch of floating point values. - * @return the arc tangent of \c x. - */ -template -batch atan(batch const& x) { - return kernel::atan(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the arc tangent of the batch \c x/y, using the signs of the - * arguments to determine the correct quadrant. - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return the arc tangent of \c x/y. - */ -template -batch atan2(batch const& x, batch const& y) { - return kernel::atan2(x, y, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the inverse hyperbolic tangent of the batch \c x. - * @param x batch of floating point values. - * @return the inverse hyperbolic tangent of \c x. - */ -template -batch atanh(batch const& x) { - return kernel::atanh(x, A{}); -} - -/** - * @ingroup batch_conversion - * - * Perform a static_cast from \c T_in to \c T_out on \c \c x. - * @param x batch of \c T_in - * @return \c x casted to \c T_out - */ -template -batch batch_cast(batch const & x) { - return kernel::batch_cast(x, batch{}, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Computes the bit of sign of \c x - * @param x batch of scalar - * @return bit of sign of \c x - */ -template -batch bitofsign(batch const& x) { - return kernel::bitofsign(x, A{}); -} - -/** - * @ingroup batch_bitwise - * - * Computes the bitwise and of the batches \c x and \c y. - * @param x batch involved in the operation. - * @param y batch involved in the operation. - * @return the result of the bitwise and. - */ -template -auto bitwise_and(T const& x, Tp const& y) -> decltype(x & y){ - return x & y; -} - -/** - * @ingroup batch_bitwise - * - * Computes the bitwise and not of batches \c x and \c y. 
- * @param x batch involved in the operation. - * @param y batch involved in the operation. - * @return the result of the bitwise and not. - */ -template -batch bitwise_andnot(batch const& x, batch const& y) { - return kernel::bitwise_andnot(x, y, A{}); -} - - -/** - * @ingroup batch_bool_logical - * - * Computes the bitwise and not of batches \c x and \c y. - * @param x batch involved in the operation. - * @param y batch involved in the operation. - * @return the result of the bitwise and not. - */ -template -batch_bool bitwise_andnot(batch_bool const& x, batch_bool const& y) { - return kernel::bitwise_andnot(x, y, A{}); -} - -/** - * @ingroup batch_conversion - * - * Perform a reinterpret_cast from \c T_in to \c T_out on \c x. - * @param x batch of \c T_in - * @return \c x reinterpreted as \c T_out - */ -template -B bitwise_cast(batch const& x) { - return kernel::bitwise_cast(x, B{}, A{}); -} - -/** - * @ingroup batch_bitwise - * - * Computes the bitwise not of batch \c x. - * @param x batch involved in the operation. - * @return the result of the bitwise not. - */ -template -batch bitwise_not(batch const& x) { - return kernel::bitwise_not(x, A{}); -} - -/** - * @ingroup batch_bitwise - * - * Computes the bitwise or of the batches \c x and \c y. - * @param x scalar or batch of scalars - * @param y scalar or batch of scalars - * @return the result of the bitwise or. - */ -template -auto bitwise_or(T const& x, Tp const& y) -> decltype(x | y){ - return x | y; -} - -/** - * @ingroup batch_bitwise - * - * Computes the bitwise xor of the batches \c x and \c y. - * @param x scalar or batch of scalars - * @param y scalar or batch of scalars - * @return the result of the bitwise xor. - */ -template -auto bitwise_xor(T const& x, Tp const& y) -> decltype(x ^ y){ - return x ^ y; -} - -// FIXME: check if these need to be exposed, or removed (?) -template -batch_bool bool_cast(batch_bool const& x) { - return kernel::bool_cast(x, A{}); -} -template -batch_bool bool_cast(batch_bool const& x) { - return kernel::bool_cast(x, A{}); -} -template -batch_bool bool_cast(batch_bool const& x) { - return kernel::bool_cast(x, A{}); -} -template -batch_bool bool_cast(batch_bool const& x) { - return kernel::bool_cast(x, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the single value \c v. - * @param v the value used to initialize the batch - * @return a new batch instance - */ -template -batch broadcast(T v) { - return kernel::broadcast(v, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the single value \c v and - * the specified batch value type \c To. - * @param v the value used to initialize the batch - * @return a new batch instance - */ -template -simd_return_type broadcast_as(From v) { - using batch_value_type = typename simd_return_type::value_type; - using value_type = typename std::conditional::value, - bool, - batch_value_type>::type; - return simd_return_type(value_type(v)); -} - -/** - * @ingroup batch_math - * - * Computes the cubic root of the batch \c x. - * @param x batch of floating point values. - * @return the cubic root of \c x. - */ -template -batch cbrt(batch const& x) { - return kernel::cbrt(x, A{}); -} - -/** - * @ingroup batch_rounding - * - * Computes the batch of smallest integer values not less than - * scalars in \c x. - * @param x batch of floating point values. - * @return the batch of smallest integer values not less than \c x. 
- */ -template -batch ceil(batch const& x) { - return kernel::ceil(x, A{}); -} - - -/** - * @ingroup batch_math - * - * Clips the values of the batch \c x between those of the batches \c lo and \c hi. - * @param x batch of floating point values. - * @param lo batch of floating point values. - * @param hi batch of floating point values. - * @return the result of the clipping. - */ -template -batch clip(batch const& x, batch const& lo, batch const& hi) { - return kernel::clip(x, lo, hi, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the conjugate of the batch \c z. - * @param z batch of complex values. - * @return the argument of \c z. - */ -template -complex_batch_type_t> conj(batch const& z) { - return kernel::conj(z, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Computes a value whose absolute value matches - * that of \c x, but whose sign bit matches that of \c y. - * @param x batch of scalars - * @param y batch of scalars - * @return batch whose absolute value matches that of \c x, but whose sign bit - * matches that of \c y. - */ -template -batch copysign(batch const& x, batch const& y) { - return kernel::copysign(x, y, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the cosine of the batch \c x. - * @param x batch of floating point values. - * @return the cosine of \c x. - */ -template -batch cos(batch const& x) { - return kernel::cos(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * computes the hyperbolic cosine of the batch \c x. - * @param x batch of floating point values. - * @return the hyperbolic cosine of \c x. - */ -template -batch cosh(batch const& x) { - return kernel::cosh(x, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes the division of the batch \c x by the batch \c y. - * @param x scalar or batch of scalars - * @param y scalar or batch of scalars - * @return the result of the division. - */ -template -auto div(T const& x, Tp const& y) -> decltype(x / y){ - return x / y; -} - -/** - * @ingroup batch_logical - * - * Element-wise equality comparison of batches \c x and \c y. - * @param x batch of scalars - * @param y batch of scalars - * @return a boolean batch. - */ -template -batch_bool eq(batch const& x, batch const& y) { - return x == y; -} - -/** - * @ingroup batch_math - * - * Computes the natural exponential of the batch \c x. - * @param x batch of floating point values. - * @return the natural exponential of \c x. - */ -template -batch exp(batch const& x) { - return kernel::exp(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the base 10 exponential of the batch \c x. - * @param x batch of floating point values. - * @return the base 10 exponential of \c x. - */ -template -batch exp10(batch const& x) { - return kernel::exp10(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the base 2 exponential of the batch \c x. - * @param x batch of floating point values. - * @return the base 2 exponential of \c x. - */ -template -batch exp2(batch const& x) { - return kernel::exp2(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the natural exponential of the batch \c x, minus one. - * @param x batch of floating point values. - * @return the natural exponential of \c x, minus one. - */ -template -batch expm1(batch const& x) { - return kernel::expm1(x, A{}); -} - -/** - * @ingroup batch_math_extra - * - * Computes the error function of the batch \c x. - * @param x batch of floating point values. - * @return the error function of \c x. 
- */ -template -batch erf(batch const& x) { - return kernel::erf(x, A{}); -} - -/** - * @ingroup batch_math_extra - * - * Computes the complementary error function of the batch \c x. - * @param x batch of floating point values. - * @return the error function of \c x. - */ -template -batch erfc(batch const& x) { - return kernel::erfc(x, A{}); -} - -/** - * @ingroup batch_math_extra - * - * Evaluate polynomial with coefficient \c Coefs on point \c x using estrin - * method. - * @param x batch of floating point values. - * @return the evaluation ofpolynomial with coefficient \c Coefs on point \c x. - */ -template -batch estrin(const batch& x) { - return kernel::estrin(x); -} - -/** - * Extract vector from pair of vectors - * extracts the lowest vector elements from the second source \c x - * and the highest vector elements from the first source \c y - * Concatenates the results into th Return value. - * @param x batch of integer or floating point values. - * @param y batch of integer or floating point values. - * @param i integer specifuing the lowest vector element to extract from the first source register - * @return. - */ -template -batch extract_pair(batch const & x, batch const& y, std::size_t i) { - return kernel::extract_pair(x, y, i, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the absolute values of each scalar in the batch \c x. - * @param x batch floating point values. - * @return the asbolute values of \c x. - */ -template -batch fabs(batch const& x) { - return kernel::abs(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the positive difference between \c x and \c y, that is, - * max(0, x-y). - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return the positive difference. - */ -template -batch fdim(batch const& x, batch const& y) { - return kernel::fdim(x, y, A{}); -} - -/** - * @ingroup batch_rounding - * - * Computes the batch of largest integer values not greater than - * scalars in \c x. - * @param x batch of floating point values. - * @return the batch of largest integer values not greater than \c x. - */ -template -batch floor(batch const& x) { - return kernel::floor(x, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes (x*y) + z in a single instruction when possible. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @param z a batch of integer or floating point values. - * @return the result of the fused multiply-add operation. - */ -template -batch fma(batch const& x, batch const& y, batch const& z) { - return kernel::fma(x, y, z, A{}); -} - - -/** - * @ingroup batch_math - * - * Computes the larger values of the batches \c x and \c y. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @return a batch of the larger values. - */ -template -batch fmax(batch const& x, batch const& y) { - return kernel::max(x, y, A{}); -} - - -/** - * @ingroup batch_math - * - * Computes the smaller values of the batches \c x and \c y. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @return a batch of the larger values. - */ -template -batch fmin(batch const& x, batch const& y) { - return kernel::min(x, y, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the modulo of the batch \c x by the batch \c y. - * @param x batch involved in the modulo. - * @param y batch involved in the modulo. 
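A small example of the absolute-value and positive-difference helpers, under the same default-architecture assumption; the values are illustrative only.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<double> x(-1.5), y(2.0);

    auto a = xsimd::fabs(x);     // 1.5 per lane
    auto d = xsimd::fdim(y, x);  // max(0, y - x) = 3.5 per lane
    auto m = xsimd::fmax(x, y);  // 2.0 per lane

    std::cout << a << ' ' << d << ' ' << m << '\n';
}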
- * @return the result of the modulo. - */ -template -batch fmod(batch const& x, batch const& y) { - return kernel::fmod(x, y, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes (x*y) - z in a single instruction when possible. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @param z a batch of integer or floating point values. - * @return the result of the fused multiply-sub operation. - */ -template -batch fms(batch const& x, batch const& y, batch const& z) { - return kernel::fms(x, y, z, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes -(x*y) + z in a single instruction when possible. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @param z a batch of integer or floating point values. - * @return the result of the fused negated multiply-add operation. - */ -template -batch fnma(batch const& x, batch const& y, batch const& z) { - return kernel::fnma(x, y, z, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes -(x*y) - z in a single instruction when possible. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @param z a batch of integer or floating point values. - * @return the result of the fused negated multiply-sub operation. - */ -template -batch fnms(batch const& x, batch const& y, batch const& z) { - return kernel::fnms(x, y, z, A{}); -} - -/** - * @ingroup batch_fp - * - * Split split the number x into a normalized fraction and an exponent which is stored in exp - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @return the normalized fraction of x - */ -template -batch frexp(const batch& x, batch, A>& y) { - return kernel::frexp(x, y, A{}); -} - -/** - * @ingroup batch_logical - * - * Element-wise greater or equal comparison of batches \c x and \c y. - * @tparam X the actual type of batch. - * @param x batch involved in the comparison. - * @param y batch involved in the comparison. - * @return a boolean batch. - */ -template -batch_bool ge(batch const& x, batch const& y) { - return x >= y; -} - -/** - * @ingroup batch_logical - * - * Element-wise greater than comparison of batches \c x and \c y. - * @tparam X the actual type of batch. - * @param x batch involved in the comparison. - * @param y batch involved in the comparison. - * @return a boolean batch. - */ -template -batch_bool gt(batch const& x, batch const& y) { - return x > y; -} - -/** - * @ingroup batch_reducers - * - * Adds all the scalars of the batch \c x. - * @param x batch involved in the reduction - * @return the result of the reduction. - */ -template -T hadd(batch const& x) { - return kernel::hadd(x, A{}); -} - -/** - * @ingroup batch_reducers - * - * Parallel horizontal addition: adds the scalars of each batch - * in the array pointed by \c row and store them in a returned - * batch. - * @param row an array of \c N batches - * @return the result of the reduction. - */ -template -batch haddp(batch const* row) { - return kernel::haddp(row, A{}); -} - - -/** - * @ingroup batch_math_extra - * - * Evaluate polynomial with coefficient \c Coefs on point \c x using horner - * method. - * @param x batch of floating point values. - * @return the evaluation ofpolynomial with coefficient \c Coefs on point \c x. 
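The fused operations above compose naturally with the reducers; a minimal sketch, assuming the default architecture (the horizontal sum depends on the batch width).

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<float> x(2.0f), y(3.0f), z(1.0f);

    auto r1 = xsimd::fma(x, y, z);   //  x*y + z =  7 per lane
    auto r2 = xsimd::fms(x, y, z);   //  x*y - z =  5 per lane
    auto r3 = xsimd::fnma(x, y, z);  // -x*y + z = -5 per lane

    float sum = xsimd::hadd(r1);     // sum of all lanes of r1 (7 times the batch width)

    std::cout << r1 << ' ' << r2 << ' ' << r3 << ' ' << sum << '\n';
}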
- */ -template -batch horner(const batch& x) { - return kernel::horner(x); -} - -/** - * @ingroup batch_math - * - * Computes the square root of the sum of the squares of the batches - * \c x, and \c y. - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return the square root of the sum of the squares of \c x and \c y. - */ -template -batch hypot(batch const& x, batch const& y) { - return kernel::hypot(x, y, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the imaginary part of the batch \c z. - * @param z batch of complex or real values. - * @return the argument of \c z. - */ -template -real_batch_type_t> imag(batch const& x) { - return kernel::imag(x, A{}); -} - -/** - * @ingroup batch_constant - * - * Return a batch of scalars representing positive infinity - * @return a batch of positive infinity - */ -template -B infinity() { - using T = typename B::value_type; - return B(std::numeric_limits::infinity()); -} - -/** - * @ingroup batch_logical - * - * Determines if the scalars in the given batch \c x represent an even integer value - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -batch_bool is_even(batch const& x) { - return kernel::is_even(x, A{}); -} - -/** - * @ingroup batch_logical - * - * Determines if the floating-point scalars in the given batch \c x represent integer value - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -batch_bool is_flint(batch const& x) { - return kernel::is_flint(x, A{}); -} - -/** - * @ingroup batch_logical - * - * Determines if the scalars in the given batch \c x represent an odd integer value - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -batch_bool is_odd(batch const& x) { - return kernel::is_odd(x, A{}); -} - - -/** - * @ingroup batch_logical - * - * Determines if the scalars in the given batch \c x are inf values. - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -batch_bool isinf(batch const& x) { - return kernel::isinf(x, A{}); -} - - -/** - * @ingroup batch_logical - * - * Determines if the scalars in the given batch \c x are finite values. - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -batch_bool isfinite(batch const& x) { - return kernel::isfinite(x, A{}); -} - -/** - * @ingroup batch_logical - * - * Determines if the scalars in the given batch \c x are NaN values. - * @param x batch of floating point values. - * @return a batch of booleans. - */ -template -typename batch::batch_bool_type isnan(batch const& x) { - return kernel::isnan(x, A{}); -} - -/** - * @ingroup batch_math_extra - * - * Computes the multiplication of the floating- point number x by 2 raised to the power exp. - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return the natural logarithm of the gamma function of \c x. - */ -template -batch ldexp(const batch& x, const batch, A>& y) { - return kernel::ldexp(x, y, A{}); -} - -/** - * @ingroup batch_logical - * - * Element-wise lesser or equal to comparison of batches \c x and \c y. - * @param x batch involved in the comparison. - * @param y batch involved in the comparison. - * @return a boolean batch. - */ -template -batch_bool le(batch const& x, batch const& y) { - return x <= y; -} - -/** - * @ingroup batch_math_extra - * - * Computes the natural logarithm of the gamma function of the batch \c x. 
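A sketch of the classification predicates together with the `infinity` constant and the boolean reducers documented further down; default architecture assumed.

#include "xsimd/xsimd.hpp"
#include <iostream>
#include <limits>

int main()
{
    using b = xsimd::batch<float>;
    b x(std::numeric_limits<float>::quiet_NaN());
    b y = xsimd::infinity<b>();

    auto nan_mask = xsimd::isnan(x);     // true in every lane
    auto inf_mask = xsimd::isinf(y);     // true in every lane
    auto fin_mask = xsimd::isfinite(y);  // false in every lane

    std::cout << std::boolalpha
              << xsimd::all(nan_mask) << ' '
              << xsimd::any(fin_mask) << ' '
              << xsimd::all(inf_mask) << '\n';   // true false true
}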
- * @param x batch of floating point values. - * @return the natural logarithm of the gamma function of \c x. - */ -template -batch lgamma(batch const& x) { - return kernel::lgamma(x, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr and the specifed - * batch value type \c To. The memory needs to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -simd_return_type load_as(From const* ptr, aligned_mode) { - using batch_value_type = typename simd_return_type::value_type; - return kernel::load_aligned(ptr, kernel::convert{}, A{}); -} - -template -simd_return_type load_as(bool const* ptr, aligned_mode) { - return simd_return_type::load_aligned(ptr); -} -template -simd_return_type, To> load_as(std::complex const* ptr, aligned_mode) +namespace xsimd { - using batch_value_type = typename simd_return_type, To>::value_type; - return kernel::load_complex_aligned(ptr, kernel::convert{}, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr and the specifed - * batch value type \c To. The memory does not need to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -simd_return_type load_as(From const* ptr, unaligned_mode) { - using batch_value_type = typename simd_return_type::value_type; - return kernel::load_unaligned(ptr, kernel::convert{}, A{}); -} - -template -simd_return_type load_as(bool const* ptr, unaligned_mode) { - return simd_return_type::load_unaligned(ptr); -} - -template -simd_return_type, To> load_as(std::complex const* ptr, unaligned_mode) -{ - using batch_value_type = typename simd_return_type, To>::value_type; - return kernel::load_complex_unaligned(ptr, kernel::convert{}, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr. The - * memory needs to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -batch load(From const* ptr, aligned_mode= {}) { - return load_as(ptr, aligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr. The - * memory does not need to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -batch load(From const* ptr, unaligned_mode) { - return load_as(ptr, unaligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr. The - * memory needs to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -batch load_aligned(From const* ptr) { - return load_as(ptr, aligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Creates a batch from the buffer \c ptr. The - * memory does not need to be aligned. - * @param ptr the memory buffer to read - * @return a new batch instance - */ -template -batch load_unaligned(From const* ptr) { - return load_as(ptr, unaligned_mode{}); -} - -/** - * @ingroup batch_math - * - * Computes the natural logarithm of the batch \c x. - * @param x batch of floating point values. - * @return the natural logarithm of \c x. - */ -template -batch log(batch const& x) { - return kernel::log(x, A{}); -} - -/** - * @ingroup batch_math - * Computes the base 2 logarithm of the batch \c x. - * @param x batch of floating point values. - * @return the base 2 logarithm of \c x. 
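The load family is easiest to see on a concrete buffer; a minimal sketch, assuming the default architecture and sizing the buffer from the batch itself so the aligned overloads are safe to call.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    using b = xsimd::batch<double>;
    // Aligned storage for exactly one batch; the alignment follows the batch's architecture.
    alignas(b::arch_type::alignment()) double data[b::size] = {}; // zero-filled
    data[0] = 1.5;

    b v1 = xsimd::load_aligned(data);                              // aligned load
    b v2 = xsimd::load_as<double>(data, xsimd::unaligned_mode {}); // same data, no alignment assumption

    std::cout << v1 << '\n' << v2 << '\n';
}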
- */ -template -batch log2(batch const& x) { - return kernel::log2(x, A{}); -} - -/** - * @ingroup batch_math - * Computes the base 10 logarithm of the batch \c x. - * @param x batch of floating point values. - * @return the base 10 logarithm of \c x. - */ -template -batch log10(batch const& x) { - return kernel::log10(x, A{}); -} - -/** - * @ingroup batch_math - * Computes the natural logarithm of one plus the batch \c x. - * @param x batch of floating point values. - * @return the natural logarithm of one plus \c x. - */ -template -batch log1p(batch const& x) { - return kernel::log1p(x, A{}); -} - -/** - * @ingroup batch_logical - * - * Element-wise lesser than comparison of batches \c x and \c y. - * @param x batch involved in the comparison. - * @param y batch involved in the comparison. - * @return a boolean batch. - */ -template -batch_bool lt(batch const& x, batch const& y) { - return x < y; -} - -/** - * @ingroup batch_math - * - * Computes the larger values of the batches \c x and \c y. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @return a batch of the larger values. - */ -template -batch max(batch const& x, batch const& y) { - return kernel::max(x, y, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the smaller values of the batches \c x and \c y. - * @param x a batch of integer or floating point values. - * @param y a batch of integer or floating point values. - * @return a batch of the smaller values. - */ -template -batch min(batch const& x, batch const& y) { - return kernel::min(x, y, A{}); -} - -/** - * @ingroup batch_constant - * - * Return a batch of scalars representing positive infinity - * @return a batch of positive infinity - */ -template -B minusinfinity() { - using T = typename B::value_type; - return B(-std::numeric_limits::infinity()); -} - -/** - * @ingroup batch_arithmetic - * - * Computes the integer modulo of the batch \c x by the batch \c y. - * @param x batch involved in the modulo. - * @param y batch involved in the modulo. - * @return the result of the modulo. - */ -template -auto mod(T const& x, Tp const& y) -> decltype(x % y){ - return x % y; -} - -/** - * @ingroup batch_arithmetic - * - * Computes the product of the batches \c x and \c y. - * @tparam X the actual type of batch. - * @param x batch involved in the product. - * @param y batch involved in the product. - * @return the result of the product. - */ -template -auto mul(T const& x, Tp const& y) -> decltype(x * y){ - return x * y; -} - -/** - * @ingroup batch_rounding - * - * Rounds the scalars in \c x to integer values (in floating point format), using - * the current rounding mode. - * @param x batch of flaoting point values. - * @return the batch of nearest integer values. - */ -template -batch nearbyint(batch const& x) { - return kernel::nearbyint(x, A{}); -} - -/** - * @ingroup batch_logical - * - * Element-wise inequality comparison of batches \c x and \c y. - * @param x batch involved in the comparison. - * @param y batch involved in the comparison. - * @return a boolean batch. - */ -template -batch_bool neq(batch const& x, batch const& y) { - return x != y; -} - - -/** - * @ingroup batch_arithmetic - * - * Computes the opposite of the batch \c x. - * @param x batch involved in the operation. - * @return the opposite of \c x. 
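A small example combining the min/max selectors, a logarithm and the generic `mul` wrapper; values are illustrative and the default architecture is assumed.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<float> x(8.0f), y(2.0f);

    auto lo = xsimd::min(x, y);   // 2.0f per lane
    auto hi = xsimd::max(x, y);   // 8.0f per lane
    auto l2 = xsimd::log2(x);     // 3.0f per lane
    auto p  = xsimd::mul(x, y);   // same as x * y

    std::cout << lo << ' ' << hi << ' ' << l2 << ' ' << p << '\n';
}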
- */ -template -batch neg(batch const& x) { - return -x; -} - -/** - * @ingroup batch_math_extra - * - * Computes the next representable floating-point - * value following x in the direction of y - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return \c x raised to the power \c y. - */ -template -batch nextafter(batch const& x, batch const& y) { - return kernel::nextafter(x, y, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the norm of the batch \c x. - * @param x batch of complex or real values. - * @return the norm of \c x. - */ -template -real_batch_type_t> norm(batch const& x) { - return kernel::norm(x, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * No-op on \c x. - * @param x batch involved in the operation. - * @return \c x. - */ -template -batch pos(batch const& x) { - return +x; -} - -/** - * @ingroup batch_math - * - * Computes the value of the batch \c x raised to the power - * \c y. - * @param x batch of floating point values. - * @param y batch of floating point values. - * @return \c x raised to the power \c y. - */ -template -batch pow(batch const& x, batch const& y) { - return kernel::pow(x, y, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the value of the batch \c x raised to the power - * \c y. - * @param x batch of integral values. - * @param y batch of integral values. - * @return \c x raised to the power \c y. - */ -template::value, void>::type> -batch pow(batch const& x, ITy y) { - return kernel::ipow(x, y, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the projection of the batch \c x. - * @param x batch of complex or real values. - * @return the projection of \c x. - */ -template -complex_batch_type_t> proj(batch const& x) { - return kernel::proj(x, A{}); -} - -/** - * @ingroup batch_complex - * - * Computes the real part of the batch \c z. - * @param z batch of complex or real values. - * @return the argument of \c z. - */ -template -real_batch_type_t> real(batch const& x) { - return kernel::real(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the remainder of dividing \c x by \c y - * @param x batch of scalar values - * @param y batch of scalar values - * @return the result of the addition. - */ -template -batch remainder(batch const& x, batch const& y) { - return kernel::remainder(x, y, A{}); -} - -/** - * @ingroup batch_rounding - * - * Rounds the scalars in \c x to integer values (in floating point format), using - * the current rounding mode. - * @param x batch of floating point values. - * @return the batch of rounded values. - */ -template -batch rint(batch const& x) { - return nearbyint(x); -} - -/** - * @ingroup batch_rounding - * - * Computes the batch of nearest integer values to scalars in \c x (in - * floating point format), rounding halfway cases away from zero, regardless - * of the current rounding mode. - * @param x batch of flaoting point values. - * @return the batch of nearest integer values. - */ -template -batch round(batch const& x) { - return kernel::round(x, A{}); -} -/** - * @ingroup batch_arithmetic - * - * Computes the saturate sum of the batch \c x and the batch \c y. - * \c x. - * @tparam X the actual type of batch. - * @param x batch involved in the saturated addition. - * @param y batch involved in the saturated addition. - * @return the result of the saturated addition. 
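The two `pow` overloads (floating-point exponent and integral exponent) can be exercised side by side; a sketch under the default-architecture assumption.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<double> x(1.5), y(2.0);

    auto p1 = xsimd::pow(x, y);   // 1.5 ^ 2.0 per lane
    auto p2 = xsimd::pow(x, 3);   // integral-exponent overload (ipow kernel)
    auto r  = xsimd::round(x);    // 2.0 per lane, halfway cases away from zero

    std::cout << p1 << ' ' << p2 << ' ' << r << '\n';
}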
- */ -template -auto sadd(T const& x, Tp const& y) -> decltype(x + y) { - using B = decltype(x + y); - using A = typename B::arch_type; - return kernel::sadd(B(x), B(y), A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Ternary operator for batches: selects values from the batches \c true_br or \c false_br - * depending on the boolean values in the constant batch \c cond. Equivalent to - * \code{.cpp} - * for(std::size_t i = 0; i < N; ++i) - * res[i] = cond[i] ? true_br[i] : false_br[i]; - * \endcode - * @param cond constant batch condition. - * @param true_br batch values for truthy condition. - * @param false_br batch value for falsy condition. - * @return the result of the selection. - */ -template -batch select(batch_bool const& cond, batch const& true_br, batch const& false_br) { - return kernel::select(cond, true_br, false_br, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Ternary operator for batches: selects values from the batches \c true_br or \c false_br - * depending on the boolean values in the constant batch \c cond. Equivalent to - * \code{.cpp} - * for(std::size_t i = 0; i < N; ++i) - * res[i] = cond[i] ? true_br[i] : false_br[i]; - * \endcode - * @param cond constant batch condition. - * @param true_br batch values for truthy condition. - * @param false_br batch value for falsy condition. - * @return the result of the selection. - */ -template -batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br) { - return kernel::select(cond, true_br, false_br, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Ternary operator for batches: selects values from the batches \c true_br or \c false_br - * depending on the boolean values in the constant batch \c cond. Equivalent to - * \code{.cpp} - * for(std::size_t i = 0; i < N; ++i) - * res[i] = cond[i] ? true_br[i] : false_br[i]; - * \endcode - * @param cond constant batch condition. - * @param true_br batch values for truthy condition. - * @param false_br batch value for falsy condition. - * @return the result of the selection. - */ -template -batch select(batch_bool_constant, Values...> const& cond, batch const& true_br, batch const& false_br) { - return kernel::select(cond, true_br, false_br, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Computes the sign of \c x - * @param x batch - * @return -1 for each negative element, -1 or +1 for each null element and +1 for each element - */ -template -batch sign(batch const& x) { - return kernel::sign(x, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Computes the sign of \c x, assuming x doesn't have any zero - * @param x batch - * @return -1 for each negative element, -1 or +1 for each null element and +1 for each element - */ -template -batch signnz(batch const& x) { - return kernel::signnz(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the sine of the batch \c x. - * @param x batch of floating point values. - * @return the sine of \c x. - */ -template -batch sin(batch const& x) { - return kernel::sin(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the hyperbolic sine of the batch \c x. - * @param x batch of floating point values. - * @return the hyperbolic sine of \c x. - */ -template -batch sinh(batch const& x) { - return kernel::sinh(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the sine and the cosine of the batch \c x. This method is faster - * than calling sine and cosine independently. - * @param x batch of floating point values. 
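The `select` ternary maps directly onto the loop shown in its documentation; a minimal sketch, with illustrative lane values and the default architecture.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<float> x(3.0f), y(5.0f);

    auto cond = xsimd::lt(x, y);            // lane-wise x < y
    auto m    = xsimd::select(cond, x, y);  // x where cond is true, y elsewhere
    auto s    = xsimd::sign(y - x);         // +1 per lane here

    std::cout << m << ' ' << s << '\n';
}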
- * @return a pair containing the sine then the cosine of batch \c x - */ -template -std::pair, batch> sincos(batch const& x) { - return kernel::sincos(x, A{}); -} - -/** - * @ingroup batch_math - * - * Computes the square root of the batch \c x. - * @param x batch of floating point values. - * @return the square root of \c x. - */ -template -batch sqrt(batch const& x) { - return kernel::sqrt(x, A{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes the saturate difference of the batch \c x and the batch \c y. - * \c x. - * @tparam X the actual type of batch. - * @param x batch involved in the saturated difference. - * @param y batch involved in the saturated difference. - * @return the result of the saturated difference. - */ -template -auto ssub(T const& x, Tp const& y) -> decltype(x - y) { - using B = decltype(x + y); - using A = typename B::arch_type; - return kernel::ssub(B(x), B(y), A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c src to the buffer \c dst. The - * memory needs to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy - */ -template -void store_as(To* dst, batch const& src, aligned_mode) { - kernel::store_aligned(dst, src, A{}); -} - -template -void store_as(bool* dst, batch_bool const& src, aligned_mode) { - kernel::store(src, dst, A{}); -} - -template -void store_as(std::complex* dst, batch,A> const& src, aligned_mode) { - kernel::store_complex_aligned(dst, src, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c src to the buffer \c dst. The - * memory does not need to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy - */ -template -void store_as(To* dst, batch const& src, unaligned_mode) { - kernel::store_unaligned(dst, src, A{}); -} - -template -void store_as(bool* dst, batch_bool const& src, unaligned_mode) { - kernel::store(src, dst, A{}); -} - -template -void store_as(std::complex* dst, batch, A> const& src, unaligned_mode) { - kernel::store_complex_unaligned(dst, src, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c val to the buffer \c mem. The - * memory does not need to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy from - */ -template -void store(T* mem, batch const& val, aligned_mode={}) { - store_as(mem, val, aligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c val to the buffer \c mem. The - * memory does not need to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy from - */ -template -void store(T* mem, batch const& val, unaligned_mode) { - store_as(mem, val, unaligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c val to the buffer \c mem. The - * memory needs to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy from - */ -template -void store_aligned(T* mem, batch const& val) { - store_as(mem, val, aligned_mode{}); -} - -/** - * @ingroup batch_data_transfer - * - * Copy content of batch \c val to the buffer \c mem. The - * memory does not need to be aligned. - * @param mem the memory buffer to write to - * @param val the batch to copy - */ -template -void store_unaligned(T* mem, batch const& val) { - store_as(mem, val, unaligned_mode{}); -} - -/** - * @ingroup batch_arithmetic - * - * Computes the difference between \c x and \c y - * @tparam X the actual type of batch. 
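`sincos` returns both results in one call, and the store family writes them back to memory; a sketch assuming the default architecture, with the output buffer sized and aligned from the batch type.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    using b = xsimd::batch<float>;
    b x(0.5f);

    auto sc = xsimd::sincos(x);            // pair: { sin(x), cos(x) }, computed together

    alignas(b::arch_type::alignment()) float out[b::size];
    xsimd::store_aligned(out, sc.first);                         // write the sine lanes
    xsimd::store_as(out, sc.second, xsimd::unaligned_mode {});   // overwrite with the cosine lanes

    std::cout << out[0] << '\n';
}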
- * @param x scalar or batch of scalars - * @param y scalar or batch of scalars - * @return the difference between \c x and \c y - */ -template -auto sub(T const& x, Tp const& y) -> decltype(x - y){ - return x - y; -} - -/** - * @ingroup batch_trigo - * - * Computes the tangent of the batch \c x. - * @param x batch of floating point values. - * @return the tangent of \c x. - */ -template -batch tan(batch const& x) { - return kernel::tan(x, A{}); -} - -/** - * @ingroup batch_trigo - * - * Computes the hyperbolic tangent of the batch \c x. - * @param x batch of floating point values. - * @return the hyperbolic tangent of \c x. - */ -template -batch tanh(batch const& x) { - return kernel::tanh(x, A{}); -} - -/** - * @ingroup batch_math_extra - * - * Computes the gamma function of the batch \c x. - * @param x batch of floating point values. - * @return the gamma function of \c x. - */ -template -batch tgamma(batch const& x) { - return kernel::tgamma(x, A{}); -} - -/** - * @ingroup batch_conversion - * - * Perform a conversion from \c i to a value of an floating point type of the same size as \c T - * @param i batch of integers. - * @return \c i converted to a value of an floating point type of the same size as \c T - */ -template -batch, A> to_float(batch const& i) { - return kernel::to_float(i, A{}); -} - -/** - * @ingroup batch_conversion - * - * Perform a conversion from \c x to a value of an integer type of the same size as \c T - * @param x batch. - * @return \c x converted to a value of an integer type of the same size as \c T - */ -template -batch, A> to_int(batch const& x) { - return kernel::to_int(x, A{}); -} - -/** - * @ingroup batch_rounding - * - * Computes the batch of nearest integer values not greater in magnitude - * than scalars in \c x. - * @param x batch of floating point values. - * @return the batch of nearest integer values not greater in magnitude than \c x. - */ -template -batch trunc(batch const& x) { - return kernel::trunc(x, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Unpack and interleave data from the HIGH half of batches \c x and \c y. - * Store the results in the Return value. - * @param x a batch of integer or floating point or double precision values. - * @param y a batch of integer or floating point or double precision values. - * @return a batch of the high part of shuffled values. - */ -template -batch zip_hi(batch const& x, batch const& y) { - return kernel::zip_hi(x, y, A{}); -} - -/** - * @ingroup batch_data_transfer - * - * Unpack and interleave data from the LOW half of batches \c x and \c y. - * Store the results in the Return value. - * @param x a batch of integer or floating point or double precision values. - * @param y a batch of integer or floating point or double precision values. - * @return a batch of the low part of shuffled values. - */ -template -batch zip_lo(batch const& x, batch const& y) { - return kernel::zip_lo(x, y, A{}); -} - -// bitwise_cast -template ::value, int>::type = 3> -batch bitwise_cast(batch_bool const& self) -{ - T z(0); - return select(self, batch(T(~z)), batch(z)); -} - -template ::value, int>::type = 3> -batch bitwise_cast(batch_bool const& self) -{ - T z0(0), z1(0); - using int_type = as_unsigned_integer_t; - int_type value(~int_type(0)); - std::memcpy(&z1, &value, sizeof(int_type)); - return select(self, batch(z1), batch(z0)); -} - -/** - * @ingroup batch_bool_reducers - * - * Returns true if all the boolean values in the batch are true, - * false otherwise. - * @param x the batch to reduce. 
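A sketch of the size-preserving conversions and the interleaving primitives; the exact lane layout of `zip_lo`/`zip_hi` depends on the batch width, so only the broad behaviour is commented.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<float> x(1.25f), y(2.5f);

    auto xi = xsimd::to_int(x);     // batch of int32_t, 1 per lane
    auto xf = xsimd::to_float(xi);  // back to batch<float>, 1.0f per lane

    auto lo = xsimd::zip_lo(x, y);  // interleaves the low halves of x and y
    auto hi = xsimd::zip_hi(x, y);  // interleaves the high halves of x and y

    std::cout << xi << ' ' << xf << ' ' << lo << ' ' << hi << '\n';
}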
- * @return a boolean scalar. - */ -template -bool all(batch_bool const& x) { - return kernel::all(x, A{}); -} - -/** - * @ingroup batch_bool_reducers - * - * Return true if any of the boolean values in the batch is true, - * false otherwise. - * @param x the batch to reduce. - * @return a boolean scalar. - */ -template -bool any(batch_bool const& x) { - return kernel::any(x, A{}); -} - -/** - * @ingroup batch_miscellaneous - * - * Dump the content of batch \c x to stream \c o - * @param o the stream where the batch is dumped - * @param x batch to dump. - * @return a reference to \c o - */ -template -std::ostream& operator<<(std::ostream& o, batch const& x) { - constexpr auto size = batch::size; - alignas(A::alignment()) T buffer[size]; - x.store_aligned(&buffer[0]); - o << '('; - for(std::size_t i = 0; i < size - 1; ++i) - o << buffer[i] << ", "; - return o << buffer[size - 1] << ')'; -} + /** + * high level free functions + * + * @defgroup batch_arithmetic Arithmetic operators + * @defgroup batch_constant Constant batches + * @defgroup batch_data_transfer Memory operators + * @defgroup batch_math Basic math operators + * @defgroup batch_math_extra Extra math operators + * @defgroup batch_fp Floating point manipulation + * @defgroup batch_rounding Rounding operators + * @defgroup batch_conversion Conversion operators + * @defgroup batch_complex_op Complex operators + * @defgroup batch_logical Logical operators + * @defgroup batch_bitwise Bitwise operators + * @defgroup batch_reducers Reducers + * @defgroup batch_miscellaneous Miscellaneous + * @defgroup batch_trigo Trigonometry + * + * @defgroup batch_bool_logical Boolean logical operators + * @defgroup batch_bool_reducers Boolean reducers + */ + + /** + * @ingroup batch_math + * + * Computes the absolute values of each scalar in the batch \c x. + * @param x batch of integer or floating point values. + * @return the absolute values of \c x. + */ + template + inline batch abs(batch const& x) noexcept + { + return kernel::abs(x, A {}); + } + + /** + * @ingroup batch_complex + * + * Computes the absolute values of each complex in the batch \c z. + * @param z batch of complex values. + * @return the absolute values of \c z. + */ + template + inline batch abs(batch, A> const& z) noexcept + { + return kernel::abs(z, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the sum of the batches \c x and \c y. + * @param x batch or scalar involved in the addition. + * @param y batch or scalar involved in the addition. + * @return the sum of \c x and \c y + */ + template + inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y) + { + return x + y; + } + + /** + * @ingroup batch_trigo + * + * Computes the arc cosine of the batch \c x. + * @param x batch of floating point values. + * @return the arc cosine of \c x. + */ + template + inline batch acos(batch const& x) noexcept + { + return kernel::acos(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the inverse hyperbolic cosine of the batch \c x. + * @param x batch of floating point values. + * @return the inverse hyperbolic cosine of \c x. + */ + template + inline batch acosh(batch const& x) noexcept + { + return kernel::acosh(x, A {}); + } + + /** + * @ingroup batch_complex + * + * Computes the argument of the batch \c z. + * @param z batch of complex or real values. + * @return the argument of \c z. 
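The two `abs` overloads introduced here (real and complex) return different batch types; a minimal sketch, assuming the default architecture and illustrative values.

#include "xsimd/xsimd.hpp"
#include <complex>
#include <iostream>

int main()
{
    xsimd::batch<float> x(-4.0f);
    xsimd::batch<std::complex<float>> z(std::complex<float>(3.0f, 4.0f));

    auto ax = xsimd::abs(x);      // 4.0f per lane
    auto az = xsimd::abs(z);      // |3 + 4i| = 5.0f per lane, returned as a real batch
    auto s  = xsimd::add(ax, az); // same as ax + az

    std::cout << ax << ' ' << az << ' ' << s << '\n';
}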
+ */ + template + inline real_batch_type_t> arg(batch const& z) noexcept + { + return kernel::arg(z, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the arc sine of the batch \c x. + * @param x batch of floating point values. + * @return the arc sine of \c x. + */ + template + inline batch asin(batch const& x) noexcept + { + return kernel::asin(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the inverse hyperbolic sine of the batch \c x. + * @param x batch of floating point values. + * @return the inverse hyperbolic sine of \c x. + */ + template + inline batch asinh(batch const& x) noexcept + { + return kernel::asinh(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the arc tangent of the batch \c x. + * @param x batch of floating point values. + * @return the arc tangent of \c x. + */ + template + inline batch atan(batch const& x) noexcept + { + return kernel::atan(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the arc tangent of the batch \c x/y, using the signs of the + * arguments to determine the correct quadrant. + * @param x batch of floating point values. + * @param y batch of floating point values. + * @return the arc tangent of \c x/y. + */ + template + inline batch atan2(batch const& x, batch const& y) noexcept + { + return kernel::atan2(x, y, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the inverse hyperbolic tangent of the batch \c x. + * @param x batch of floating point values. + * @return the inverse hyperbolic tangent of \c x. + */ + template + inline batch atanh(batch const& x) noexcept + { + return kernel::atanh(x, A {}); + } + + /** + * @ingroup batch_conversion + * + * Perform a static_cast from \c T_in to \c T_out on \c \c x. + * @param x batch of \c T_in + * @return \c x casted to \c T_out + */ + template + inline batch batch_cast(batch const& x) noexcept + { + return kernel::batch_cast(x, batch {}, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Computes the bit of sign of \c x + * @param x batch of scalar + * @return bit of sign of \c x + */ + template + inline batch bitofsign(batch const& x) noexcept + { + return kernel::bitofsign(x, A {}); + } + + /** + * @ingroup batch_bitwise + * + * Computes the bitwise and of the batches \c x and \c y. + * @param x batch involved in the operation. + * @param y batch involved in the operation. + * @return the result of the bitwise and. + */ + template + inline auto bitwise_and(T const& x, Tp const& y) noexcept -> decltype(x & y) + { + return x & y; + } + + /** + * @ingroup batch_bitwise + * + * Computes the bitwise and not of batches \c x and \c y. + * @param x batch involved in the operation. + * @param y batch involved in the operation. + * @return the result of the bitwise and not. + */ + template + inline batch bitwise_andnot(batch const& x, batch const& y) noexcept + { + return kernel::bitwise_andnot(x, y, A {}); + } + + /** + * @ingroup batch_bool_logical + * + * Computes the bitwise and not of batches \c x and \c y. + * @param x batch involved in the operation. + * @param y batch involved in the operation. + * @return the result of the bitwise and not. + */ + template + inline batch_bool bitwise_andnot(batch_bool const& x, batch_bool const& y) noexcept + { + return kernel::bitwise_andnot(x, y, A {}); + } + + /** + * @ingroup batch_conversion + * + * Perform a reinterpret_cast from \c T_in to \c T_out on \c x. 
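A sketch contrasting `atan2` with the value-preserving `batch_cast`; the quoted results are what the documentation above implies for these illustrative inputs, under the default architecture.

#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <iostream>

int main()
{
    xsimd::batch<float> x(-1.0f), y(1.0f);

    auto q = xsimd::atan2(x, y);             // arc tangent of x/y with quadrant handling, about -pi/4 here
    auto i = xsimd::batch_cast<int32_t>(y);  // static_cast-like conversion, 1 per lane

    std::cout << q << ' ' << i << '\n';
}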
+ * @param x batch of \c T_in + * @return \c x reinterpreted as \c T_out + */ + template + inline B bitwise_cast(batch const& x) noexcept + { + return kernel::bitwise_cast(x, B {}, A {}); + } + + /** + * @ingroup batch_bitwise + * + * Computes the bitwise not of batch \c x. + * @param x batch involved in the operation. + * @return the result of the bitwise not. + */ + template + inline batch bitwise_not(batch const& x) noexcept + { + return kernel::bitwise_not(x, A {}); + } + + /** + * @ingroup batch_bitwise + * + * Computes the bitwise or of the batches \c x and \c y. + * @param x scalar or batch of scalars + * @param y scalar or batch of scalars + * @return the result of the bitwise or. + */ + template + inline auto bitwise_or(T const& x, Tp const& y) noexcept -> decltype(x | y) + { + return x | y; + } + + /** + * @ingroup batch_bitwise + * + * Computes the bitwise xor of the batches \c x and \c y. + * @param x scalar or batch of scalars + * @param y scalar or batch of scalars + * @return the result of the bitwise xor. + */ + template + inline auto bitwise_xor(T const& x, Tp const& y) noexcept -> decltype(x ^ y) + { + return x ^ y; + } + + // FIXME: check if these need to be exposed, or removed (?) + template + inline batch_bool bool_cast(batch_bool const& x) noexcept + { + return kernel::bool_cast(x, A {}); + } + template + inline batch_bool bool_cast(batch_bool const& x) noexcept + { + return kernel::bool_cast(x, A {}); + } + template + inline batch_bool bool_cast(batch_bool const& x) noexcept + { + return kernel::bool_cast(x, A {}); + } + template + inline batch_bool bool_cast(batch_bool const& x) noexcept + { + return kernel::bool_cast(x, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the single value \c v. + * @param v the value used to initialize the batch + * @return a new batch instance + */ + template + inline batch broadcast(T v) noexcept + { + return kernel::broadcast(v, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the single value \c v and + * the specified batch value type \c To. + * @param v the value used to initialize the batch + * @return a new batch instance + */ + template + inline simd_return_type broadcast_as(From v) noexcept + { + using batch_value_type = typename simd_return_type::value_type; + using value_type = typename std::conditional::value, + bool, + batch_value_type>::type; + return simd_return_type(value_type(v)); + } + + /** + * @ingroup batch_math + * + * Computes the cubic root of the batch \c x. + * @param x batch of floating point values. + * @return the cubic root of \c x. + */ + template + inline batch cbrt(batch const& x) noexcept + { + return kernel::cbrt(x, A {}); + } + + /** + * @ingroup batch_rounding + * + * Computes the batch of smallest integer values not less than + * scalars in \c x. + * @param x batch of floating point values. + * @return the batch of smallest integer values not less than \c x. + */ + template + inline batch ceil(batch const& x) noexcept + { + return kernel::ceil(x, A {}); + } + + /** + * @ingroup batch_math + * + * Clips the values of the batch \c x between those of the batches \c lo and \c hi. + * @param x batch of floating point values. + * @param lo batch of floating point values. + * @param hi batch of floating point values. + * @return the result of the clipping. 
+ */ + template + inline batch clip(batch const& x, batch const& lo, batch const& hi) noexcept + { + return kernel::clip(x, lo, hi, A {}); + } + + /** + * @ingroup batch_complex + * + * Computes the conjugate of the batch \c z. + * @param z batch of complex values. + * @return the argument of \c z. + */ + template + inline complex_batch_type_t> conj(batch const& z) noexcept + { + return kernel::conj(z, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Computes a value whose absolute value matches + * that of \c x, but whose sign bit matches that of \c y. + * @param x batch of scalars + * @param y batch of scalars + * @return batch whose absolute value matches that of \c x, but whose sign bit + * matches that of \c y. + */ + template + inline batch copysign(batch const& x, batch const& y) noexcept + { + return kernel::copysign(x, y, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the cosine of the batch \c x. + * @param x batch of floating point values. + * @return the cosine of \c x. + */ + template + inline batch cos(batch const& x) noexcept + { + return kernel::cos(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * computes the hyperbolic cosine of the batch \c x. + * @param x batch of floating point values. + * @return the hyperbolic cosine of \c x. + */ + template + inline batch cosh(batch const& x) noexcept + { + return kernel::cosh(x, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the division of the batch \c x by the batch \c y. + * @param x scalar or batch of scalars + * @param y scalar or batch of scalars + * @return the result of the division. + */ + template + inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y) + { + return x / y; + } + + /** + * @ingroup batch_logical + * + * Element-wise equality comparison of batches \c x and \c y. + * @param x batch of scalars + * @param y batch of scalars + * @return a boolean batch. + */ + template + inline batch_bool eq(batch const& x, batch const& y) noexcept + { + return x == y; + } + + /** + * @ingroup batch_math + * + * Computes the natural exponential of the batch \c x. + * @param x batch of floating point values. + * @return the natural exponential of \c x. + */ + template + inline batch exp(batch const& x) noexcept + { + return kernel::exp(x, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the base 10 exponential of the batch \c x. + * @param x batch of floating point values. + * @return the base 10 exponential of \c x. + */ + template + inline batch exp10(batch const& x) noexcept + { + return kernel::exp10(x, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the base 2 exponential of the batch \c x. + * @param x batch of floating point values. + * @return the base 2 exponential of \c x. + */ + template + inline batch exp2(batch const& x) noexcept + { + return kernel::exp2(x, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the natural exponential of the batch \c x, minus one. + * @param x batch of floating point values. + * @return the natural exponential of \c x, minus one. + */ + template + inline batch expm1(batch const& x) noexcept + { + return kernel::expm1(x, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Computes the error function of the batch \c x. + * @param x batch of floating point values. + * @return the error function of \c x. 
+ */ + template + inline batch erf(batch const& x) noexcept + { + return kernel::erf(x, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Computes the complementary error function of the batch \c x. + * @param x batch of floating point values. + * @return the error function of \c x. + */ + template + inline batch erfc(batch const& x) noexcept + { + return kernel::erfc(x, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Evaluate polynomial with coefficient \c Coefs on point \c x using estrin + * method. + * @param x batch of floating point values. + * @return the evaluation ofpolynomial with coefficient \c Coefs on point \c x. + */ + template + inline batch estrin(const batch& x) noexcept + { + return kernel::estrin(x); + } + + /** + * Extract vector from pair of vectors + * extracts the lowest vector elements from the second source \c x + * and the highest vector elements from the first source \c y + * Concatenates the results into th Return value. + * @param x batch of integer or floating point values. + * @param y batch of integer or floating point values. + * @param i integer specifuing the lowest vector element to extract from the first source register + * @return. + */ + template + inline batch extract_pair(batch const& x, batch const& y, std::size_t i) noexcept + { + return kernel::extract_pair(x, y, i, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the absolute values of each scalar in the batch \c x. + * @param x batch floating point values. + * @return the asbolute values of \c x. + */ + template + inline batch fabs(batch const& x) noexcept + { + return kernel::abs(x, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the positive difference between \c x and \c y, that is, + * max(0, x-y). + * @param x batch of floating point values. + * @param y batch of floating point values. + * @return the positive difference. + */ + template + inline batch fdim(batch const& x, batch const& y) noexcept + { + return kernel::fdim(x, y, A {}); + } + + /** + * @ingroup batch_rounding + * + * Computes the batch of largest integer values not greater than + * scalars in \c x. + * @param x batch of floating point values. + * @return the batch of largest integer values not greater than \c x. + */ + template + inline batch floor(batch const& x) noexcept + { + return kernel::floor(x, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes (x*y) + z in a single instruction when possible. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @param z a batch of integer or floating point values. + * @return the result of the fused multiply-add operation. + */ + template + inline batch fma(batch const& x, batch const& y, batch const& z) noexcept + { + return kernel::fma(x, y, z, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the larger values of the batches \c x and \c y. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @return a batch of the larger values. + */ + template + inline batch fmax(batch const& x, batch const& y) noexcept + { + return kernel::max(x, y, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the smaller values of the batches \c x and \c y. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @return a batch of the larger values. 
+ */ + template + inline batch fmin(batch const& x, batch const& y) noexcept + { + return kernel::min(x, y, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the modulo of the batch \c x by the batch \c y. + * @param x batch involved in the modulo. + * @param y batch involved in the modulo. + * @return the result of the modulo. + */ + template + inline batch fmod(batch const& x, batch const& y) noexcept + { + return kernel::fmod(x, y, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes (x*y) - z in a single instruction when possible. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @param z a batch of integer or floating point values. + * @return the result of the fused multiply-sub operation. + */ + template + inline batch fms(batch const& x, batch const& y, batch const& z) noexcept + { + return kernel::fms(x, y, z, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes -(x*y) + z in a single instruction when possible. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @param z a batch of integer or floating point values. + * @return the result of the fused negated multiply-add operation. + */ + template + inline batch fnma(batch const& x, batch const& y, batch const& z) noexcept + { + return kernel::fnma(x, y, z, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes -(x*y) - z in a single instruction when possible. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @param z a batch of integer or floating point values. + * @return the result of the fused negated multiply-sub operation. + */ + template + inline batch fnms(batch const& x, batch const& y, batch const& z) noexcept + { + return kernel::fnms(x, y, z, A {}); + } + + /** + * @ingroup batch_fp + * + * Split split the number x into a normalized fraction and an exponent which is stored in exp + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @return the normalized fraction of x + */ + template + inline batch frexp(const batch& x, batch, A>& y) noexcept + { + return kernel::frexp(x, y, A {}); + } + + /** + * @ingroup batch_logical + * + * Element-wise greater or equal comparison of batches \c x and \c y. + * @tparam X the actual type of batch. + * @param x batch involved in the comparison. + * @param y batch involved in the comparison. + * @return a boolean batch. + */ + template + inline batch_bool ge(batch const& x, batch const& y) noexcept + { + return x >= y; + } + + /** + * @ingroup batch_logical + * + * Element-wise greater than comparison of batches \c x and \c y. + * @tparam X the actual type of batch. + * @param x batch involved in the comparison. + * @param y batch involved in the comparison. + * @return a boolean batch. + */ + template + inline batch_bool gt(batch const& x, batch const& y) noexcept + { + return x > y; + } + + /** + * @ingroup batch_reducers + * + * Adds all the scalars of the batch \c x. + * @param x batch involved in the reduction + * @return the result of the reduction. + */ + template + inline T hadd(batch const& x) noexcept + { + return kernel::hadd(x, A {}); + } + + /** + * @ingroup batch_reducers + * + * Parallel horizontal addition: adds the scalars of each batch + * in the array pointed by \c row and store them in a returned + * batch. 
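`frexp` and `ldexp` are inverses of each other; a minimal sketch, assuming the default architecture (the exponent batch uses the integer type of the same width as the value type).

#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <iostream>

int main()
{
    xsimd::batch<float> x(6.0f);

    // frexp splits x into mantissa * 2^exponent and writes the exponents to e.
    xsimd::batch<int32_t> e;
    auto m = xsimd::frexp(x, e);    // m == 0.75f, e == 3 per lane (6 = 0.75 * 2^3)

    auto back = xsimd::ldexp(m, e); // reassembled: 6.0f per lane again

    std::cout << m << ' ' << e << ' ' << back << '\n';
}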
+ * @param row an array of \c N batches + * @return the result of the reduction. + */ + template + inline batch haddp(batch const* row) noexcept + { + return kernel::haddp(row, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Evaluate polynomial with coefficient \c Coefs on point \c x using horner + * method. + * @param x batch of floating point values. + * @return the evaluation ofpolynomial with coefficient \c Coefs on point \c x. + */ + template + inline batch horner(const batch& x) noexcept + { + return kernel::horner(x); + } + + /** + * @ingroup batch_math + * + * Computes the square root of the sum of the squares of the batches + * \c x, and \c y. + * @param x batch of floating point values. + * @param y batch of floating point values. + * @return the square root of the sum of the squares of \c x and \c y. + */ + template + inline batch hypot(batch const& x, batch const& y) noexcept + { + return kernel::hypot(x, y, A {}); + } + + /** + * @ingroup batch_complex + * + * Computes the imaginary part of the batch \c z. + * @param z batch of complex or real values. + * @return the argument of \c z. + */ + template + inline real_batch_type_t> imag(batch const& x) noexcept + { + return kernel::imag(x, A {}); + } + + /** + * @ingroup batch_constant + * + * Return a batch of scalars representing positive infinity + * @return a batch of positive infinity + */ + template + B infinity() + { + using T = typename B::value_type; + return B(std::numeric_limits::infinity()); + } + + /** + * @ingroup batch_logical + * + * Determines if the scalars in the given batch \c x represent an even integer value + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline batch_bool is_even(batch const& x) noexcept + { + return kernel::is_even(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Determines if the floating-point scalars in the given batch \c x represent integer value + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline batch_bool is_flint(batch const& x) noexcept + { + return kernel::is_flint(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Determines if the scalars in the given batch \c x represent an odd integer value + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline batch_bool is_odd(batch const& x) noexcept + { + return kernel::is_odd(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Determines if the scalars in the given batch \c x are inf values. + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline batch_bool isinf(batch const& x) noexcept + { + return kernel::isinf(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Determines if the scalars in the given batch \c x are finite values. + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline batch_bool isfinite(batch const& x) noexcept + { + return kernel::isfinite(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Determines if the scalars in the given batch \c x are NaN values. + * @param x batch of floating point values. + * @return a batch of booleans. + */ + template + inline typename batch::batch_bool_type isnan(batch const& x) noexcept + { + return kernel::isnan(x, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Computes the multiplication of the floating- point number x by 2 raised to the power exp. + * @param x batch of floating point values. 
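A short example of `hypot` together with the integer-value predicates, using the boolean reducers to collapse the masks; default architecture assumed.

#include "xsimd/xsimd.hpp"
#include <iostream>

int main()
{
    xsimd::batch<double> x(3.0), y(4.0);

    auto h = xsimd::hypot(x, y);   // sqrt(x*x + y*y) = 5.0 per lane
    auto e = xsimd::is_even(x);    // false: 3.0 holds an odd integer value
    auto f = xsimd::is_flint(h);   // true: 5.0 holds an integer value

    std::cout << h << ' ' << std::boolalpha
              << xsimd::any(e) << ' ' << xsimd::all(f) << '\n';
}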
+ * @param y batch of floating point values. + * @return the natural logarithm of the gamma function of \c x. + */ + template + inline batch ldexp(const batch& x, const batch, A>& y) noexcept + { + return kernel::ldexp(x, y, A {}); + } + + /** + * @ingroup batch_logical + * + * Element-wise lesser or equal to comparison of batches \c x and \c y. + * @param x batch involved in the comparison. + * @param y batch involved in the comparison. + * @return a boolean batch. + */ + template + inline batch_bool le(batch const& x, batch const& y) noexcept + { + return x <= y; + } + + /** + * @ingroup batch_math_extra + * + * Computes the natural logarithm of the gamma function of the batch \c x. + * @param x batch of floating point values. + * @return the natural logarithm of the gamma function of \c x. + */ + template + inline batch lgamma(batch const& x) noexcept + { + return kernel::lgamma(x, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr and the specifed + * batch value type \c To. The memory needs to be aligned. + * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline simd_return_type load_as(From const* ptr, aligned_mode) noexcept + { + using batch_value_type = typename simd_return_type::value_type; + return kernel::load_aligned(ptr, kernel::convert {}, A {}); + } + + template + inline simd_return_type load_as(bool const* ptr, aligned_mode) noexcept + { + return simd_return_type::load_aligned(ptr); + } + + template + inline simd_return_type, To> load_as(std::complex const* ptr, aligned_mode) noexcept + { + using batch_value_type = typename simd_return_type, To>::value_type; + return kernel::load_complex_aligned(ptr, kernel::convert {}, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr and the specifed + * batch value type \c To. The memory does not need to be aligned. + * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline simd_return_type load_as(From const* ptr, unaligned_mode) noexcept + { + using batch_value_type = typename simd_return_type::value_type; + return kernel::load_unaligned(ptr, kernel::convert {}, A {}); + } + + template + inline simd_return_type load_as(bool const* ptr, unaligned_mode) noexcept + { + return simd_return_type::load_unaligned(ptr); + } + + template + inline simd_return_type, To> load_as(std::complex const* ptr, unaligned_mode) noexcept + { + using batch_value_type = typename simd_return_type, To>::value_type; + return kernel::load_complex_unaligned(ptr, kernel::convert {}, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr. The + * memory needs to be aligned. + * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline batch load(From const* ptr, aligned_mode = {}) noexcept + { + return load_as(ptr, aligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr. The + * memory does not need to be aligned. + * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline batch load(From const* ptr, unaligned_mode) noexcept + { + return load_as(ptr, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr. The + * memory needs to be aligned. 
+ * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline batch load_aligned(From const* ptr) noexcept + { + return load_as(ptr, aligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr. The + * memory does not need to be aligned. + * @param ptr the memory buffer to read + * @return a new batch instance + */ + template + inline batch load_unaligned(From const* ptr) noexcept + { + return load_as(ptr, unaligned_mode {}); + } + + /** + * @ingroup batch_math + * + * Computes the natural logarithm of the batch \c x. + * @param x batch of floating point values. + * @return the natural logarithm of \c x. + */ + template + inline batch log(batch const& x) noexcept + { + return kernel::log(x, A {}); + } + + /** + * @ingroup batch_math + * Computes the base 2 logarithm of the batch \c x. + * @param x batch of floating point values. + * @return the base 2 logarithm of \c x. + */ + template + inline batch log2(batch const& x) noexcept + { + return kernel::log2(x, A {}); + } + + /** + * @ingroup batch_math + * Computes the base 10 logarithm of the batch \c x. + * @param x batch of floating point values. + * @return the base 10 logarithm of \c x. + */ + template + inline batch log10(batch const& x) noexcept + { + return kernel::log10(x, A {}); + } + + /** + * @ingroup batch_math + * Computes the natural logarithm of one plus the batch \c x. + * @param x batch of floating point values. + * @return the natural logarithm of one plus \c x. + */ + template + inline batch log1p(batch const& x) noexcept + { + return kernel::log1p(x, A {}); + } + + /** + * @ingroup batch_logical + * + * Element-wise lesser than comparison of batches \c x and \c y. + * @param x batch involved in the comparison. + * @param y batch involved in the comparison. + * @return a boolean batch. + */ + template + inline batch_bool lt(batch const& x, batch const& y) noexcept + { + return x < y; + } + + /** + * @ingroup batch_math + * + * Computes the larger values of the batches \c x and \c y. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @return a batch of the larger values. + */ + template + inline batch max(batch const& x, batch const& y) noexcept + { + return kernel::max(x, y, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the smaller values of the batches \c x and \c y. + * @param x a batch of integer or floating point values. + * @param y a batch of integer or floating point values. + * @return a batch of the smaller values. + */ + template + inline batch min(batch const& x, batch const& y) noexcept + { + return kernel::min(x, y, A {}); + } + + /** + * @ingroup batch_constant + * + * Return a batch of scalars representing positive infinity + * @return a batch of positive infinity + */ + template + inline B minusinfinity() noexcept + { + using T = typename B::value_type; + return B(-std::numeric_limits::infinity()); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the integer modulo of the batch \c x by the batch \c y. + * @param x batch involved in the modulo. + * @param y batch involved in the modulo. + * @return the result of the modulo. + */ + template + inline auto mod(T const& x, Tp const& y) noexcept -> decltype(x % y) + { + return x % y; + } + + /** + * @ingroup batch_arithmetic + * + * Computes the product of the batches \c x and \c y. + * @tparam X the actual type of batch. + * @param x batch involved in the product. 
+     * @param y batch involved in the product.
+     * @return the result of the product.
+     */
+    template <class T, class Tp>
+    inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
+    {
+        return x * y;
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> nearbyint(batch<T, A> const& x) noexcept
+    {
+        return kernel::nearbyint<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> neq(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        return x != y;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the opposite of the batch \c x.
+     * @param x batch involved in the operation.
+     * @return the opposite of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> neg(batch<T, A> const& x) noexcept
+    {
+        return -x;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the next representable floating-point
+     * value following \c x in the direction of \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the next representable value of \c x in the direction of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> nextafter(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        return kernel::nextafter<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the norm of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the norm of \c x.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& x) noexcept
+    {
+        return kernel::norm<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * No-op on \c x.
+     * @param x batch involved in the operation.
+     * @return \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> pos(batch<T, A> const& x) noexcept
+    {
+        return +x;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> pow(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        return kernel::pow<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of integral values.
+     * @param y scalar integral exponent.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class A, class ITy, class = typename std::enable_if<std::is_integral<ITy>::value, void>::type>
+    inline batch<T, A> pow(batch<T, A> const& x, ITy y) noexcept
+    {
+        return kernel::ipow<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the projection of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the projection of \c x.
+     */
+    template <class T, class A>
+    inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& x) noexcept
+    {
+        return kernel::proj<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the real part of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the real part of \c x.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& x) noexcept
+    {
+        return kernel::real<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the remainder of dividing \c x by \c y.
+     * @param x batch of scalar values
+     * @param y batch of scalar values
+     * @return the remainder of dividing \c x by \c y.
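+     *
+     * A short illustrative example (lane values arbitrary):
+     * \code{.cpp}
+     * xsimd::batch<float> x(5.5f), y(2.0f);
+     * auto r = xsimd::remainder(x, y); // each lane holds std::remainder(5.5f, 2.0f)
+     * \endcode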
+ */ + template + inline batch remainder(batch const& x, batch const& y) noexcept + { + return kernel::remainder(x, y, A {}); + } + + /** + * @ingroup batch_rounding + * + * Rounds the scalars in \c x to integer values (in floating point format), using + * the current rounding mode. + * @param x batch of floating point values. + * @return the batch of rounded values. + */ + template + inline batch rint(batch const& x) noexcept + { + return nearbyint(x); + } + + /** + * @ingroup batch_rounding + * + * Computes the batch of nearest integer values to scalars in \c x (in + * floating point format), rounding halfway cases away from zero, regardless + * of the current rounding mode. + * @param x batch of flaoting point values. + * @return the batch of nearest integer values. + */ + template + inline batch round(batch const& x) noexcept + { + return kernel::round(x, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the saturate sum of the batch \c x and the batch \c y. + * \c x. + * @tparam X the actual type of batch. + * @param x batch involved in the saturated addition. + * @param y batch involved in the saturated addition. + * @return the result of the saturated addition. + */ + template + inline auto sadd(T const& x, Tp const& y) noexcept -> decltype(x + y) + { + using B = decltype(x + y); + using A = typename B::arch_type; + return kernel::sadd(B(x), B(y), A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Ternary operator for batches: selects values from the batches \c true_br or \c false_br + * depending on the boolean values in the constant batch \c cond. Equivalent to + * \code{.cpp} + * for(std::size_t i = 0; i < N; ++i) + * res[i] = cond[i] ? true_br[i] : false_br[i]; + * \endcode + * @param cond constant batch condition. + * @param true_br batch values for truthy condition. + * @param false_br batch value for falsy condition. + * @return the result of the selection. + */ + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br) noexcept + { + return kernel::select(cond, true_br, false_br, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Ternary operator for batches: selects values from the batches \c true_br or \c false_br + * depending on the boolean values in the constant batch \c cond. Equivalent to + * \code{.cpp} + * for(std::size_t i = 0; i < N; ++i) + * res[i] = cond[i] ? true_br[i] : false_br[i]; + * \endcode + * @param cond constant batch condition. + * @param true_br batch values for truthy condition. + * @param false_br batch value for falsy condition. + * @return the result of the selection. + */ + template + inline batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br) noexcept + { + return kernel::select(cond, true_br, false_br, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Ternary operator for batches: selects values from the batches \c true_br or \c false_br + * depending on the boolean values in the constant batch \c cond. Equivalent to + * \code{.cpp} + * for(std::size_t i = 0; i < N; ++i) + * res[i] = cond[i] ? true_br[i] : false_br[i]; + * \endcode + * @param cond constant batch condition. + * @param true_br batch values for truthy condition. + * @param false_br batch value for falsy condition. + * @return the result of the selection. 
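+     *
+     * An illustrative sketch using the runtime-mask overload documented above,
+     * clamping negative lanes to zero:
+     * \code{.cpp}
+     * xsimd::batch<float> v(-1.5f);
+     * auto zero = xsimd::batch<float>(0.0f);
+     * auto clamped = xsimd::select(v < zero, zero, v); // every lane becomes 0.0f here
+     * \endcode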
+ */ + template + inline batch select(batch_bool_constant, Values...> const& cond, batch const& true_br, batch const& false_br) noexcept + { + return kernel::select(cond, true_br, false_br, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Computes the sign of \c x + * @param x batch + * @return -1 for each negative element, -1 or +1 for each null element and +1 for each element + */ + template + inline batch sign(batch const& x) noexcept + { + return kernel::sign(x, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Computes the sign of \c x, assuming x doesn't have any zero + * @param x batch + * @return -1 for each negative element, -1 or +1 for each null element and +1 for each element + */ + template + inline batch signnz(batch const& x) noexcept + { + return kernel::signnz(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the sine of the batch \c x. + * @param x batch of floating point values. + * @return the sine of \c x. + */ + template + inline batch sin(batch const& x) noexcept + { + return kernel::sin(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the hyperbolic sine of the batch \c x. + * @param x batch of floating point values. + * @return the hyperbolic sine of \c x. + */ + template + inline batch sinh(batch const& x) noexcept + { + return kernel::sinh(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the sine and the cosine of the batch \c x. This method is faster + * than calling sine and cosine independently. + * @param x batch of floating point values. + * @return a pair containing the sine then the cosine of batch \c x + */ + template + inline std::pair, batch> sincos(batch const& x) noexcept + { + return kernel::sincos(x, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the square root of the batch \c x. + * @param x batch of floating point values. + * @return the square root of \c x. + */ + template + inline batch sqrt(batch const& x) noexcept + { + return kernel::sqrt(x, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the saturate difference of the batch \c x and the batch \c y. + * \c x. + * @tparam X the actual type of batch. + * @param x batch involved in the saturated difference. + * @param y batch involved in the saturated difference. + * @return the result of the saturated difference. + */ + template + inline auto ssub(T const& x, Tp const& y) noexcept -> decltype(x - y) + { + using B = decltype(x + y); + using A = typename B::arch_type; + return kernel::ssub(B(x), B(y), A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c src to the buffer \c dst. The + * memory needs to be aligned. + * @param mem the memory buffer to write to + * @param val the batch to copy + */ + template + inline void store_as(To* dst, batch const& src, aligned_mode) noexcept + { + kernel::store_aligned(dst, src, A {}); + } + + template + inline void store_as(bool* dst, batch_bool const& src, aligned_mode) noexcept + { + kernel::store(src, dst, A {}); + } + + template + inline void store_as(std::complex* dst, batch, A> const& src, aligned_mode) noexcept + { + kernel::store_complex_aligned(dst, src, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c src to the buffer \c dst. The + * memory does not need to be aligned. 
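+     *
+     * A possible usage sketch (illustrative only; \c out is an ordinary,
+     * possibly unaligned buffer of at least batch-size floats):
+     * \code{.cpp}
+     * xsimd::batch<float> v(1.0f);
+     * float out[xsimd::batch<float>::size];
+     * xsimd::store_as(out, v, xsimd::unaligned_mode {});
+     * \endcode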
+ * @param mem the memory buffer to write to + * @param val the batch to copy + */ + template + inline void store_as(To* dst, batch const& src, unaligned_mode) noexcept + { + kernel::store_unaligned(dst, src, A {}); + } + + template + inline void store_as(bool* dst, batch_bool const& src, unaligned_mode) noexcept + { + kernel::store(src, dst, A {}); + } + + template + inline void store_as(std::complex* dst, batch, A> const& src, unaligned_mode) noexcept + { + kernel::store_complex_unaligned(dst, src, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c val to the buffer \c mem. The + * memory does not need to be aligned. + * @param mem the memory buffer to write to + * @param val the batch to copy from + */ + template + inline void store(T* mem, batch const& val, aligned_mode = {}) noexcept + { + store_as(mem, val, aligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c val to the buffer \c mem. The + * memory does not need to be aligned. + * @param mem the memory buffer to write to + * @param val the batch to copy from + */ + template + inline void store(T* mem, batch const& val, unaligned_mode) noexcept + { + store_as(mem, val, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c val to the buffer \c mem. The + * memory needs to be aligned. + * @param mem the memory buffer to write to + * @param val the batch to copy from + */ + template + inline void store_aligned(T* mem, batch const& val) noexcept + { + store_as(mem, val, aligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Copy content of batch \c val to the buffer \c mem. The + * memory does not need to be aligned. + * @param mem the memory buffer to write to + * @param val the batch to copy + */ + template + inline void store_unaligned(T* mem, batch const& val) noexcept + { + store_as(mem, val, unaligned_mode {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the difference between \c x and \c y + * @tparam X the actual type of batch. + * @param x scalar or batch of scalars + * @param y scalar or batch of scalars + * @return the difference between \c x and \c y + */ + template + inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y) + { + return x - y; + } + + /** + * @ingroup batch_trigo + * + * Computes the tangent of the batch \c x. + * @param x batch of floating point values. + * @return the tangent of \c x. + */ + template + inline batch tan(batch const& x) noexcept + { + return kernel::tan(x, A {}); + } + + /** + * @ingroup batch_trigo + * + * Computes the hyperbolic tangent of the batch \c x. + * @param x batch of floating point values. + * @return the hyperbolic tangent of \c x. + */ + template + inline batch tanh(batch const& x) noexcept + { + return kernel::tanh(x, A {}); + } + + /** + * @ingroup batch_math_extra + * + * Computes the gamma function of the batch \c x. + * @param x batch of floating point values. + * @return the gamma function of \c x. + */ + template + inline batch tgamma(batch const& x) noexcept + { + return kernel::tgamma(x, A {}); + } + + /** + * @ingroup batch_conversion + * + * Perform a conversion from \c i to a value of an floating point type of the same size as \c T + * @param i batch of integers. 
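+     *
+     * For example (illustrative only):
+     * \code{.cpp}
+     * xsimd::batch<int32_t> i(3);
+     * auto f = xsimd::to_float(i); // a float batch with every lane equal to 3.0f
+     * \endcode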
+ * @return \c i converted to a value of an floating point type of the same size as \c T + */ + template + inline batch, A> to_float(batch const& i) noexcept + { + return kernel::to_float(i, A {}); + } + + /** + * @ingroup batch_conversion + * + * Perform a conversion from \c x to a value of an integer type of the same size as \c T + * @param x batch. + * @return \c x converted to a value of an integer type of the same size as \c T + */ + template + inline batch, A> to_int(batch const& x) noexcept + { + return kernel::to_int(x, A {}); + } + + /** + * @ingroup batch_rounding + * + * Computes the batch of nearest integer values not greater in magnitude + * than scalars in \c x. + * @param x batch of floating point values. + * @return the batch of nearest integer values not greater in magnitude than \c x. + */ + template + inline batch trunc(batch const& x) noexcept + { + return kernel::trunc(x, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Unpack and interleave data from the HIGH half of batches \c x and \c y. + * Store the results in the Return value. + * @param x a batch of integer or floating point or double precision values. + * @param y a batch of integer or floating point or double precision values. + * @return a batch of the high part of shuffled values. + */ + template + inline batch zip_hi(batch const& x, batch const& y) noexcept + { + return kernel::zip_hi(x, y, A {}); + } + + /** + * @ingroup batch_data_transfer + * + * Unpack and interleave data from the LOW half of batches \c x and \c y. + * Store the results in the Return value. + * @param x a batch of integer or floating point or double precision values. + * @param y a batch of integer or floating point or double precision values. + * @return a batch of the low part of shuffled values. + */ + template + inline batch zip_lo(batch const& x, batch const& y) noexcept + { + return kernel::zip_lo(x, y, A {}); + } + + // bitwise_cast + template ::value, int>::type = 3> + inline batch bitwise_cast(batch_bool const& self) noexcept + { + T z(0); + return select(self, batch(T(~z)), batch(z)); + } + + template ::value, int>::type = 3> + inline batch bitwise_cast(batch_bool const& self) noexcept + { + T z0(0), z1(0); + using int_type = as_unsigned_integer_t; + int_type value(~int_type(0)); + std::memcpy(&z1, &value, sizeof(int_type)); + return select(self, batch(z1), batch(z0)); + } + + /** + * @ingroup batch_bool_reducers + * + * Returns true if all the boolean values in the batch are true, + * false otherwise. + * @param x the batch to reduce. + * @return a boolean scalar. + */ + template + inline bool all(batch_bool const& x) noexcept + { + return kernel::all(x, A {}); + } + + /** + * @ingroup batch_bool_reducers + * + * Return true if any of the boolean values in the batch is true, + * false otherwise. + * @param x the batch to reduce. + * @return a boolean scalar. + */ + template + inline bool any(batch_bool const& x) noexcept + { + return kernel::any(x, A {}); + } + + /** + * @ingroup batch_miscellaneous + * + * Dump the content of batch \c x to stream \c o + * @param o the stream where the batch is dumped + * @param x batch to dump. 
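+     *
+     * For example (illustrative only):
+     * \code{.cpp}
+     * xsimd::batch<int> b(42);
+     * std::cout << b << '\n'; // prints something like (42, 42, 42, 42)
+     * \endcode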
+ * @return a reference to \c o + */ + template + inline std::ostream& operator<<(std::ostream& o, batch const& x) noexcept + { + constexpr auto size = batch::size; + alignas(A::alignment()) T buffer[size]; + x.store_aligned(&buffer[0]); + o << '('; + for (std::size_t i = 0; i < size - 1; ++i) + o << buffer[i] << ", "; + return o << buffer[size - 1] << ')'; + } } #endif - diff --git a/third_party/xsimd/types/xsimd_avx2_register.hpp b/third_party/xsimd/types/xsimd_avx2_register.hpp index 5f55f4308..a02cdf848 100644 --- a/third_party/xsimd/types/xsimd_avx2_register.hpp +++ b/third_party/xsimd/types/xsimd_avx2_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX2_REGISTER_HPP #define XSIMD_AVX2_REGISTER_HPP @@ -23,10 +23,10 @@ namespace xsimd */ struct avx2 : avx { - static constexpr bool supported() { return XSIMD_WITH_AVX2; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(2, 2, 0); } - static constexpr char const* name() { return "avx2"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); } + static constexpr char const* name() noexcept { return "avx2"; } }; #if XSIMD_WITH_AVX2 @@ -38,4 +38,3 @@ namespace xsimd } #endif - diff --git a/third_party/xsimd/types/xsimd_avx512bw_register.hpp b/third_party/xsimd/types/xsimd_avx512bw_register.hpp index 0cabe1fa0..49633f5db 100644 --- a/third_party/xsimd/types/xsimd_avx512bw_register.hpp +++ b/third_party/xsimd/types/xsimd_avx512bw_register.hpp @@ -1,44 +1,48 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_AVX512BW_REGISTER_HPP #define XSIMD_AVX512BW_REGISTER_HPP #include "./xsimd_avx512dq_register.hpp" -namespace xsimd { - - /** - * @ingroup arch - * - * AVX512BW instructions - */ - struct avx512bw : avx512dq { - static constexpr bool supported() { return XSIMD_WITH_AVX512BW; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(3, 4, 0); } - static constexpr char const* name() { return "avx512bw"; } - }; +namespace xsimd +{ + + /** + * @ingroup arch + * + * AVX512BW instructions + */ + struct avx512bw : avx512dq + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); } + static constexpr char const* name() noexcept { return "avx512bw"; } + }; #if XSIMD_WITH_AVX512BW -namespace types { -template struct get_bool_simd_register { - using type = simd_avx512_bool_register; -}; + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq); + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq); - } + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_avx512cd_register.hpp b/third_party/xsimd/types/xsimd_avx512cd_register.hpp index d7d55423a..a63a546e1 100644 --- a/third_party/xsimd/types/xsimd_avx512cd_register.hpp +++ b/third_party/xsimd/types/xsimd_avx512cd_register.hpp @@ -1,44 +1,48 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_AVX512CD_REGISTER_HPP #define XSIMD_AVX512CD_REGISTER_HPP -#include "./xsimd_avx512cd_register.hpp" +#include "./xsimd_avx512f_register.hpp" -namespace xsimd { +namespace xsimd +{ - /** - * @ingroup arch - * - * AVX512CD instrutions - */ - struct avx512cd : avx512f { - static constexpr bool supported() { return XSIMD_WITH_AVX512BW; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(3, 2, 0); } - static constexpr char const* name() { return "avx512cd"; } - }; + /** + * @ingroup arch + * + * AVX512CD instrutions + */ + struct avx512cd : avx512f + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); } + static constexpr char const* name() noexcept { return "avx512cd"; } + }; #if XSIMD_WITH_AVX512BW -namespace types { -template struct get_bool_simd_register { - using type = simd_avx512_bool_register; -}; + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f); + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f); - } + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_avx512dq_register.hpp b/third_party/xsimd/types/xsimd_avx512dq_register.hpp index 37550bafb..41846e711 100644 --- a/third_party/xsimd/types/xsimd_avx512dq_register.hpp +++ b/third_party/xsimd/types/xsimd_avx512dq_register.hpp @@ -1,44 +1,48 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_AVX512DQ_REGISTER_HPP #define XSIMD_AVX512DQ_REGISTER_HPP #include "./xsimd_avx512cd_register.hpp" -namespace xsimd { - - /** - * @ingroup arch - * - * AVX512DQ instructions - */ - struct avx512dq : avx512cd { - static constexpr bool supported() { return XSIMD_WITH_AVX512DQ; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(3, 3, 0); } - static constexpr char const* name() { return "avx512dq"; } - }; +namespace xsimd +{ + + /** + * @ingroup arch + * + * AVX512DQ instructions + */ + struct avx512dq : avx512cd + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); } + static constexpr char const* name() noexcept { return "avx512dq"; } + }; #if XSIMD_WITH_AVX512DQ -namespace types { -template struct get_bool_simd_register { - using type = simd_avx512_bool_register; -}; + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd); + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd); - } + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_avx512f_register.hpp b/third_party/xsimd/types/xsimd_avx512f_register.hpp index b54412494..c24a91af5 100644 --- a/third_party/xsimd/types/xsimd_avx512f_register.hpp +++ b/third_party/xsimd/types/xsimd_avx512f_register.hpp @@ -1,71 +1,75 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_AVX512F_REGISTER_HPP #define XSIMD_AVX512F_REGISTER_HPP #include "./xsimd_generic_arch.hpp" -namespace xsimd { +namespace xsimd +{ - /** - * @ingroup arch - * - * AVX512F instructions - */ - struct avx512f : generic { - static constexpr bool supported() { return XSIMD_WITH_AVX512F; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(3, 1, 0); } - static constexpr std::size_t alignment() { return 64; } - static constexpr bool requires_alignment() { return true; } - static constexpr char const* name() { return "avx512f"; } - }; + /** + * @ingroup arch + * + * AVX512F instructions + */ + struct avx512f : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); } + static constexpr std::size_t alignment() noexcept { return 64; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr char const* name() noexcept { return "avx512f"; } + }; #if XSIMD_WITH_AVX512F -namespace types { -template struct simd_avx512_bool_register { - using register_type = typename std::conditional< - (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>, - std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type; - register_type data; - simd_avx512_bool_register() = default; - simd_avx512_bool_register(register_type r) { data = r; } - operator register_type() const { return data; } -}; -template struct get_bool_simd_register { - using type = simd_avx512_bool_register; -}; + namespace types + { + template + struct simd_avx512_bool_register + { + using register_type = typename std::conditional< + (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>, + std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type; + register_type data; + simd_avx512_bool_register() = default; + simd_avx512_bool_register(register_type r) { data = r; } + operator register_type() const noexcept { return data; } + }; + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512); + XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d); - - XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i); - 
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i); - XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512); - XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d); - - } + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_avx_register.hpp b/third_party/xsimd/types/xsimd_avx_register.hpp index e24bd0215..596a6b702 100644 --- a/third_party/xsimd/types/xsimd_avx_register.hpp +++ b/third_party/xsimd/types/xsimd_avx_register.hpp @@ -1,58 +1,62 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_AVX_REGISTER_HPP #define XSIMD_AVX_REGISTER_HPP #include "./xsimd_generic_arch.hpp" -namespace xsimd { - - /** - * @ingroup arch - * - * AVX instructions - */ - struct avx : generic { - static constexpr bool supported() { return XSIMD_WITH_AVX; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(2, 1, 0); } - static constexpr std::size_t alignment() { return 32; } - static constexpr bool requires_alignment() { return true; } - static constexpr char const* name() { return "avx"; } - }; +namespace xsimd +{ + + /** + * @ingroup arch + * + * AVX instructions + */ + struct avx : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); } + static constexpr std::size_t alignment() noexcept { return 32; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr char const* name() noexcept { return "avx"; } + }; } #if XSIMD_WITH_AVX #include -namespace xsimd { - namespace types { - - XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i); - XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256); - 
XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d); - } +namespace xsimd +{ + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256); + XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d); + } } #endif #endif diff --git a/third_party/xsimd/types/xsimd_batch.hpp b/third_party/xsimd/types/xsimd_batch.hpp index f0481de87..96c105962 100644 --- a/third_party/xsimd/types/xsimd_batch.hpp +++ b/third_party/xsimd/types/xsimd_batch.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_BATCH_HPP #define XSIMD_BATCH_HPP @@ -31,11 +31,10 @@ namespace xsimd * @tparam T the type of the underlying values. * @tparam A the architecture this batch is tied too. 
**/ - template + template class batch : public types::simd_register { public: - static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); using value_type = T; @@ -45,148 +44,148 @@ namespace xsimd // constructors batch() = default; - batch(T val); - batch(std::initializer_list data); - explicit batch(batch_bool_type const &b); - batch(register_type reg); + batch(T val) noexcept; + batch(std::initializer_list data) noexcept; + explicit batch(batch_bool_type const& b) noexcept; + batch(register_type reg) noexcept; - template - static XSIMD_NO_DISCARD batch broadcast(U val); + template + static XSIMD_NO_DISCARD batch broadcast(U val) noexcept; // memory operators - template - void store_aligned(U * mem) const; - template - void store_unaligned(U * mem) const; - template - void store(U * mem, aligned_mode) const; - template - void store(U * mem, unaligned_mode) const; - - template - static XSIMD_NO_DISCARD batch load_aligned(U const* mem) ; - template - static XSIMD_NO_DISCARD batch load_unaligned(U const* mem); - template - static XSIMD_NO_DISCARD batch load(U const* mem, aligned_mode); - template - static XSIMD_NO_DISCARD batch load(U const* mem, unaligned_mode); - - T get(std::size_t i) const; + template + void store_aligned(U* mem) const noexcept; + template + void store_unaligned(U* mem) const noexcept; + template + void store(U* mem, aligned_mode) const noexcept; + template + void store(U* mem, unaligned_mode) const noexcept; + + template + static XSIMD_NO_DISCARD batch load_aligned(U const* mem) noexcept; + template + static XSIMD_NO_DISCARD batch load_unaligned(U const* mem) noexcept; + template + static XSIMD_NO_DISCARD batch load(U const* mem, aligned_mode) noexcept; + template + static XSIMD_NO_DISCARD batch load(U const* mem, unaligned_mode) noexcept; + + T get(std::size_t i) const noexcept; // comparison operators - batch_bool_type operator==(batch const& other) const; - batch_bool_type operator!=(batch const& other) const; - batch_bool_type operator>=(batch const& other) const; - batch_bool_type operator<=(batch const& other) const; - batch_bool_type operator>(batch const& other) const; - batch_bool_type operator<(batch const& other) const; + batch_bool_type operator==(batch const& other) const noexcept; + batch_bool_type operator!=(batch const& other) const noexcept; + batch_bool_type operator>=(batch const& other) const noexcept; + batch_bool_type operator<=(batch const& other) const noexcept; + batch_bool_type operator>(batch const& other) const noexcept; + batch_bool_type operator<(batch const& other) const noexcept; // Update operators - batch& operator+=(batch const& other); - batch& operator-=(batch const& other); - batch& operator*=(batch const& other); - batch& operator/=(batch const& other); - batch& operator%=(batch const& other); - batch& operator&=(batch const& other); - batch& operator|=(batch const& other); - batch& operator^=(batch const& other); - batch& operator>>=(int32_t other); - batch& operator>>=(batch const& other); - batch& operator<<=(int32_t other); - batch& operator<<=(batch const& other); + batch& operator+=(batch const& other) noexcept; + batch& operator-=(batch const& other) noexcept; + batch& operator*=(batch const& other) noexcept; + batch& operator/=(batch const& other) noexcept; + batch& operator%=(batch const& other) noexcept; + batch& operator&=(batch const& other) noexcept; + batch& operator|=(batch const& other) noexcept; + batch& operator^=(batch const& other) noexcept; + batch& operator>>=(int32_t other) noexcept; + 
batch& operator>>=(batch const& other) noexcept; + batch& operator<<=(int32_t other) noexcept; + batch& operator<<=(batch const& other) noexcept; // incr/decr operators - batch& operator++(); - batch& operator--(); - batch operator++(int); - batch operator--(int); + batch& operator++() noexcept; + batch& operator--() noexcept; + batch operator++(int) noexcept; + batch operator--(int) noexcept; // unary operators - batch_bool_type operator!() const; - batch operator~() const; - batch operator-() const; - batch operator+() const; + batch_bool_type operator!() const noexcept; + batch operator~() const noexcept; + batch operator-() const noexcept; + batch operator+() const noexcept; // arithmetic operators. They are defined as friend to enable automatic // conversion of parameters from scalar to batch. Inline implementation // is required to avoid warnings. - friend batch operator+(batch const& self, batch const& other) + friend batch operator+(batch const& self, batch const& other) noexcept { return batch(self) += other; } - friend batch operator-(batch const& self, batch const& other) + friend batch operator-(batch const& self, batch const& other) noexcept { return batch(self) -= other; } - - friend batch operator*(batch const& self, batch const& other) + + friend batch operator*(batch const& self, batch const& other) noexcept { return batch(self) *= other; } - - friend batch operator/(batch const& self, batch const& other) + + friend batch operator/(batch const& self, batch const& other) noexcept { return batch(self) /= other; } - - friend batch operator%(batch const& self, batch const& other) + + friend batch operator%(batch const& self, batch const& other) noexcept { return batch(self) %= other; } - friend batch operator&(batch const& self, batch const& other) + friend batch operator&(batch const& self, batch const& other) noexcept { return batch(self) &= other; } - friend batch operator|(batch const& self, batch const& other) + friend batch operator|(batch const& self, batch const& other) noexcept { return batch(self) |= other; } - friend batch operator^(batch const& self, batch const& other) + friend batch operator^(batch const& self, batch const& other) noexcept { return batch(self) ^= other; } - friend batch operator>>(batch const& self, batch const& other) + friend batch operator>>(batch const& self, batch const& other) noexcept { - return batch(self) >>= other; + return batch(self) >>= other; } - friend batch operator<<(batch const& self, batch const& other) + friend batch operator<<(batch const& self, batch const& other) noexcept { return batch(self) <<= other; } - friend batch operator>>(batch const& self, int32_t other) + friend batch operator>>(batch const& self, int32_t other) noexcept { return batch(self) >>= other; } - friend batch operator<<(batch const& self, int32_t other) + friend batch operator<<(batch const& self, int32_t other) noexcept { return batch(self) <<= other; } - friend batch operator&&(batch const& self, batch const& other) + friend batch operator&&(batch const& self, batch const& other) noexcept { return batch(self).logical_and(other); } - friend batch operator||(batch const& self, batch const& other) + friend batch operator||(batch const& self, batch const& other) noexcept { return batch(self).logical_or(other); } + private: - - template - batch(T const* data, detail::index_sequence); + template + batch(T const* data, detail::index_sequence) noexcept; - batch logical_and(batch const& other) const; - batch logical_or(batch const& other) const; + batch 
logical_and(batch const& other) const noexcept; + batch logical_or(batch const& other) const noexcept; }; template @@ -201,11 +200,10 @@ namespace xsimd * @tparam T the type of the predicated values. * @tparam A the architecture this batch is tied too. **/ - template + template class batch_bool : public types::get_bool_simd_register_t { public: - static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); using value_type = bool; @@ -215,40 +213,39 @@ namespace xsimd // constructors batch_bool() = default; - batch_bool(bool val); - batch_bool(register_type reg); - batch_bool(std::initializer_list data); + batch_bool(bool val) noexcept; + batch_bool(register_type reg) noexcept; + batch_bool(std::initializer_list data) noexcept; // memory operators - void store_aligned(bool * mem) const; - void store_unaligned(bool * mem) const; - static XSIMD_NO_DISCARD batch_bool load_aligned(bool const * mem); - static XSIMD_NO_DISCARD batch_bool load_unaligned(bool const * mem); + void store_aligned(bool* mem) const noexcept; + void store_unaligned(bool* mem) const noexcept; + static XSIMD_NO_DISCARD batch_bool load_aligned(bool const* mem) noexcept; + static XSIMD_NO_DISCARD batch_bool load_unaligned(bool const* mem) noexcept; - bool get(std::size_t i) const; + bool get(std::size_t i) const noexcept; // comparison operators - batch_bool operator==(batch_bool const& other) const; - batch_bool operator!=(batch_bool const& other) const; + batch_bool operator==(batch_bool const& other) const noexcept; + batch_bool operator!=(batch_bool const& other) const noexcept; // logical operators - batch_bool operator~() const; - batch_bool operator!() const; - batch_bool operator&(batch_bool const& other) const; - batch_bool operator|(batch_bool const& other) const; - batch_bool operator&&(batch_bool const& other) const; - batch_bool operator||(batch_bool const& other) const; + batch_bool operator~() const noexcept; + batch_bool operator!() const noexcept; + batch_bool operator&(batch_bool const& other) const noexcept; + batch_bool operator|(batch_bool const& other) const noexcept; + batch_bool operator&&(batch_bool const& other) const noexcept; + batch_bool operator||(batch_bool const& other) const noexcept; private: - - template - batch_bool(bool const* data, detail::index_sequence); + template + batch_bool(bool const* data, detail::index_sequence) noexcept; template - static register_type make_register(detail::index_sequence, U u, V... v); + static register_type make_register(detail::index_sequence, U u, V... v) noexcept; template - static register_type make_register(detail::index_sequence<>, V... v); + static register_type make_register(detail::index_sequence<>, V... v) noexcept; }; template @@ -262,11 +259,10 @@ namespace xsimd * @tparam T the type of the underlying values. * @tparam A the architecture this batch is tied too. 
**/ - template + template class batch, A> { public: - using value_type = std::complex; using real_batch = batch; using arch_type = A; @@ -275,102 +271,101 @@ namespace xsimd // constructors batch() = default; - batch(value_type const& val); - batch(real_batch const& real, real_batch const& imag); - - batch(real_batch const& real); - batch(T val); - batch(std::initializer_list data); - explicit batch(batch_bool_type const& b); + batch(value_type const& val) noexcept; + batch(real_batch const& real, real_batch const& imag) noexcept; + + batch(real_batch const& real) noexcept; + batch(T val) noexcept; + batch(std::initializer_list data) noexcept; + explicit batch(batch_bool_type const& b) noexcept; // memory operators - static XSIMD_NO_DISCARD batch load_aligned(const T* real_src, const T* imag_src=nullptr); - static XSIMD_NO_DISCARD batch load_unaligned(const T* real_src, const T* imag_src=nullptr); - void store_aligned(T* real_dst, T* imag_dst) const; - void store_unaligned(T* real_dst, T* imag_dst) const; - - static XSIMD_NO_DISCARD batch load_aligned(const value_type* src); - static XSIMD_NO_DISCARD batch load_unaligned(const value_type* src); - void store_aligned(value_type* dst) const; - void store_unaligned(value_type* dst) const; - - template - static XSIMD_NO_DISCARD batch load(U const* mem, aligned_mode); - template - static XSIMD_NO_DISCARD batch load(U const* mem, unaligned_mode); - template - void store(U * mem, aligned_mode) const; - template - void store(U * mem, unaligned_mode) const; - - real_batch real() const; - real_batch imag() const; - - value_type get(std::size_t i) const; + static XSIMD_NO_DISCARD batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept; + static XSIMD_NO_DISCARD batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept; + void store_aligned(T* real_dst, T* imag_dst) const noexcept; + void store_unaligned(T* real_dst, T* imag_dst) const noexcept; + + static XSIMD_NO_DISCARD batch load_aligned(const value_type* src) noexcept; + static XSIMD_NO_DISCARD batch load_unaligned(const value_type* src) noexcept; + void store_aligned(value_type* dst) const noexcept; + void store_unaligned(value_type* dst) const noexcept; + + template + static XSIMD_NO_DISCARD batch load(U const* mem, aligned_mode) noexcept; + template + static XSIMD_NO_DISCARD batch load(U const* mem, unaligned_mode) noexcept; + template + void store(U* mem, aligned_mode) const noexcept; + template + void store(U* mem, unaligned_mode) const noexcept; + + real_batch real() const noexcept; + real_batch imag() const noexcept; + + value_type get(std::size_t i) const noexcept; #ifdef XSIMD_ENABLE_XTL_COMPLEX // xtl-related methods - template - batch(xtl::xcomplex const& val); - template - batch(std::initializer_list> data); - - template - static XSIMD_NO_DISCARD batch load_aligned(const xtl::xcomplex* src); - template - static XSIMD_NO_DISCARD batch load_unaligned(const xtl::xcomplex* src); - template - void store_aligned(xtl::xcomplex* dst) const; - template - void store_unaligned(xtl::xcomplex* dst) const; + template + batch(xtl::xcomplex const& val) noexcept; + template + batch(std::initializer_list> data) noexcept; + + template + static XSIMD_NO_DISCARD batch load_aligned(const xtl::xcomplex* src) noexcept; + template + static XSIMD_NO_DISCARD batch load_unaligned(const xtl::xcomplex* src) noexcept; + template + void store_aligned(xtl::xcomplex* dst) const noexcept; + template + void store_unaligned(xtl::xcomplex* dst) const noexcept; #endif // comparison 
operators - batch_bool operator==(batch const& other) const; - batch_bool operator!=(batch const& other) const; + batch_bool operator==(batch const& other) const noexcept; + batch_bool operator!=(batch const& other) const noexcept; // Update operators - batch& operator+=(batch const& other); - batch& operator-=(batch const& other); - batch& operator*=(batch const& other); - batch& operator/=(batch const& other); + batch& operator+=(batch const& other) noexcept; + batch& operator-=(batch const& other) noexcept; + batch& operator*=(batch const& other) noexcept; + batch& operator/=(batch const& other) noexcept; // incr/decr operators - batch& operator++(); - batch& operator--(); - batch operator++(int); - batch operator--(int); + batch& operator++() noexcept; + batch& operator--() noexcept; + batch operator++(int) noexcept; + batch operator--(int) noexcept; // unary operators - batch_bool_type operator!() const; - batch operator~() const; - batch operator-() const; - batch operator+() const; + batch_bool_type operator!() const noexcept; + batch operator~() const noexcept; + batch operator-() const noexcept; + batch operator+() const noexcept; // arithmetic operators. They are defined as friend to enable automatic // conversion of parameters from scalar to batch - friend batch operator+(batch const& self, batch const& other) + friend batch operator+(batch const& self, batch const& other) noexcept { return batch(self) += other; } - friend batch operator-(batch const& self, batch const& other) + friend batch operator-(batch const& self, batch const& other) noexcept { return batch(self) -= other; } - - friend batch operator*(batch const& self, batch const& other) + + friend batch operator*(batch const& self, batch const& other) noexcept { return batch(self) *= other; } - friend batch operator/(batch const& self, batch const& other) + friend batch operator/(batch const& self, batch const& other) noexcept { return batch(self) /= other; } private: - real_batch m_real; real_batch m_imag; }; @@ -378,49 +373,10 @@ namespace xsimd template constexpr std::size_t batch, A>::size; - /******************* - * real_batch_type * - *******************/ - - template - struct real_batch_type - { - using type = B; - }; - - template - struct real_batch_type, A>> - { - using type = batch; - }; - - template - using real_batch_type_t = typename real_batch_type::type; - - /********************** - * complex_batch_type * - **********************/ - - template - struct complex_batch_type - { - using real_value_type = typename B::value_type; - using arch_type = typename B::arch_type; - using type = batch, arch_type>; - }; - - template - struct complex_batch_type, A>> - { - using type = batch, A>; - }; - - template - using complex_batch_type_t = typename complex_batch_type::type; } -#include "../types/xsimd_batch_constant.hpp" #include "../arch/xsimd_isa.hpp" +#include "../types/xsimd_batch_constant.hpp" namespace xsimd { @@ -429,40 +385,41 @@ namespace xsimd * batch constructors * **********************/ - template - batch::batch(T val) - : types::simd_register(kernel::broadcast(val, A{})) + template + inline batch::batch(T val) noexcept + : types::simd_register(kernel::broadcast(val, A {})) { } - template - batch::batch(std::initializer_list data) + template + inline batch::batch(std::initializer_list data) noexcept : batch(data.begin(), detail::make_index_sequence()) { + assert(data.size() == size && "consistent initialization"); } - template - batch::batch(batch_bool const &b) - : batch(kernel::from_bool(b, A{})) + 
template + inline batch::batch(batch_bool const& b) noexcept + : batch(kernel::from_bool(b, A {})) { } - template - batch::batch(register_type reg) - : types::simd_register({reg}) + template + inline batch::batch(register_type reg) noexcept + : types::simd_register({ reg }) { } - template - template - batch::batch(T const*data, detail::index_sequence) - : batch(kernel::set(batch{}, A{}, data[Is]...)) + template + template + inline batch::batch(T const* data, detail::index_sequence) noexcept + : batch(kernel::set(batch {}, A {}, data[Is]...)) { } template - template - XSIMD_NO_DISCARD batch batch::broadcast(U val) + template + inline XSIMD_NO_DISCARD batch batch::broadcast(U val) noexcept { return batch(static_cast(val)); } @@ -472,15 +429,15 @@ namespace xsimd **************************/ /** - * Copy content of this batch to the buffer \c mem. The - * memory needs to be aligned. - * @param mem the memory buffer to read - */ - template - template - void batch::store_aligned(U* mem) const + * Copy content of this batch to the buffer \c mem. The + * memory needs to be aligned. + * @param mem the memory buffer to write to + */ + template + template + inline void batch::store_aligned(U* mem) const noexcept { - kernel::store_aligned(mem, *this, A{}); + kernel::store_aligned(mem, *this, A {}); } /** @@ -488,23 +445,23 @@ namespace xsimd * memory does not need to be aligned. * @param mem the memory buffer to write to */ - template - template - void batch::store_unaligned(U* mem) const + template + template + inline void batch::store_unaligned(U* mem) const noexcept { - kernel::store_unaligned(mem, *this, A{}); + kernel::store_unaligned(mem, *this, A {}); } - template - template - void batch::store(U * mem, aligned_mode) const + template + template + inline void batch::store(U* mem, aligned_mode) const noexcept { return store_aligned(mem); } - template - template - void batch::store(U * mem, unaligned_mode) const + template + template + inline void batch::store(U* mem, unaligned_mode) const noexcept { return store_unaligned(mem); } @@ -516,11 +473,11 @@ namespace xsimd * @param mem the memory buffer to read from. * @return a new batch instance. */ - template - template - batch batch::load_aligned(U const* mem) + template + template + inline batch batch::load_aligned(U const* mem) noexcept { - return kernel::load_aligned(mem, kernel::convert{}, A{}); + return kernel::load_aligned(mem, kernel::convert {}, A {}); } /** @@ -530,29 +487,29 @@ namespace xsimd * @param mem the memory buffer to read from. * @return a new batch instance.
*/ - template - template - batch batch::load_unaligned(U const* mem) + template + template + inline batch batch::load_unaligned(U const* mem) noexcept { - return kernel::load_unaligned(mem, kernel::convert{}, A{}); + return kernel::load_unaligned(mem, kernel::convert {}, A {}); } - template - template - batch batch::load(U const* mem, aligned_mode) + template + template + inline batch batch::load(U const* mem, aligned_mode) noexcept { return load_aligned(mem); } - template - template - batch batch::load(U const* mem, unaligned_mode) + template + template + inline batch batch::load(U const* mem, unaligned_mode) noexcept { return load_unaligned(mem); } template - T batch::get(std::size_t i) const + inline T batch::get(std::size_t i) const noexcept { alignas(A::alignment()) T buffer[size]; store_aligned(&buffer[0]); @@ -563,145 +520,145 @@ namespace xsimd * batch comparison operators * ******************************/ - template - batch_bool batch::operator==(batch const& other) const + template + inline batch_bool batch::operator==(batch const& other) const noexcept { - return kernel::eq(*this, other, A{}); + return kernel::eq(*this, other, A {}); } - template - batch_bool batch::operator!=(batch const& other) const + template + inline batch_bool batch::operator!=(batch const& other) const noexcept { - return kernel::neq(*this, other, A{}); + return kernel::neq(*this, other, A {}); } - template - batch_bool batch::operator>=(batch const& other) const + template + inline batch_bool batch::operator>=(batch const& other) const noexcept { - return kernel::ge(*this, other, A{}); + return kernel::ge(*this, other, A {}); } - template - batch_bool batch::operator<=(batch const& other) const - { - return kernel::le(*this, other, A{}); + template + inline batch_bool batch::operator<=(batch const& other) const noexcept + { + return kernel::le(*this, other, A {}); } - template - batch_bool batch::operator>(batch const& other) const - { - return kernel::gt(*this, other, A{}); + template + inline batch_bool batch::operator>(batch const& other) const noexcept + { + return kernel::gt(*this, other, A {}); } - template - batch_bool batch::operator<(batch const& other) const - { - return kernel::lt(*this, other, A{}); + template + inline batch_bool batch::operator<(batch const& other) const noexcept + { + return kernel::lt(*this, other, A {}); } - + /************************** * batch update operators * **************************/ - template - batch& batch::operator+=(batch const& other) + template + inline batch& batch::operator+=(batch const& other) noexcept { - return *this = kernel::add(*this, other, A{}); + return *this = kernel::add(*this, other, A {}); } - template - batch& batch::operator-=(batch const& other) + template + inline batch& batch::operator-=(batch const& other) noexcept { - return *this = kernel::sub(*this, other, A{}); + return *this = kernel::sub(*this, other, A {}); } - template - batch& batch::operator*=(batch const& other) + template + inline batch& batch::operator*=(batch const& other) noexcept { - return *this = kernel::mul(*this, other, A{}); + return *this = kernel::mul(*this, other, A {}); } - template - batch& batch::operator/=(batch const& other) + template + inline batch& batch::operator/=(batch const& other) noexcept { - return *this = kernel::div(*this, other, A{}); + return *this = kernel::div(*this, other, A {}); } - template - batch& batch::operator%=(batch const& other) + template + inline batch& batch::operator%=(batch const& other) noexcept { - return *this = 
kernel::mod(*this, other, A{}); + return *this = kernel::mod(*this, other, A {}); } - template - batch& batch::operator&=(batch const& other) + template + inline batch& batch::operator&=(batch const& other) noexcept { - return *this = kernel::bitwise_and(*this, other, A{}); + return *this = kernel::bitwise_and(*this, other, A {}); } - template - batch& batch::operator|=(batch const& other) + template + inline batch& batch::operator|=(batch const& other) noexcept { - return *this = kernel::bitwise_or(*this, other, A{}); + return *this = kernel::bitwise_or(*this, other, A {}); } - template - batch& batch::operator^=(batch const& other) + template + inline batch& batch::operator^=(batch const& other) noexcept { - return *this = kernel::bitwise_xor(*this, other, A{}); + return *this = kernel::bitwise_xor(*this, other, A {}); } - template - batch& batch::operator>>=(batch const& other) + template + inline batch& batch::operator>>=(batch const& other) noexcept { - return *this = kernel::bitwise_rshift(*this, other, A{}); + return *this = kernel::bitwise_rshift(*this, other, A {}); } - template - batch& batch::operator<<=(batch const& other) - { - return *this = kernel::bitwise_lshift(*this, other, A{}); + template + inline batch& batch::operator<<=(batch const& other) noexcept + { + return *this = kernel::bitwise_lshift(*this, other, A {}); } - template - batch& batch::operator>>=(int32_t other) - { - return *this = kernel::bitwise_rshift(*this, other, A{}); + template + inline batch& batch::operator>>=(int32_t other) noexcept + { + return *this = kernel::bitwise_rshift(*this, other, A {}); } - template - batch& batch::operator<<=(int32_t other) - { - return *this = kernel::bitwise_lshift(*this, other, A{}); + template + inline batch& batch::operator<<=(int32_t other) noexcept + { + return *this = kernel::bitwise_lshift(*this, other, A {}); } /***************************** * batch incr/decr operators * *****************************/ - template - batch& batch::operator++() - { + template + inline batch& batch::operator++() noexcept + { return operator+=(1); } - template - batch& batch::operator--() - { + template + inline batch& batch::operator--() noexcept + { return operator-=(1); } - - template - batch batch::operator++(int) + + template + inline batch batch::operator++(int) noexcept { batch copy(*this); operator+=(1); return copy; } - template - batch batch::operator--(int) - { + template + inline batch batch::operator--(int) noexcept + { batch copy(*this); operator-=(1); return copy; @@ -711,26 +668,26 @@ namespace xsimd * batch unary operators * *************************/ - template - batch_bool batch::operator!() const + template + inline batch_bool batch::operator!() const noexcept { - return kernel::eq(*this, batch(0), A{}); + return kernel::eq(*this, batch(0), A {}); } - template - batch batch::operator~() const + template + inline batch batch::operator~() const noexcept { - return kernel::bitwise_not(*this, A{}); + return kernel::bitwise_not(*this, A {}); } - template - batch batch::operator-() const - { - return kernel::neg(*this, A{}); + template + inline batch batch::operator-() const noexcept + { + return kernel::neg(*this, A {}); } - template - batch batch::operator+() const + template + inline batch batch::operator+() const noexcept { return *this; } @@ -739,14 +696,14 @@ namespace xsimd * batch private method * ************************/ - template - batch batch::logical_and(batch const& other) const + template + inline batch batch::logical_and(batch const& other) const noexcept 
{ return kernel::logical_and(*this, other, A()); } - template - batch batch::logical_or(batch const& other) const + template + inline batch batch::logical_or(batch const& other) const noexcept { return kernel::logical_or(*this, other, A()); } @@ -755,21 +712,21 @@ namespace xsimd * batch_bool constructors * ***************************/ - template - template - batch_bool::batch_bool(bool const*data, detail::index_sequence) - : batch_bool(kernel::set(batch_bool{}, A{}, data[Is]...)) + template + template + inline batch_bool::batch_bool(bool const* data, detail::index_sequence) noexcept + : batch_bool(kernel::set(batch_bool {}, A {}, data[Is]...)) { } - template - batch_bool::batch_bool(register_type reg) - : types::get_bool_simd_register_t({reg}) + template + inline batch_bool::batch_bool(register_type reg) noexcept + : types::get_bool_simd_register_t({ reg }) { } - template - batch_bool::batch_bool(std::initializer_list data) + template + inline batch_bool::batch_bool(std::initializer_list data) noexcept : batch_bool(data.begin(), detail::make_index_sequence()) { } @@ -778,36 +735,36 @@ namespace xsimd * batch_bool memory operators * *******************************/ - template - void batch_bool::store_aligned(bool* mem) const + template + inline void batch_bool::store_aligned(bool* mem) const noexcept { - kernel::store(*this, mem, A{}); + kernel::store(*this, mem, A {}); } - template - void batch_bool::store_unaligned(bool* mem) const + template + inline void batch_bool::store_unaligned(bool* mem) const noexcept { store_aligned(mem); } - template - batch_bool batch_bool::load_aligned(bool const* mem) + template + inline batch_bool batch_bool::load_aligned(bool const* mem) noexcept { batch_type ref(0); alignas(A::alignment()) T buffer[size]; - for(std::size_t i = 0; i < size; ++i) + for (std::size_t i = 0; i < size; ++i) buffer[i] = mem[i] ? 
1 : 0; return ref != batch_type::load_aligned(&buffer[0]); } - template - batch_bool batch_bool::load_unaligned(bool const* mem) + template + inline batch_bool batch_bool::load_unaligned(bool const* mem) noexcept { return load_aligned(mem); } - template - bool batch_bool::get(std::size_t i) const + template + inline bool batch_bool::get(std::size_t i) const noexcept { alignas(A::alignment()) bool buffer[size]; store_aligned(&buffer[0]); @@ -818,54 +775,54 @@ namespace xsimd * batch_bool comparison operators * ***********************************/ - template - batch_bool batch_bool::operator==(batch_bool const& other) const + template + inline batch_bool batch_bool::operator==(batch_bool const& other) const noexcept { - return kernel::eq(*this, other, A{}).data; + return kernel::eq(*this, other, A {}).data; } - template - batch_bool batch_bool::operator!=(batch_bool const& other) const + template + inline batch_bool batch_bool::operator!=(batch_bool const& other) const noexcept { - return kernel::neq(*this, other, A{}).data; + return kernel::neq(*this, other, A {}).data; } /******************************** * batch_bool logical operators * ********************************/ - template - batch_bool batch_bool::operator~() const + template + inline batch_bool batch_bool::operator~() const noexcept { - return kernel::bitwise_not(*this, A{}).data; + return kernel::bitwise_not(*this, A {}).data; } - template - batch_bool batch_bool::operator!() const - { + template + inline batch_bool batch_bool::operator!() const noexcept + { return operator==(batch_bool(false)); } - template - batch_bool batch_bool::operator&(batch_bool const& other) const + template + inline batch_bool batch_bool::operator&(batch_bool const& other) const noexcept { - return kernel::bitwise_and(*this, other, A{}).data; + return kernel::bitwise_and(*this, other, A {}).data; } - template - batch_bool batch_bool::operator|(batch_bool const& other) const + template + inline batch_bool batch_bool::operator|(batch_bool const& other) const noexcept { - return kernel::bitwise_or(*this, other, A{}).data; + return kernel::bitwise_or(*this, other, A {}).data; } - - template - batch_bool batch_bool::operator&&(batch_bool const& other) const + + template + inline batch_bool batch_bool::operator&&(batch_bool const& other) const noexcept { return operator&(other); } - - template - batch_bool batch_bool::operator||(batch_bool const& other) const + + template + inline batch_bool batch_bool::operator||(batch_bool const& other) const noexcept { return operator|(other); } @@ -874,24 +831,24 @@ namespace xsimd * batch_bool private methods * ******************************/ - template - batch_bool::batch_bool(bool val) - : base_type{make_register(detail::make_index_sequence(), val)} + template + inline batch_bool::batch_bool(bool val) noexcept + : base_type { make_register(detail::make_index_sequence(), val) } { } template template - auto batch_bool::make_register(detail::index_sequence, U u, V... v) -> register_type + inline auto batch_bool::make_register(detail::index_sequence, U u, V... v) noexcept -> register_type { return make_register(detail::index_sequence(), u, u, v...); } template template - auto batch_bool::make_register(detail::index_sequence<>, V... v) -> register_type + inline auto batch_bool::make_register(detail::index_sequence<>, V... 
v) noexcept -> register_type { - return kernel::set(batch_bool(), A{}, v...).data; + return kernel::set(batch_bool(), A {}, v...).data; } /******************************* @@ -899,38 +856,43 @@ namespace xsimd *******************************/ template - batch, A>::batch(value_type const& val) - : m_real(val.real()), m_imag(val.imag()) + inline batch, A>::batch(value_type const& val) noexcept + : m_real(val.real()) + , m_imag(val.imag()) { } - + template - batch, A>::batch(real_batch const& real, real_batch const& imag) - : m_real(real), m_imag(imag) + inline batch, A>::batch(real_batch const& real, real_batch const& imag) noexcept + : m_real(real) + , m_imag(imag) { } - + template - batch, A>::batch(real_batch const& real) - : m_real(real), m_imag(0) + inline batch, A>::batch(real_batch const& real) noexcept + : m_real(real) + , m_imag(0) { } - + template - batch, A>::batch(T val) - : m_real(val), m_imag(0) + inline batch, A>::batch(T val) noexcept + : m_real(val) + , m_imag(0) { } - + template - batch, A>::batch(std::initializer_list data) - { + inline batch, A>::batch(std::initializer_list data) noexcept + { *this = load_unaligned(data.begin()); } template - batch, A>::batch(batch_bool_type const& b) - : m_real(b), m_imag(0) + inline batch, A>::batch(batch_bool_type const& b) noexcept + : m_real(b) + , m_imag(0) { } @@ -939,96 +901,96 @@ namespace xsimd ***********************************/ template - batch, A> batch, A>::load_aligned(const T* real_src, const T* imag_src) + inline batch, A> batch, A>::load_aligned(const T* real_src, const T* imag_src) noexcept { - return {batch::load_aligned(real_src), imag_src ? batch::load_aligned(imag_src) : batch(0)}; + return { batch::load_aligned(real_src), imag_src ? batch::load_aligned(imag_src) : batch(0) }; } template - batch, A> batch, A>::load_unaligned(const T* real_src, const T* imag_src) + inline batch, A> batch, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept { - return {batch::load_unaligned(real_src), imag_src?batch::load_unaligned(imag_src):batch(0)}; + return { batch::load_unaligned(real_src), imag_src ? 
batch::load_unaligned(imag_src) : batch(0) }; } - template - batch, A> batch, A>::load_aligned(const value_type* src) + template + inline batch, A> batch, A>::load_aligned(const value_type* src) noexcept { - return kernel::load_complex_aligned(src, kernel::convert{}, A{}); + return kernel::load_complex_aligned(src, kernel::convert {}, A {}); } - template - batch, A> batch, A>::load_unaligned(const value_type* src) + template + inline batch, A> batch, A>::load_unaligned(const value_type* src) noexcept { - return kernel::load_complex_unaligned(src, kernel::convert{}, A{}); + return kernel::load_complex_unaligned(src, kernel::convert {}, A {}); } - template - void batch, A>::store_aligned(value_type* dst) const + template + inline void batch, A>::store_aligned(value_type* dst) const noexcept { - return kernel::store_complex_aligned(dst, *this, A{}); + return kernel::store_complex_aligned(dst, *this, A {}); } - template - void batch, A>::store_unaligned(value_type* dst) const + template + inline void batch, A>::store_unaligned(value_type* dst) const noexcept { - return kernel::store_complex_unaligned(dst, *this, A{}); + return kernel::store_complex_unaligned(dst, *this, A {}); } - template - void batch, A>::store_aligned(T* real_dst, T* imag_dst) const + template + inline void batch, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept { m_real.store_aligned(real_dst); m_imag.store_aligned(imag_dst); } - template - void batch, A>::store_unaligned(T* real_dst, T* imag_dst) const + template + inline void batch, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept { m_real.store_unaligned(real_dst); m_imag.store_unaligned(imag_dst); } - template - template - batch, A> batch, A>::load(U const* mem, aligned_mode) + template + template + inline batch, A> batch, A>::load(U const* mem, aligned_mode) noexcept { return load_aligned(mem); } - template - template - batch, A> batch, A>::load(U const* mem, unaligned_mode) - { + template + template + inline batch, A> batch, A>::load(U const* mem, unaligned_mode) noexcept + { return load_unaligned(mem); } - template - template - void batch, A>::store(U * mem, aligned_mode) const + template + template + inline void batch, A>::store(U* mem, aligned_mode) const noexcept { return store_aligned(mem); } - - template - template - void batch, A>::store(U * mem, unaligned_mode) const - { + + template + template + inline void batch, A>::store(U* mem, unaligned_mode) const noexcept + { return store_unaligned(mem); } - template - auto batch, A>::real() const -> real_batch + template + inline auto batch, A>::real() const noexcept -> real_batch { - return m_real; + return m_real; } - - template - auto batch, A>::imag() const -> real_batch + + template + inline auto batch, A>::imag() const noexcept -> real_batch { return m_imag; } - template - auto batch, A>::get(std::size_t i) const -> value_type + template + inline auto batch, A>::get(std::size_t i) const noexcept -> value_type { alignas(A::alignment()) value_type buffer[size]; store_aligned(&buffer[0]); @@ -1041,16 +1003,17 @@ namespace xsimd #ifdef XSIMD_ENABLE_XTL_COMPLEX - template - template - batch, A>::batch(xtl::xcomplex const& val) - : m_real(val.real()), m_imag(val.imag()) + template + template + inline batch, A>::batch(xtl::xcomplex const& val) noexcept + : m_real(val.real()) + , m_imag(val.imag()) { } - template - template - batch, A>::batch(std::initializer_list> data) + template + template + inline batch, A>::batch(std::initializer_list> data) noexcept { *this = load_unaligned(data.begin()); } 
@@ -1059,30 +1022,30 @@ namespace xsimd // stores values and not reference. Unfortunately, this breaks strict // aliasing... - template - template - batch, A> batch, A>::load_aligned(const xtl::xcomplex* src) + template + template + inline batch, A> batch, A>::load_aligned(const xtl::xcomplex* src) noexcept { return load_aligned(reinterpret_cast const*>(src)); } - template - template - batch, A> batch, A>::load_unaligned(const xtl::xcomplex* src) + template + template + inline batch, A> batch, A>::load_unaligned(const xtl::xcomplex* src) noexcept { return load_unaligned(reinterpret_cast const*>(src)); } - template - template - void batch, A>::store_aligned(xtl::xcomplex* dst) const + template + template + inline void batch, A>::store_aligned(xtl::xcomplex* dst) const noexcept { - store_aligned(reinterpret_cast *>(dst)); + store_aligned(reinterpret_cast*>(dst)); } - template - template - void batch, A>::store_unaligned(xtl::xcomplex* dst) const + template + template + inline void batch, A>::store_unaligned(xtl::xcomplex* dst) const noexcept { store_unaligned(reinterpret_cast*>(dst)); } @@ -1094,14 +1057,14 @@ namespace xsimd ***************************************/ template - batch_bool batch, A>::operator==(batch const& other) const + inline batch_bool batch, A>::operator==(batch const& other) const noexcept { return m_real == other.m_real && m_imag == other.m_imag; } - + template - batch_bool batch, A>::operator!=(batch const& other) const - { + inline batch_bool batch, A>::operator!=(batch const& other) const noexcept + { return m_real != other.m_real || m_imag != other.m_imag; } @@ -1110,15 +1073,15 @@ namespace xsimd ***********************************/ template - batch, A>& batch, A>::operator+=(batch const& other) + inline batch, A>& batch, A>::operator+=(batch const& other) noexcept { m_real += other.m_real; m_imag += other.m_imag; return *this; } - + template - batch, A>& batch, A>::operator-=(batch const& other) + inline batch, A>& batch, A>::operator-=(batch const& other) noexcept { m_real -= other.m_real; m_imag -= other.m_imag; @@ -1126,7 +1089,7 @@ namespace xsimd } template - batch, A>& batch, A>::operator*=(batch const& other) + inline batch, A>& batch, A>::operator*=(batch const& other) noexcept { real_batch new_real = real() * other.real() - imag() * other.imag(); real_batch new_imag = real() * other.imag() + imag() * other.real(); @@ -1135,16 +1098,16 @@ namespace xsimd return *this; } - template - batch, A>& batch, A>::operator/=(batch const& other) + template + inline batch, A>& batch, A>::operator/=(batch const& other) noexcept { real_batch a = real(); real_batch b = imag(); real_batch c = other.real(); real_batch d = other.imag(); - real_batch e = c*c + d*d; - m_real = (c*a + d*b) / e; - m_imag = (c*b - d*a) / e; + real_batch e = c * c + d * d; + m_real = (c * a + d * b) / e; + m_imag = (c * b - d * a) / e; return *this; } @@ -1152,28 +1115,28 @@ namespace xsimd * batch incr/decr operators * **************************************/ - template - batch, A>& batch, A>::operator++() - { + template + inline batch, A>& batch, A>::operator++() noexcept + { return operator+=(1); } - template - batch, A>& batch, A>::operator--() - { + template + inline batch, A>& batch, A>::operator--() noexcept + { return operator-=(1); } - template - batch, A> batch, A>::operator++(int) + template + inline batch, A> batch, A>::operator++(int) noexcept { batch copy(*this); operator+=(1); return copy; } - template - batch, A> batch, A>::operator--(int) + template + inline batch, A> batch, 
A>::operator--(int) noexcept { batch copy(*this); operator-=(1); @@ -1185,29 +1148,28 @@ namespace xsimd **********************************/ template - batch_bool batch, A>::operator!() const - { + inline batch_bool batch, A>::operator!() const noexcept + { return operator==(batch(0)); } - + template - batch, A> batch, A>::operator~() const - { - return {~m_real, ~m_imag}; + inline batch, A> batch, A>::operator~() const noexcept + { + return { ~m_real, ~m_imag }; } template - batch, A> batch, A>::operator-() const - { - return {-m_real, -m_imag}; + inline batch, A> batch, A>::operator-() const noexcept + { + return { -m_real, -m_imag }; } template - batch, A> batch, A>::operator+() const - { - return {+m_real, +m_imag}; + inline batch, A> batch, A>::operator+() const noexcept + { + return { +m_real, +m_imag }; } } #endif - diff --git a/third_party/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/types/xsimd_batch_constant.hpp index c37784fa9..b2052adb3 100644 --- a/third_party/xsimd/types/xsimd_batch_constant.hpp +++ b/third_party/xsimd/types/xsimd_batch_constant.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_BATCH_CONSTANT_HPP #define XSIMD_BATCH_CONSTANT_HPP @@ -25,22 +25,22 @@ namespace xsimd using value_type = bool; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); - operator batch_bool() const { return {Values...}; } + operator batch_bool() const noexcept { return { Values... }; } - bool get(size_t i) const + bool get(size_t i) const noexcept { - return std::array{{Values...}}[i]; + return std::array { { Values... } }[i]; } - static constexpr int mask() + static constexpr int mask() noexcept { return mask_helper(0, static_cast(Values)...); } - private: - static constexpr int mask_helper(int acc) { return acc; } + private: + static constexpr int mask_helper(int acc) noexcept { return acc; } template - static constexpr int mask_helper(int acc, int mask, Tys... masks) + static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept { return mask_helper(acc | mask, (masks << 1)...); } @@ -54,24 +54,24 @@ namespace xsimd using value_type = typename batch_type::value_type; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); - operator batch_type() const { return {Values...}; } + operator batch_type() const noexcept { return { Values... }; } - constexpr value_type get(size_t i) const + constexpr value_type get(size_t i) const noexcept { - return std::array{Values...}[i]; + return std::array { Values... 
}[i]; } }; namespace detail { template - constexpr auto make_batch_constant(detail::index_sequence) + inline constexpr auto make_batch_constant(detail::index_sequence) noexcept -> batch_constant { return {}; } template - constexpr auto make_batch_bool_constant(detail::index_sequence) + inline constexpr auto make_batch_bool_constant(detail::index_sequence) noexcept -> batch_bool_constant { return {}; @@ -80,14 +80,13 @@ namespace xsimd } // namespace detail template - constexpr auto make_batch_constant() -> decltype( - detail::make_batch_constant(detail::make_index_sequence())) + inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant(detail::make_index_sequence())) { return detail::make_batch_constant(detail::make_index_sequence()); } template - constexpr auto make_batch_bool_constant() + inline constexpr auto make_batch_bool_constant() noexcept -> decltype(detail::make_batch_bool_constant( detail::make_index_sequence())) { diff --git a/third_party/xsimd/types/xsimd_fma3_register.hpp b/third_party/xsimd/types/xsimd_fma3_register.hpp index 38e0d2a8d..493d52e9e 100644 --- a/third_party/xsimd/types/xsimd_fma3_register.hpp +++ b/third_party/xsimd/types/xsimd_fma3_register.hpp @@ -1,40 +1,42 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_FMA3_REGISTER_HPP #define XSIMD_FMA3_REGISTER_HPP #include "./xsimd_sse4_2_register.hpp" -namespace xsimd { - - /** - * @ingroup arch - * - * SSE4.2 + FMA instructions - */ - struct fma3 : sse4_2 { - static constexpr bool supported() { return XSIMD_WITH_FMA3; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(1, 5, 0); } - static constexpr char const* name() { return "sse4.2+fma"; } - }; +namespace xsimd +{ + + /** + * @ingroup arch + * + * SSE4.2 + FMA instructions + */ + struct fma3 : sse4_2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 5, 0); } + static constexpr char const* name() noexcept { return "sse4.2+fma"; } + }; #if XSIMD_WITH_FMA3 - namespace types { + namespace types + { - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3, sse4_2); + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3, sse4_2); - } + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_fma5_register.hpp b/third_party/xsimd/types/xsimd_fma5_register.hpp index 6c5cdb2eb..8cd0f3ded 100644 --- a/third_party/xsimd/types/xsimd_fma5_register.hpp +++ b/third_party/xsimd/types/xsimd_fma5_register.hpp @@ -1,41 +1,42 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_FMA5_REGISTER_HPP #define XSIMD_FMA5_REGISTER_HPP #include "./xsimd_avx2_register.hpp" -namespace xsimd { - - /** - * @ingroup arch - * - * AVX2 + FMA instructions - */ - struct fma5 : avx2 { - static constexpr bool supported() { return XSIMD_WITH_FMA5; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(2, 3, 0); } - static constexpr char const* name() { return "avx2+fma"; } - }; +namespace xsimd +{ + + /** + * @ingroup arch + * + * AVX2 + FMA instructions + */ + struct fma5 : avx2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA5; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 3, 0); } + static constexpr char const* name() noexcept { return "avx2+fma"; } + }; #if XSIMD_WITH_FMA5 - namespace types { + namespace types + { - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma5, avx2); + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma5, avx2); - } + } #endif } #endif - - diff --git a/third_party/xsimd/types/xsimd_generic_arch.hpp b/third_party/xsimd/types/xsimd_generic_arch.hpp index 3755f999b..e0349b1f5 100644 --- a/third_party/xsimd/types/xsimd_generic_arch.hpp +++ b/third_party/xsimd/types/xsimd_generic_arch.hpp @@ -1,17 +1,19 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_GENERIC_ARCH_HPP #define XSIMD_GENERIC_ARCH_HPP +#include "../config/xsimd_config.hpp" + /** * @defgroup arch * */ @@ -19,15 +21,14 @@ namespace xsimd { struct generic { - static constexpr bool supported() { return true; } - static constexpr bool available() { return true; } - static constexpr bool requires_alignment() { return false; } + static constexpr bool supported() noexcept { return true; } + static constexpr bool available() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 0; } + static constexpr bool requires_alignment() noexcept { return false; } protected: - - static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) { return major * 10000u + minor * 100u + patch; } + static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; } }; } #endif - diff --git a/third_party/xsimd/types/xsimd_neon64_register.hpp b/third_party/xsimd/types/xsimd_neon64_register.hpp index 68f75eb3b..64a159dc9 100644 --- a/third_party/xsimd/types/xsimd_neon64_register.hpp +++ b/third_party/xsimd/types/xsimd_neon64_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_NEON64_REGISTER_HPP #define XSIMD_NEON64_REGISTER_HPP @@ -23,12 +23,12 @@ namespace xsimd */ struct neon64 : neon { - static constexpr bool supported() { return XSIMD_WITH_NEON64; } - static constexpr bool available() { return true; } - static constexpr bool requires_alignment() { return true; } - static constexpr std::size_t alignment() { return 16; } - static constexpr unsigned version() { return generic::version(8, 1, 0); } - static constexpr char const* name() { return "arm64+neon"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); } + static constexpr char const* name() noexcept { return "arm64+neon"; } }; #if XSIMD_WITH_NEON64 @@ -50,5 +50,3 @@ namespace xsimd } #endif - - diff --git a/third_party/xsimd/types/xsimd_neon_register.hpp b/third_party/xsimd/types/xsimd_neon_register.hpp index 43a7db442..67a477d9b 100644 --- a/third_party/xsimd/types/xsimd_neon_register.hpp +++ b/third_party/xsimd/types/xsimd_neon_register.hpp @@ -1,19 +1,19 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_NEON_REGISTER_HPP #define XSIMD_NEON_REGISTER_HPP -#include "xsimd_register.hpp" #include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" #if XSIMD_WITH_NEON #include @@ -28,12 +28,12 @@ namespace xsimd */ struct neon : generic { - static constexpr bool supported() { return XSIMD_WITH_NEON; } - static constexpr bool available() { return true; } - static constexpr bool requires_alignment() { return true; } - static constexpr std::size_t alignment() { return 16; } - static constexpr unsigned version() { return generic::version(7, 0, 0); } - static constexpr char const* name() { return "arm32+neon"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); } + static constexpr char const* name() noexcept { return "arm32+neon"; } }; #if XSIMD_WITH_NEON @@ -73,21 +73,19 @@ namespace xsimd }; template - using signed_neon_vector_type = typename neon_vector_type_impl<8*sizeof(T)>::signed_type; + using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type; template - using unsigned_neon_vector_type = typename neon_vector_type_impl<8*sizeof(T)>::unsigned_type; + using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type; template using neon_vector_type = typename std::conditional::value, - signed_neon_vector_type, - unsigned_neon_vector_type - >::type; + signed_neon_vector_type, + unsigned_neon_vector_type>::type; using char_neon_vector_type = typename std::conditional::value, - signed_neon_vector_type, - unsigned_neon_vector_type - >::type; + signed_neon_vector_type, + unsigned_neon_vector_type>::type; } XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type); @@ -103,7 +101,9 @@ namespace xsimd XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t); - struct invalid_register {}; + struct invalid_register + { + }; XSIMD_DECLARE_SIMD_REGISTER(double, neon, invalid_register); namespace detail @@ -132,7 +132,7 @@ namespace xsimd template <> struct get_unsigned_type<8> { - using type = uint64_t; + using type = uint64_t; }; template @@ -150,11 +150,10 @@ namespace xsimd : detail::neon_bool_simd_register { }; - + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_register.hpp b/third_party/xsimd/types/xsimd_register.hpp index 5c9551fa4..f8dd99bb0 100644 --- a/third_party/xsimd/types/xsimd_register.hpp +++ b/third_party/xsimd/types/xsimd_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. 
* -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_REGISTER_HPP #define XSIMD_REGISTER_HPP @@ -19,8 +19,11 @@ namespace xsimd namespace types { - template - struct simd_register; + template + struct simd_register + { + static_assert(Arch::supported(), "usage of simd_register with unsupported architecture"); + }; template struct has_simd_register : std::false_type @@ -28,28 +31,33 @@ namespace xsimd }; #define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \ - template<> \ - struct simd_register\ - {\ - using register_type = VECTOR_TYPE;\ - register_type data;\ - operator register_type() const { return data; }\ - };\ - template <>\ - struct has_simd_register : std::true_type\ - {} + template <> \ + struct simd_register \ + { \ + using register_type = VECTOR_TYPE; \ + register_type data; \ + operator register_type() const noexcept { return data; } \ + }; \ + template <> \ + struct has_simd_register : std::true_type \ + { \ + } -#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)\ - template \ - struct simd_register : simd_register\ - {\ - using register_type = typename simd_register::register_type;\ - simd_register(register_type reg) : simd_register{reg} {}\ - simd_register() = default;\ - };\ - template\ - struct has_simd_register : has_simd_register\ - {} +#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE) \ + template \ + struct simd_register : simd_register \ + { \ + using register_type = typename simd_register::register_type; \ + simd_register(register_type reg) noexcept \ + : simd_register { reg } \ + { \ + } \ + simd_register() = default; \ + }; \ + template \ + struct has_simd_register : has_simd_register \ + { \ + } template struct get_bool_simd_register @@ -65,10 +73,12 @@ namespace xsimd { // TODO: rename this, as it might conflict with C++20 keyword. // We should use add_const and add_reference to build A const& - template + template using requires_arch = A const&; - template - struct convert {}; + template + struct convert + { + }; } } diff --git a/third_party/xsimd/types/xsimd_sse2_register.hpp b/third_party/xsimd/types/xsimd_sse2_register.hpp index 217541723..60aab8689 100644 --- a/third_party/xsimd/types/xsimd_sse2_register.hpp +++ b/third_party/xsimd/types/xsimd_sse2_register.hpp @@ -1,23 +1,23 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_SSE2_REGISTER_HPP #define XSIMD_SSE2_REGISTER_HPP -#include "./xsimd_register.hpp" #include "./xsimd_generic_arch.hpp" +#include "./xsimd_register.hpp" #if XSIMD_WITH_SSE2 -#include #include +#include #endif namespace xsimd @@ -29,12 +29,12 @@ namespace xsimd */ struct sse2 : generic { - static constexpr bool supported() { return XSIMD_WITH_SSE2; } - static constexpr bool available() { return true; } - static constexpr bool requires_alignment() { return true; } - static constexpr unsigned version() { return generic::version(1, 2, 0); } - static constexpr std::size_t alignment() { return 16; } - static constexpr char const* name() { return "sse2"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "sse2"; } }; #if XSIMD_WITH_SSE2 @@ -59,4 +59,3 @@ namespace xsimd } #endif - diff --git a/third_party/xsimd/types/xsimd_sse3_register.hpp b/third_party/xsimd/types/xsimd_sse3_register.hpp index 928ca8dd1..d8dec5ae4 100644 --- a/third_party/xsimd/types/xsimd_sse3_register.hpp +++ b/third_party/xsimd/types/xsimd_sse3_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_SSE3_REGISTER_HPP #define XSIMD_SSE3_REGISTER_HPP @@ -27,10 +27,10 @@ namespace xsimd */ struct sse3 : sse2 { - static constexpr bool supported() { return XSIMD_WITH_SSE3; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(1, 3, 0); } - static constexpr char const* name() { return "sse3"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); } + static constexpr char const* name() noexcept { return "sse3"; } }; #if XSIMD_WITH_SSE3 @@ -43,4 +43,3 @@ namespace xsimd } #endif - diff --git a/third_party/xsimd/types/xsimd_sse4_1_register.hpp b/third_party/xsimd/types/xsimd_sse4_1_register.hpp index 636b2a2cf..2cf0085b0 100644 --- a/third_party/xsimd/types/xsimd_sse4_1_register.hpp +++ b/third_party/xsimd/types/xsimd_sse4_1_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_SSE4_1_REGISTER_HPP #define XSIMD_SSE4_1_REGISTER_HPP @@ -27,10 +27,10 @@ namespace xsimd */ struct sse4_1 : ssse3 { - static constexpr bool supported() { return XSIMD_WITH_SSE4_1; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(1, 4, 1); } - static constexpr char const* name() { return "sse4.1"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); } + static constexpr char const* name() noexcept { return "sse4.1"; } }; #if XSIMD_WITH_SSE4_1 diff --git a/third_party/xsimd/types/xsimd_sse4_2_register.hpp b/third_party/xsimd/types/xsimd_sse4_2_register.hpp index f96275683..10c2fe23c 100644 --- a/third_party/xsimd/types/xsimd_sse4_2_register.hpp +++ b/third_party/xsimd/types/xsimd_sse4_2_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. 
* -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_SSE4_2_REGISTER_HPP #define XSIMD_SSE4_2_REGISTER_HPP @@ -27,10 +27,10 @@ namespace xsimd */ struct sse4_2 : sse4_1 { - static constexpr bool supported() { return XSIMD_WITH_SSE4_2; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(1, 4, 2); } - static constexpr char const* name() { return "sse4.2"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); } + static constexpr char const* name() noexcept { return "sse4.2"; } }; #if XSIMD_WITH_SSE4_2 @@ -42,4 +42,3 @@ namespace xsimd } #endif - diff --git a/third_party/xsimd/types/xsimd_ssse3_register.hpp b/third_party/xsimd/types/xsimd_ssse3_register.hpp index e6d8c2bb0..0f70633bb 100644 --- a/third_party/xsimd/types/xsimd_ssse3_register.hpp +++ b/third_party/xsimd/types/xsimd_ssse3_register.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ #ifndef XSIMD_SSSE3_REGISTER_HPP #define XSIMD_SSSE3_REGISTER_HPP @@ -27,19 +27,18 @@ namespace xsimd */ struct ssse3 : sse3 { - static constexpr bool supported() { return XSIMD_WITH_SSSE3; } - static constexpr bool available() { return true; } - static constexpr unsigned version() { return generic::version(1, 3, 1); } - static constexpr char const* name() { return "ssse3"; } + static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); } + static constexpr char const* name() noexcept { return "ssse3"; } }; #if XSIMD_WITH_SSSE3 - namespace types - { - XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3); - } + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3); + } #endif } #endif - diff --git a/third_party/xsimd/types/xsimd_traits.hpp b/third_party/xsimd/types/xsimd_traits.hpp index 7b208373f..891eebf2a 100644 --- a/third_party/xsimd/types/xsimd_traits.hpp +++ b/third_party/xsimd/types/xsimd_traits.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. * -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_TRAITS_HPP #define XSIMD_TRAITS_HPP @@ -113,21 +113,7 @@ namespace xsimd template struct simd_condition { - static constexpr bool value = - (std::is_same::value && !std::is_same::value) || - (std::is_same::value && !std::is_same::value) || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - detail::is_complex::value; + static constexpr bool value = (std::is_same::value && !std::is_same::value) || (std::is_same::value && !std::is_same::value) || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || detail::is_complex::value; }; template diff --git a/third_party/xsimd/types/xsimd_utils.hpp b/third_party/xsimd/types/xsimd_utils.hpp index 87f6afccb..7014f64ae 100644 --- a/third_party/xsimd/types/xsimd_utils.hpp +++ b/third_party/xsimd/types/xsimd_utils.hpp @@ -1,13 +1,13 @@ /*************************************************************************** -* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * -* Martin Renou * -* Copyright (c) QuantStack * -* Copyright (c) Serge Guelton * -* * -* Distributed under the terms of the BSD 3-Clause License. 
* -* * -* The full license is in the file LICENSE, distributed with this software. * -****************************************************************************/ + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ #ifndef XSIMD_UTILS_HPP #define XSIMD_UTILS_HPP @@ -177,18 +177,18 @@ namespace xsimd * bit_cast * ********************/ - template - To bit_cast(From val) { - static_assert(sizeof(From) == sizeof(To), "casting between compatible layout"); - // FIXME: Some old version of GCC don't support that trait - //static_assert(std::is_trivially_copyable::value, "input type is trivially copyable"); - //static_assert(std::is_trivially_copyable::value, "output type is trivially copyable"); - To res; - std::memcpy(&res, &val, sizeof(val)); - return res; + template + inline To bit_cast(From val) noexcept + { + static_assert(sizeof(From) == sizeof(To), "casting between compatible layout"); + // FIXME: Some old version of GCC don't support that trait + // static_assert(std::is_trivially_copyable::value, "input type is trivially copyable"); + // static_assert(std::is_trivially_copyable::value, "output type is trivially copyable"); + To res; + std::memcpy(&res, &val, sizeof(val)); + return res; } - /***************************************** * Backport of index_sequence from c++14 * *****************************************/ @@ -197,68 +197,78 @@ namespace xsimd namespace detail { template - struct identity { using type = T; }; - - #ifdef __cpp_lib_integer_sequence - using std::integer_sequence; - using std::make_integer_sequence; - using std::index_sequence; - using std::make_index_sequence; - - using std::index_sequence_for; - #else - template - struct integer_sequence { - using value_type = T; - static constexpr std::size_t size() noexcept { return sizeof...(Is); } - }; + struct identity + { + using type = T; + }; - template - struct make_integer_sequence_concat; +#ifdef __cpp_lib_integer_sequence + using std::index_sequence; + using std::integer_sequence; + using std::make_index_sequence; + using std::make_integer_sequence; - template - struct make_integer_sequence_concat, - integer_sequence> - : identity> {}; + using std::index_sequence_for; +#else + template + struct integer_sequence + { + using value_type = T; + static constexpr std::size_t size() noexcept { return sizeof...(Is); } + }; - template - struct make_integer_sequence_impl; + template + struct make_integer_sequence_concat; - template - struct make_integer_sequence_impl> : identity> {}; + template + struct make_integer_sequence_concat, + integer_sequence> + : identity> + { + }; - template - struct make_integer_sequence_impl> : identity> {}; + template + struct make_integer_sequence_impl; - template - struct make_integer_sequence_impl> - : make_integer_sequence_concat>::type, - typename make_integer_sequence_impl>::type> {}; + template + struct make_integer_sequence_impl> : identity> + { + }; + template + struct make_integer_sequence_impl> : identity> + { + }; - template - using make_integer_sequence = typename make_integer_sequence_impl>::type; + template + struct make_integer_sequence_impl> + : make_integer_sequence_concat>::type, + typename make_integer_sequence_impl>::type> + { + }; + template + 
using make_integer_sequence = typename make_integer_sequence_impl>::type;

-        template
-        using index_sequence = integer_sequence;
+        template
+        using index_sequence = integer_sequence;

-        template
-        using make_index_sequence = make_integer_sequence;
+        template
+        using make_index_sequence = make_integer_sequence;

-        template
-        using index_sequence_for = make_index_sequence;
+        template
+        using index_sequence_for = make_index_sequence;

-        #endif
+#endif

-        template
-        using int_sequence = integer_sequence;
+        template
+        using int_sequence = integer_sequence;

-        template
-        using make_int_sequence = make_integer_sequence;
+        template
+        using make_int_sequence = make_integer_sequence;

-        template
-        using int_sequence_for = make_int_sequence;
+        template
+        using int_sequence_for = make_int_sequence;
     }

@@ -269,20 +279,20 @@ namespace xsimd
     namespace detail
     {
         template
-        const T& get_impl(const std::tuple& t, std::is_same, index_sequence)
+        inline const T& get_impl(const std::tuple& t, std::is_same, index_sequence) noexcept
         {
             return std::get(t);
         }

         template
-        const T& get_impl(const std::tuple& t, std::is_same, index_sequence)
+        inline const T& get_impl(const std::tuple& t, std::is_same, index_sequence) noexcept
         {
-            using tuple_elem = typename std::tuple_element>::type;
+            using tuple_elem = typename std::tuple_element>::type;
             return get_impl(t, std::is_same(), index_sequence());
         }

         template
-        const T& get(const std::tuple& t)
+        inline const T& get(const std::tuple& t) noexcept
         {
             using tuple_elem = typename std::tuple_element<0, std::tuple>::type;
             return get_impl(t, std::is_same(), make_index_sequence());
@@ -329,32 +339,32 @@ namespace xsimd
     {
         // std::array constructor from scalar value ("broadcast")
         template
-        constexpr std::array
-        array_from_scalar_impl(const T& scalar, index_sequence)
+        inline constexpr std::array
+        array_from_scalar_impl(const T& scalar, index_sequence) noexcept
         {
             // You can safely ignore this silly ternary, the "scalar" is all
             // that matters. The rest is just a dirty workaround...
-            return std::array{ (Is+1) ? scalar : T() ... };
+            return std::array { (Is + 1) ? scalar : T()... };
         }

         template
-        constexpr std::array
-        array_from_scalar(const T& scalar)
+        inline constexpr std::array
+        array_from_scalar(const T& scalar) noexcept
         {
             return array_from_scalar_impl(scalar, make_index_sequence());
         }

         // std::array constructor from C-style pointer (handled as an array)
         template
-        constexpr std::array
-        array_from_pointer_impl(const T* c_array, index_sequence)
+        inline constexpr std::array
+        array_from_pointer_impl(const T* c_array, index_sequence) noexcept
         {
-            return std::array{ c_array[Is]... };
+            return std::array { c_array[Is]... };
         }

         template
-        constexpr std::array
-        array_from_pointer(const T* c_array)
+        inline constexpr std::array
+        array_from_pointer(const T* c_array) noexcept
         {
             return array_from_pointer_impl(c_array, make_index_sequence());
         }
@@ -366,20 +376,19 @@ namespace xsimd
     namespace detail
     {
-        template struct bool_pack;
+        template
+        struct bool_pack;

         template
         using all_true = std::is_same<
-            bool_pack, bool_pack
-        >;
+            bool_pack, bool_pack>;

         template
         using is_all_convertible = all_true::value...>;

         template
         using is_array_initializer = std::enable_if<
-            (sizeof...(Args) == N) && is_all_convertible::value
-        >;
+            (sizeof...(Args) == N) && is_all_convertible::value>;

         // Check that a variadic argument pack is a list of N values of type T,
         // as usable for instantiating a value of type std::array.
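Editor's note (not part of the patch): the xsimd_utils.hpp hunks above only reformat two long-standing utility patterns, a memcpy-based bit_cast and index_sequence-driven std::array construction. The following standalone sketch shows both patterns in isolation; the names bit_cast_sketch and broadcast_array are hypothetical and chosen for this illustration only.

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <utility>

// Well-defined type punning: copy the object representation instead of
// dereferencing a reinterpret_cast'ed pointer.
template <class To, class From>
To bit_cast_sketch(From val) noexcept
{
    static_assert(sizeof(From) == sizeof(To), "types must have the same size");
    To res;
    std::memcpy(&res, &val, sizeof(val));
    return res;
}

// Expand an index pack to repeat "scalar" once per element of the array.
template <class T, std::size_t... Is>
constexpr std::array<T, sizeof...(Is)> broadcast_array_impl(const T& scalar, std::index_sequence<Is...>)
{
    // (static_cast<void>(Is), scalar) evaluates to scalar while still using Is,
    // so the pack expansion produces sizeof...(Is) copies of the value.
    return { { (static_cast<void>(Is), scalar)... } };
}

template <std::size_t N, class T>
constexpr std::array<T, N> broadcast_array(const T& scalar)
{
    return broadcast_array_impl(scalar, std::make_index_sequence<N>{});
}

int main()
{
    // Reinterpret the bits of a float as an unsigned integer.
    std::uint32_t bits = bit_cast_sketch<std::uint32_t>(1.0f);
    std::cout << std::hex << bits << '\n'; // 3f800000

    // Fill a 4-element array with the same value, usable in constant expressions.
    constexpr auto arr = broadcast_array<4>(42);
    std::cout << std::dec << arr[0] << ' ' << arr[3] << '\n'; // 42 42
}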
@@ -416,7 +425,46 @@ namespace xsimd
         };
 #endif
     }
+
+    /*******************
+     * real_batch_type *
+     *******************/
+
+    template
+    struct real_batch_type
+    {
+        using type = B;
+    };
+
+    template
+    struct real_batch_type, A>>
+    {
+        using type = batch;
+    };
+
+    template
+    using real_batch_type_t = typename real_batch_type::type;
+
+    /**********************
+     * complex_batch_type *
+     **********************/
+
+    template
+    struct complex_batch_type
+    {
+        using real_value_type = typename B::value_type;
+        using arch_type = typename B::arch_type;
+        using type = batch, arch_type>;
+    };
+
+    template
+    struct complex_batch_type, A>>
+    {
+        using type = batch, A>;
+    };
+
+    template
+    using complex_batch_type_t = typename complex_batch_type::type;
 }

 #endif
-
diff --git a/third_party/xsimd/xsimd.hpp b/third_party/xsimd/xsimd.hpp
index 12ff609ad..79bdd03d5 100644
--- a/third_party/xsimd/xsimd.hpp
+++ b/third_party/xsimd/xsimd.hpp
@@ -1,13 +1,13 @@
 /***************************************************************************
-* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
-* Martin Renou *
-* Copyright (c) QuantStack *
-* Copyright (c) Serge Guelton *
-* *
-* Distributed under the terms of the BSD 3-Clause License. *
-* *
-* The full license is in the file LICENSE, distributed with this software. *
-****************************************************************************/
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/

 #ifndef XSIMD_HPP
 #define XSIMD_HPP

@@ -18,11 +18,19 @@
 #define XSIMD_NO_DISCARD
 #endif

+#include "config/xsimd_config.hpp"

-#include "types/xsimd_batch.hpp"
-#include "types/xsimd_batch_constant.hpp"
-#include "types/xsimd_api.hpp"
 #include "arch/xsimd_scalar.hpp"
 #include "memory/xsimd_aligned_allocator.hpp"
+
+#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
+// No type definitions or anything apart from the scalar definitions and the aligned allocator
+#else
+#include "types/xsimd_batch.hpp"
+#include "types/xsimd_batch_constant.hpp"
 #include "types/xsimd_traits.hpp"
+
+// This include must come last
+#include "types/xsimd_api.hpp"
+#endif
 #endif
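Editor's note (not part of the patch): the new real_batch_type / complex_batch_type helpers added above map a batch of std::complex<T> to the matching real batch and back, which is what lets generic complex kernels pick the right real batch for their real and imaginary parts. The sketch below reproduces that trait pattern on a toy batch type; it is only an illustration under stated assumptions, not xsimd's actual definitions (whose template parameters were lost in this extract).

#include <complex>
#include <type_traits>

struct toy_arch {};

template <class T, class A = toy_arch>
struct toy_batch { using value_type = T; using arch_type = A; };

// real_batch_type: identity for real batches...
template <class B>
struct real_batch_type { using type = B; };

// ...and strip std::complex for complex batches.
template <class T, class A>
struct real_batch_type<toy_batch<std::complex<T>, A>> { using type = toy_batch<T, A>; };

template <class B>
using real_batch_type_t = typename real_batch_type<B>::type;

// complex_batch_type: wrap the value type in std::complex...
template <class B>
struct complex_batch_type
{
    using real_value_type = typename B::value_type;
    using arch_type = typename B::arch_type;
    using type = toy_batch<std::complex<real_value_type>, arch_type>;
};

// ...and leave already-complex batches untouched.
template <class T, class A>
struct complex_batch_type<toy_batch<std::complex<T>, A>> { using type = toy_batch<std::complex<T>, A>; };

template <class B>
using complex_batch_type_t = typename complex_batch_type<B>::type;

static_assert(std::is_same<real_batch_type_t<toy_batch<std::complex<float>>>, toy_batch<float>>::value, "complex -> real");
static_assert(std::is_same<complex_batch_type_t<toy_batch<double>>, toy_batch<std::complex<double>>>::value, "real -> complex");

int main() {}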