diff --git a/doc/fmtconv.html b/doc/fmtconv.html
index f9c4fb8..1be2df5 100644
--- a/doc/fmtconv.html
+++ b/doc/fmtconv.html
@@ -15,7 +15,7 @@ Abstract
Authors: | | Firesledge (aka Cretindesalpes) |
-Version: | | r29 |
+Version: | | r30 |
Download: | | http://ldesoras.free.fr/prod.html |
Category: | | Format tools |
Requirements: | | Vapoursynth r55 or Avisynth+ 3.7.0 |
@@ -129,7 +129,7 @@ Resizing and chroma subsampling conversions
Bobbing an interlaced stream (here, Top Field First):
c = c.std.SeparateFields (tff=True)
-c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=True, interlacedd=False)
+c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=1, interlacedd=0)
Converting a progressive stream from YUV 4:2:2 to 4:2:0 and back to 8 bits:
@@ -140,7 +140,7 @@ Resizing and chroma subsampling conversions
tff = True
c = c.std.SeparateFields (tff=tff)
-c = c.fmtc.resample (css="420", interlaced=True)
+c = c.fmtc.resample (css="420", interlaced=1)
c = c.fmtc.bitdepth (bits=8)
c = c.std.DoubleWeave (tff=tff)
c = c.std.SelectEvery (cycle=2, offsets=0)
@@ -623,7 +623,7 @@ matrix
primaries to perform the intermediary conversion.
The _ColorRange
frame property is set if the fulld
-parameter has been explicitely defined.
+parameter has been explicitly defined or if a preset is used.
If the destination colorspace is a standardized one (as deduced from the
specified matrix), the _Matrix
and _ColorSpace
properties are set, otherwise they are deleted from the frame.
@@ -959,6 +959,7 @@ primaries
wd : float[]: opt;
prims : data : opt;
primd : data : opt;
+ wconv : int : opt; (False)
cpuopt: int : opt; (-1)
)
fmtc_primaries (
@@ -973,6 +974,7 @@ primaries
arrayf wd (undefined),
string prims (undefined),
string primd (undefined),
+ bool wconv (False),
int cpuopt (-1)
) |
@@ -1070,6 +1072,21 @@ Parameters
"redwide" | R G B W (D65) | 0.780308, 0.121595, 0.095612, 0.3217, | 0.304253 1.493994 −0.084589 0.3290 | REDWideGamutRGB |
+wconv
+Indicates we want a full conversion for the white point.
+If set to False, chromatic adaptation will be used, so the
+white will stay white on the destination illuminant and colors will be adapted
+to implement a real illuminant change.
+This is generally what you want when converting between gamuts: the eye adapts
+to the new white and colors should be matched accordingly.
+If set to True, the chromatic adaptation is bypassed.
+The white from the source colorspace will appear with a tint if the target
+colorspace has a different white point.
+Use this if you want to emulate what a picture displayed with a monitor using
+the source illuminant looks like on a display using the target illuminant.
+This is also what you want when converting to and from XYZ for further
+operations in this colorspace.
+
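A usage sketch (illustration only, not part of the patch; the preset names are assumptions, wconv is the parameter described above):
# Default: chromatic adaptation, white stays white on the destination illuminant
c = c.fmtc.primaries (prims="p3dci", primd="709")
# Full conversion: the DCI white point keeps its tint on the D65 destination
c = c.fmtc.primaries (prims="p3dci", primd="709", wconv=1)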
cpuopt
Limits the CPU instruction set.
−1: automatic (no limitation),
@@ -1275,7 +1292,7 @@
Parameters
Clip to be resized. Mandatory.
Supported input formats:
-- 8-, 9-, 10-, 12-, 16- and 16-bit integer.
+- 8-, 9-, 10-, 12-, 14- and 16-bit integer.
- 32-bit floating point.
- Any planar colorspace.
@@ -1872,9 +1889,15 @@ Parameters
Indicate the peak white levels in cd/m2.
lws is for the source transfer function, and lwd
for the destination one.
-These parameters are taken into account when display-referred transfer
-functions are used.
-Minimum lw value is 0.1 cd/m2.
+These parameters are taken into account to scale the luminance when the
+following conditions are met:
+
+- display-referred transfer functions are used,
+
+- match is set to 2 (display luminance matching) and
+- the EOTF shouldn’t specify any scale for the luminance (the cd/m² value
+for F’ = 1.0).
+
+Minimum lw value is 0.1 cd/m2.
System gamma may be changed according to the lw parameter.
Unless specified, HDR functions use a peak white of 1000 cd/m2.
Similarly, SDR and other functions use 100 cd/m2 by default.
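For example (a sketch, not from the patch; the transfer preset names are assumptions, only lws, lwd and match come from the description above):
# BT.1886 content graded on a 100 cd/m2 display, rescaled for a 250 cd/m2 sRGB display
c = c.fmtc.transfer (transs="1886", transd="srgb", lws=100, lwd=250, match=2)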
@@ -1973,11 +1996,19 @@
IV) Troubleshooting
V) Changelog
-r30, 2022-xx-xx
+r31, 202x-xx-xx
+
+resample: fixed 14 to 16 bit AVX2 conversion path, thanks to NSQY for the report.
+
+
+r30, 2022-08-29
+matrix: The _ColorRange frame property is now set when a matrix preset is used.
transfer: Added ACEScct transfer function.
primaries: Added DCI P3+ and Cinema Gamut presets.
-- Changed the configure options to compile with Clang.
+primaries: Added wconv parameter for full conversion.
+- Changed the configure options to compile with Clang.
+- Updated datatypes in the examples.
r29, 2022-04-11
diff --git a/src/avisynth.h b/src/avisynth.h
index 679ac2a..b779c49 100644
--- a/src/avisynth.h
+++ b/src/avisynth.h
@@ -442,17 +442,17 @@ extern const AVS_Linkage* AVS_linkage;
# endif
# define AVS_BakedCode(arg) { arg ; }
-# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg))
-# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg))
-# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg))
+# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg))
+# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg))
+# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg))
// Helper macros for fallback option when a function does not exists
#define CALL_MEMBER_FN(object,ptrToMember) ((object)->*(ptrToMember))
#define AVS_LinkCallOpt(arg, argOpt) !AVS_linkage ? 0 : \
- ( offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? \
- (offsetof(AVS_Linkage, argOpt) >= AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \
+ ( offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? \
+ (offsetof(AVS_Linkage, argOpt) >= (size_t)AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \
CALL_MEMBER_FN(this, AVS_linkage->arg)() )
// AVS_LinkCallOptDefault puts automatically () only after arg
-# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))())
+# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))())
#endif
@@ -1299,7 +1299,7 @@ class GenericVideoFilter : public IClip {
void __stdcall GetAudio(void* buf, int64_t start, int64_t count, IScriptEnvironment* env) { child->GetAudio(buf, start, count, env); }
const VideoInfo& __stdcall GetVideoInfo() { return vi; }
bool __stdcall GetParity(int n) { return child->GetParity(n); }
- int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; }; // We do not pass cache requests upwards, only to the next filter.
+ int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; } // We do not pass cache requests upwards, only to the next filter.
};
@@ -1864,7 +1864,7 @@ struct PNeoEnv {
#if defined(BUILDING_AVSCORE) || defined(AVS_STATIC_LIB)
;
#else
- : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= AVS_linkage->Size ? 0 : AVS_linkage->GetNeoEnv(env)) { }
+ : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= (size_t)AVS_linkage->Size ? 0 : AVS_linkage->GetNeoEnv(env)) { }
#endif
int operator!() const { return !p; }
diff --git a/src/fmtc/Matrix_vs.cpp b/src/fmtc/Matrix_vs.cpp
index 07c8c8c..2c08360 100644
--- a/src/fmtc/Matrix_vs.cpp
+++ b/src/fmtc/Matrix_vs.cpp
@@ -146,6 +146,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
const int nbr_expected_coef = _nbr_planes * (_nbr_planes + 1);
bool mat_init_flag = false;
+ bool preset_flag = false;
// Matrix presets
std::string mat (get_arg_str (in, out, "mat", ""));
@@ -182,6 +183,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
_mat_main = m2d * m2s;
mat_init_flag = true;
+ preset_flag = true;
}
// Custom coefficients
@@ -309,6 +311,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
vsutl::is_full_range_default (fmt_dst) ? 1 : 0,
0, &_range_set_dst_flag
) != 0);
+ _range_set_dst_flag |= preset_flag;
prepare_matrix_coef (
*this, *_proc_uptr, _mat_main,
diff --git a/src/fmtc/Primaries_vs.cpp b/src/fmtc/Primaries_vs.cpp
index ced609e..e5e2089 100644
--- a/src/fmtc/Primaries_vs.cpp
+++ b/src/fmtc/Primaries_vs.cpp
@@ -105,8 +105,10 @@ Primaries::Primaries (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VS
init (_prim_d, *this, in, out, "rd", "gd", "bd", "wd");
assert (_prim_d.is_ready ());
+ const auto conv_flag = (get_arg_int (in, out, "wconv", 0) != 0);
+
const fmtcl::Mat3 mat_conv =
- fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d);
+ fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag);
_mat_main.insert3 (mat_conv);
_mat_main.clean3 (1);
diff --git a/src/fmtc/Resample_vs.cpp b/src/fmtc/Resample_vs.cpp
index 3d7e649..971343f 100644
--- a/src/fmtc/Resample_vs.cpp
+++ b/src/fmtc/Resample_vs.cpp
@@ -628,7 +628,7 @@ const ::VSFrame * Resample::get_frame (int n, int activation_reason, void * &fra
if (ret_val != 0)
{
_vsapi.freeFrame (dst_ptr);
- dst_ptr = 0;
+ dst_ptr = nullptr;
}
}
@@ -680,7 +680,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr
{
const Ru::FrameInfo & frame_info =
 *reinterpret_cast <const Ru::FrameInfo *> (frame_data_ptr);
- process_plane_proc (
+ ret_val = process_plane_proc (
dst, n, plane_index, frame_ctx, src_node1_sptr, frame_info
);
}
@@ -688,7 +688,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr
// Copy (and convert)
else if (proc_mode == vsutl::PlaneProcMode_COPY1)
{
- process_plane_copy (
+ ret_val = process_plane_copy (
dst, n, plane_index, frame_ctx, src_node1_sptr
);
}
diff --git a/src/fmtc/version.h b/src/fmtc/version.h
index 4a4d319..a209cf7 100644
--- a/src/fmtc/version.h
+++ b/src/fmtc/version.h
@@ -1,5 +1,5 @@
#pragma once
-#define fmtc_VERSION 29
+#define fmtc_VERSION 30
#define fmtc_PLUGIN_NAME "fmtconv"
#define fmtc_NAMESPACE "fmtc"
diff --git a/src/fmtcavs/Matrix_avs.cpp b/src/fmtcavs/Matrix_avs.cpp
index db75fcb..b673a79 100644
--- a/src/fmtcavs/Matrix_avs.cpp
+++ b/src/fmtcavs/Matrix_avs.cpp
@@ -73,7 +73,6 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
{
env.ThrowError (fmtcavs_MATRIX ": input must be 4:4:4.");
}
- const int nbr_planes_src = _vi_src.NumComponents ();
if (fmt_src.get_nbr_comp_non_alpha () != _nbr_planes_proc)
{
env.ThrowError (
@@ -129,6 +128,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
const int nbr_expected_coef = _nbr_planes_proc * (_nbr_planes_proc + 1);
bool mat_init_flag = false;
+ bool preset_flag = false;
fmtcl::Mat4 mat_main; // Main matrix, float input, float output
// Matrix presets
@@ -169,6 +169,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
mat_main = m2d * m2s;
mat_init_flag = true;
+ preset_flag = true;
}
// Alpha plane processing, if any
@@ -267,7 +268,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
_fulld_flag = args [Param_FULLD].AsBool (
fmtcl::is_full_range_default (fmt_dst.get_col_fam ())
);
- _range_def_flag = args [Param_FULLD].Defined ();
+ _range_def_flag = (args [Param_FULLD].Defined () || preset_flag);
prepare_matrix_coef (
env, *_proc_uptr, mat_main,
diff --git a/src/fmtcavs/Primaries.h b/src/fmtcavs/Primaries.h
index e1d067b..e3d248e 100644
--- a/src/fmtcavs/Primaries.h
+++ b/src/fmtcavs/Primaries.h
@@ -62,6 +62,7 @@ class Primaries
Param_WD,
Param_PRIMS,
Param_PRIMD,
+ Param_WCONV,
Param_CPUOPT,
Param_NBR_ELT
diff --git a/src/fmtcavs/Primaries_avs.cpp b/src/fmtcavs/Primaries_avs.cpp
index 85ae49f..989148a 100644
--- a/src/fmtcavs/Primaries_avs.cpp
+++ b/src/fmtcavs/Primaries_avs.cpp
@@ -110,8 +110,10 @@ Primaries::Primaries (::IScriptEnvironment &env, const ::AVSValue &args)
init (_prim_d, env, args, Param_RD, Param_GD, Param_BD, Param_WD);
assert (_prim_d.is_ready ());
+ const auto conv_flag = args [Param_WCONV].AsBool (false);
+
const fmtcl::Mat3 mat_conv =
- fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d);
+ fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag);
_mat_main.insert3 (mat_conv);
_mat_main.clean3 (1);
diff --git a/src/fmtcl/BitBltConv_avx2.cpp b/src/fmtcl/BitBltConv_avx2.cpp
index 3dd2e7c..cad5821 100644
--- a/src/fmtcl/BitBltConv_avx2.cpp
+++ b/src/fmtcl/BitBltConv_avx2.cpp
@@ -75,11 +75,13 @@ void BitBltConv::bitblt_int_to_flt_avx2_switch (uint8_t *dst_ptr, ptrdiff_t dst_
switch ((scale_flag << 16) + (src_fmt << 8) + src_res)
{
fmtcl_BitBltConv_CASE (false, INT16 , 16, i16)
+ fmtcl_BitBltConv_CASE (false, INT16 , 14, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 12, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 10, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 9, i16)
fmtcl_BitBltConv_CASE (false, INT8 , 8, i08)
fmtcl_BitBltConv_CASE (true , INT16 , 16, i16)
+ fmtcl_BitBltConv_CASE (true , INT16 , 14, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 12, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 10, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 9, i16)
@@ -154,10 +156,15 @@ void BitBltConv::bitblt_int_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_r
switch ((dst_fmt << 20) + (src_fmt << 16) + (dst_res << 8) + src_res)
{
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 14, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 12, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 10, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 9, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT8 , 16, 8, i16, i08)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 12, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 10, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 9, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT8 , 14, 8, i16, i08)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 10, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 9, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT8 , 12, 8, i16, i08)
diff --git a/src/fmtcl/KernelData.cpp b/src/fmtcl/KernelData.cpp
index be4fc13..bf6a83e 100644
--- a/src/fmtcl/KernelData.cpp
+++ b/src/fmtcl/KernelData.cpp
@@ -46,6 +46,7 @@ To Public License, Version 2, as published by Sam Hocevar. See
#include "fstb/def.h"
#include "fstb/fnc.h"
+#include <algorithm>
#include
#include
@@ -296,6 +297,7 @@ void KernelData::invert_kernel (int taps)
assert (ovr_f * support >= taps);
int len = fstb::ceil_int (ovr_s * ovr_f * support) * 2;
len = 1 << (fstb::get_prev_pow_2 (len - 1) + 1); // Next power of 2
+ len = std::max (len, 1); // Shouldn't happen but GCC emits a warning later
const int h_len = len / 2;
std::vector x (len);
diff --git a/src/fmtcl/PrimUtil.cpp b/src/fmtcl/PrimUtil.cpp
index 88ab707..30c6b0a 100644
--- a/src/fmtcl/PrimUtil.cpp
+++ b/src/fmtcl/PrimUtil.cpp
@@ -45,13 +45,20 @@ constexpr int PrimUtil::_nbr_planes;
-Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d)
+// conv_flag indicates we want a full conversion, not a chromatic adaptation
+Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag)
{
assert (prim_s.is_ready ());
assert (prim_d.is_ready ());
const Mat3 rgb2xyz = compute_rgb2xyz (prim_s);
const Mat3 xyz2rgb = compute_rgb2xyz (prim_d).invert ();
+
+ if (conv_flag)
+ {
+ return xyz2rgb * rgb2xyz;
+ }
+
const Mat3 adapt = compute_chroma_adapt (prim_s, prim_d);
return xyz2rgb * adapt * rgb2xyz;
diff --git a/src/fmtcl/PrimUtil.h b/src/fmtcl/PrimUtil.h
index 72cf11d..2cbf773 100644
--- a/src/fmtcl/PrimUtil.h
+++ b/src/fmtcl/PrimUtil.h
@@ -44,7 +44,7 @@ class PrimUtil
static constexpr int _nbr_planes = RgbSystem::_nbr_planes;
- static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d);
+ static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag);
static Mat3 compute_rgb2xyz (const RgbSystem &prim);
static Mat3 compute_chroma_adapt (const RgbSystem &prim_s, const RgbSystem &prim_d);
static Vec3 conv_xy_to_xyz (const RgbSystem::Vec2 &xy);
diff --git a/src/fstb/Hash.h b/src/fstb/Hash.h
index b18d527..ed281d8 100644
--- a/src/fstb/Hash.h
+++ b/src/fstb/Hash.h
@@ -45,6 +45,7 @@ To Public License, Version 2, as published by Sam Hocevar. See
/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
#include "fstb/def.h"
+#include "fstb/Vu32.h"
#include
@@ -64,8 +65,12 @@ class Hash
static fstb_FORCEINLINE constexpr uint32_t
hash (uint32_t x) noexcept;
+ static fstb_FORCEINLINE Vu32
+ hash (Vu32 x) noexcept;
static fstb_FORCEINLINE constexpr uint32_t
hash_inv (uint32_t x) noexcept;
+ static fstb_FORCEINLINE Vu32
+ hash_inv (Vu32 x) noexcept;
static fstb_FORCEINLINE constexpr uint64_t
hash (uint64_t x) noexcept;
diff --git a/src/fstb/Hash.hpp b/src/fstb/Hash.hpp
index 3a4a3e7..c6644fc 100644
--- a/src/fstb/Hash.hpp
+++ b/src/fstb/Hash.hpp
@@ -122,6 +122,19 @@ constexpr uint32_t Hash::hash (uint32_t x) noexcept
+Vu32 Hash::hash (Vu32 x) noexcept
+{
+ x ^= x >> 16;
+ x *= uint32_t (0x7FEB352Dlu);
+ x ^= x >> 15;
+ x *= uint32_t (0x846CA68Blu);
+ x ^= x >> 16;
+
+ return x;
+}
+
+
+
constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept
{
#if 0
@@ -143,6 +156,19 @@ constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept
+Vu32 Hash::hash_inv (Vu32 x) noexcept
+{
+ x ^= x >> 16;
+ x *= uint32_t (0x43021123lu);
+ x ^= x >> 15 ^ x >> 30;
+ x *= uint32_t (0x1D69E2A5lu);
+ x ^= x >> 16;
+
+ return x;
+}
+
+
+
// SplittableRandom / SplitMix64
constexpr uint64_t Hash::hash (uint64_t x) noexcept
{
diff --git a/src/fstb/Vf32.h b/src/fstb/Vf32.h
new file mode 100644
index 0000000..1a7203d
--- /dev/null
+++ b/src/fstb/Vf32.h
@@ -0,0 +1,356 @@
+/*****************************************************************************
+
+ Vf32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vf32_HEADER_INCLUDED)
+#define fstb_Vf32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+ #include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+ #include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+ #include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <float, 4> Vf32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128 Vf32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef float32x4_t Vf32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vf32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef float Scalar;
+
+ Vf32 () = default;
+ fstb_FORCEINLINE
+ Vf32 (Vf32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vf32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (double a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (int a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (const std::tuple <float, float, float, float> &a) noexcept;
+ Vf32 (const Vf32 &other) = default;
+ Vf32 (Vf32 &&other) = default;
+ ~Vf32 () = default;
+ Vf32 & operator = (const Vf32 &other) = default;
+ Vf32 & operator = (Vf32 &&other) = default;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store_part (MEM *ptr, int n) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_pair (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_scalar (MEM *ptr) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vf32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ operator += (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator -= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator *= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator /= (const Vf32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ operator &= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator |= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator ^= (const Vf32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ mac (Vf32 a, Vf32 b) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ msu (Vf32 a, Vf32 b) noexcept;
+
+ fstb_FORCEINLINE Vf32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vf32
+ reverse () const noexcept;
+ fstb_FORCEINLINE Vf32
+ swap_pairs () const noexcept;
+ fstb_FORCEINLINE Vf32
+ monofy_pairs_lo () const noexcept;
+ fstb_FORCEINLINE Vf32
+ monofy_pairs_hi () const noexcept;
+
+ fstb_FORCEINLINE Vf32
+ butterfly_w64 () const noexcept;
+ fstb_FORCEINLINE Vf32
+ butterfly_w32 () const noexcept;
+
+ template <int SHIFT>
+ fstb_FORCEINLINE Vf32
+ rotate () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE float
+ extract () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vf32
+ insert (float val) const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vf32
+ spread () const noexcept;
+
+ fstb_FORCEINLINE Vf32
+ round () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rcp_approx () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rcp_approx2 () const noexcept;
+ fstb_FORCEINLINE Vf32
+ div_approx (const Vf32 &d) const noexcept;
+ fstb_FORCEINLINE Vf32
+ sqrt_approx () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rsqrt () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rsqrt_approx () const noexcept;
+ template <typename P>
+ fstb_FORCEINLINE Vf32
+ log2_base (P poly) const noexcept;
+ template <typename P>
+ fstb_FORCEINLINE Vf32
+ exp2_base (P poly) const noexcept;
+ fstb_FORCEINLINE Vf32
+ signbit () const noexcept;
+ fstb_FORCEINLINE Vf32
+ is_lt_0 () const noexcept;
+
+ fstb_FORCEINLINE std::tuple <float, float, float, float>
+ explode () const noexcept;
+ fstb_FORCEINLINE std::tuple <float, float>
+ extract_pair () const noexcept;
+ fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ spread_pairs () const noexcept;
+
+ fstb_FORCEINLINE float
+ sum_h () const noexcept;
+ fstb_FORCEINLINE float
+ min_h () const noexcept;
+ fstb_FORCEINLINE float
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+
+ static fstb_FORCEINLINE Vf32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vf32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair (float a0, float a1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair_fill (float a02, float a13) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair_dbl (float a01, float a23) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+ static fstb_FORCEINLINE Vf32Native
+ signbit_mask () noexcept;
+ static fstb_FORCEINLINE Vf32
+ interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ interleave (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ deinterleave (Vf32 i0, Vf32 i1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ deinterleave_lo (Vf32 i0, Vf32 i1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ deinterleave_hi (Vf32 i0, Vf32 i1) noexcept;
+ template
+ static fstb_FORCEINLINE Vf32
+ compose (Vf32 a, Vf32 b) noexcept;
+
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ load (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu_part (const MEM *ptr, int n) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu_pair (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+ static constexpr int32_t _sign32 = INT32_MIN;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part_n13 (MEM *ptr, int n) const noexcept;
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+ union Combo
+ {
+ Vf32Native _vf32;
+ int32_t _s32 [_length];
+ uint32_t _u32 [_length];
+ };
+ static_assert (
+ sizeof (Combo) == sizeof (Vf32Native),
+ "Wrong size for the wrapping combo structure"
+ );
+#endif
+ Vf32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vf32
+
+static_assert (
+ sizeof (Vf32) == sizeof (Vf32Native),
+ "Wrong size for the wrapping structure"
+);
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vf32 abs (const Vf32 &v) noexcept;
+fstb_FORCEINLINE Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 round (const Vf32 &v) noexcept;
+fstb_FORCEINLINE Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept;
+fstb_FORCEINLINE Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vf32, Vf32> swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept;
+fstb_FORCEINLINE Vf32 sqrt (Vf32 v) noexcept;
+fstb_FORCEINLINE Vf32 log2 (Vf32 v) noexcept;
+fstb_FORCEINLINE Vf32 exp2 (Vf32 v) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vf32.hpp"
+
+
+
+#endif // fstb_Vf32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vf32.hpp b/src/fstb/Vf32.hpp
new file mode 100644
index 0000000..385cb0b
--- /dev/null
+++ b/src/fstb/Vf32.hpp
@@ -0,0 +1,2181 @@
+/*****************************************************************************
+
+ Vf32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vf32_CODEHEADER_INCLUDED)
+#define fstb_Vf32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/fnc.h"
+
+#include
+
+#include
+#include
+#include
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (a) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (double a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (Scalar (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (Scalar (a)) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (int a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (Scalar (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (Scalar (a)) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a0 | a1 | a2 | a3
+Vf32::Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_ps (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a0 | a1 | a2 | a3
+Vf32::Vf32 (const std::tuple <float, float, float, float> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_ps (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vf32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ps (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_f32 (reinterpret_cast (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::store_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ store (ptr);
+ }
+ else
+ {
+ storeu_part_n13 (ptr, n);
+ }
+}
+
+
+
+template <typename MEM>
+void Vf32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_ps (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u8 (reinterpret_cast (ptr), vreinterpretq_u8_f32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ storeu (ptr);
+ return;
+ }
+
+ storeu_part_n13 (ptr, n);
+}
+
+
+
+// ptr [0] = v0
+// ptr [1] = v1
+template <typename MEM>
+void Vf32::storeu_pair (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ auto p = reinterpret_cast (ptr);
+ p [0] = _x [0];
+ p [1] = _x [1];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ss (reinterpret_cast (ptr) , _x );
+ const auto v1 = _mm_shuffle_ps (_x, _x, 1 << 0);
+ _mm_store_ss (reinterpret_cast (ptr) + 1, v1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1_u8 (
+ reinterpret_cast (ptr),
+ vreinterpret_u8_f32 (vget_low_f32 (_x))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+// *ptr = v0
+template <typename MEM>
+void Vf32::storeu_scalar (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ reinterpret_cast (ptr) [0] = _x [0];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ss (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_lane_f32 (reinterpret_cast (ptr), _x, 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vf32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vf32 & Vf32::operator += (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator -= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator *= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_mul_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator /= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] /= other [0];
+ _x [1] /= other [1];
+ _x [2] /= other [2];
+ _x [3] /= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_div_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = _x * (Vf32 { other }.rcp_approx2 ())._x;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator &= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] &= ar._s32 [0];
+ al._s32 [1] &= ar._s32 [1];
+ al._s32 [2] &= ar._s32 [2];
+ al._s32 [3] &= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator |= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] |= ar._s32 [0];
+ al._s32 [1] |= ar._s32 [1];
+ al._s32 [2] |= ar._s32 [2];
+ al._s32 [3] |= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (vorrq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator ^= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] ^= ar._s32 [0];
+ al._s32 [1] ^= ar._s32 [1];
+ al._s32 [2] ^= ar._s32 [2];
+ al._s32 [3] ^= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (veorq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// *this += a * b
+Vf32 & Vf32::mac (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += a._x [0] * b._x [0];
+ _x [1] += a._x [1] * b._x [1];
+ _x [2] += a._x [2] * b._x [2];
+ _x [3] += a._x [3] * b._x [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_ps (_x, _mm_mul_ps (a, b));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ _x = vfmaq_f32 (_x, a, b);
+ #else
+ _x = vmlaq_f32 (_x, a, b);
+ #endif
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// *this -= a * b
+Vf32 & Vf32::msu (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= a._x [0] * b._x [0];
+ _x [1] -= a._x [1] * b._x [1];
+ _x [2] -= a._x [2] * b._x [2];
+ _x [3] -= a._x [3] * b._x [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_ps (_x, _mm_mul_ps (a, b));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ _x = vfmsq_f32 (_x, a, b);
+ #else
+ _x = vmlsq_f32 (_x, a, b);
+ #endif
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 Vf32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_ps (_x, signbit_mask ());
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vnegq_f32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_f32 (vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::swap_pairs () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [2], _x [3], _x [0], _x [1] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vget_low_f32 (_x);
+ const float32x2_t v23 = vget_high_f32 (_x);
+ return vcombine_f32 (v23, v01);
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> a, a, c, c
+Vf32 Vf32::monofy_pairs_lo () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [0], _x [0], _x [2], _x [2] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0xA0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (_x, _x).val [0];
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> b, b, d, d
+Vf32 Vf32::monofy_pairs_hi () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [1], _x [1], _x [3], _x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0xF5);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (_x, _x).val [1];
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> a+c, b+d, a-c, b-d
+Vf32 Vf32::butterfly_w64 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] + _x [2],
+ _x [1] + _x [3],
+ _x [0] - _x [2],
+ _x [1] - _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, 0, _sign32, _sign32));
+ const auto x0 = _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6)); // c, d, a, b
+ const auto x1 = _mm_xor_ps (_x, sign); // a, b, -c, -d
+ return x0 + x1;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto sign = int32x4_t { 0, 0, _sign32, _sign32 };
+ const auto x0 = vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); // c, d, a, b
+ const auto x1 = // a, b, -c, -d
+ vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign));
+ return x0 + x1;
+#endif
+}
+
+
+
+// a, b, c, d -> a+b, a-b, c+d, c-d
+Vf32 Vf32::butterfly_w32 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] + _x [1],
+ _x [0] - _x [1],
+ _x [2] + _x [3],
+ _x [2] - _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, _sign32, 0, _sign32));
+ const auto x0 = _mm_shuffle_ps (_x, _x, (1<<0) + (0<<2) + (3<<4) + (2<<6)); // b, a, d, c
+ const auto x1 = _mm_xor_ps (_x, sign); // a, -b, c, -d
+ return x0 + x1;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto sign = int32x4_t { 0, _sign32, 0, _sign32 };
+ const auto x0 = vrev64q_f32 (_x); // b, a, d, c
+ const auto x1 = // a, -b, c, -d
+ vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign));
+ return x0 + x1;
+#endif
+}
+
+
+
+// Positive = to the left, rotates towards the higher indexes
+template <int SHIFT>
+Vf32 Vf32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_ps (_x, _x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_ps (_x, _x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_ps (_x, _x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x4_t aa = vreinterpretq_s32_f32 (_x);
+ switch (SHIFT & 3)
+ {
+ case 1: aa = vextq_s32 (aa, aa, 3); break;
+ case 2: aa = vextq_s32 (aa, aa, 2); break;
+ case 3: aa = vextq_s32 (aa, aa, 1); break;
+ default: /* Nothing */ break;
+ }
+ return vreinterpretq_f32_s32 (aa);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+float Vf32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_ps (a, a, 1); break;
+ case 2: a = _mm_shuffle_ps (a, a, 2); break;
+ case 3: a = _mm_shuffle_ps (a, a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return _mm_cvtss_f32 (a);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_f32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vf32 Vf32::insert (float val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_move_ss (a._x, _mm_set_ss (val));
+ return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_f32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vf32 Vf32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_f32 (vgetq_lane_f32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+// Assumes "to nearest" rounding mode on x86
+Vf32 Vf32::round () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ roundf (_x [0]),
+ roundf (_x [1]),
+ roundf (_x [2]),
+ roundf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cvtepi32_ps (_mm_cvtps_epi32 (_x));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto zero = vdupq_n_f32 ( 0.0f);
+ const auto m = vdupq_n_f32 (-0.5f);
+ const auto p = vdupq_n_f32 (+0.5f);
+ const auto gt0 = vcgtq_f32 (_x, zero);
+ const auto u = vbslq_f32 (gt0, p, m);
+ return vcvtq_f32_s32 (vcvtq_s32_f32 (vaddq_f32 (_x, u)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rcp_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ 1.f / _x [0],
+ 1.f / _x [1],
+ 1.f / _x [2],
+ 1.f / _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_rcp_ps (_x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto r = vrecpeq_f32 (_x);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ return r;
+#endif // fstb_ARCHI
+}
+
+
+
+// With more accuracy
+Vf32 Vf32::rcp_approx2 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return rcp_approx ();
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto r = _mm_rcp_ps (_x);
+ r = r * (_mm_set1_ps (2.f) - r * _x);
+ return r;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto r = vrecpeq_f32 (_x);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ return r;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::div_approx (const Vf32 &d) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] / d._x [0],
+ _x [1] / d._x [1],
+ _x [2] / d._x [2],
+ _x [3] / d._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_div_ps (_x, d._x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return _x * d.rcp_approx ()._x;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::sqrt_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ sqrtf (_x [0]),
+ sqrtf (_x [1]),
+ sqrtf (_x [2]),
+ sqrtf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Zero and denormal values will produce INF with _mm_rsqrt_ps(), so
+ // we need a mask.
+ const __m128 z_flag = _mm_cmplt_ps (_x, _mm_set1_ps (FLT_MIN));
+ const __m128 rsqrt_a = _mm_rsqrt_ps (_x);
+ const __m128 sqrt_a = _mm_mul_ps (_x, rsqrt_a);
+ const __m128 sqrt_m = _mm_andnot_ps (z_flag, sqrt_a);
+ return sqrt_m;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x4_t nz_flag = vtstq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (_x)
+ );
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ const auto sqrt_a = rs * float32x4_t (_x);
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (sqrt_a),
+ nz_flag
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rsqrt () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ 1.f / sqrtf (_x [0]),
+ 1.f / sqrtf (_x [1]),
+ 1.f / sqrtf (_x [2]),
+ 1.f / sqrtf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ __m128 rs = _mm_rsqrt_ps (_x);
+ rs = _mm_set1_ps (0.5f) * rs * (_mm_set1_ps (3) - __m128 (_x) * rs * rs);
+ return rs;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ return rs;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rsqrt_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // Ref:
+ // Jan Kadlec, http://rrrola.wz.cz/inv_sqrt.html, 2010
+ const auto xh = (*this) * Vf32 (0.703952253f);
+ Combo c { _x };
+ c._s32 [0] = 0x5F1FFFF9 - (c._s32 [0] >> 1);
+ c._s32 [1] = 0x5F1FFFF9 - (c._s32 [1] >> 1);
+ c._s32 [2] = 0x5F1FFFF9 - (c._s32 [2] >> 1);
+ c._s32 [3] = 0x5F1FFFF9 - (c._s32 [3] >> 1);
+ auto rs = Vf32 { c._vf32 };
+ rs *= Vf32 (1.681914091f) - xh * rs * rs;
+ return rs;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_rsqrt_ps (_x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ return rs;
+#endif // fstb_ARCHI
+}
+
+
+
+// poly is a user-provided Vf32 log2 approximation from [1 ; 2[ to [0 ; 1[
+template <typename P>
+Vf32 Vf32::log2_base (P poly) const noexcept
+{
+ const int32_t log2_sub = 127;
+
+#if ! defined (fstb_HAS_SIMD)
+
+ assert (
+ _x [0] > 0
+ && _x [1] > 0
+ && _x [2] > 0
+ && _x [3] > 0
+ );
+ Combo c { _x };
+ const int x0 = c._s32 [0];
+ const int x1 = c._s32 [1];
+ const int x2 = c._s32 [2];
+ const int x3 = c._s32 [3];
+ const Vf32 log2_int {
+ float (((x0 >> 23) & 255) - log2_sub),
+ float (((x1 >> 23) & 255) - log2_sub),
+ float (((x2 >> 23) & 255) - log2_sub),
+ float (((x3 >> 23) & 255) - log2_sub)
+ };
+ c._s32 [0] = (x0 & ~(255 << 23)) + (127 << 23);
+ c._s32 [1] = (x1 & ~(255 << 23)) + (127 << 23);
+ c._s32 [2] = (x2 & ~(255 << 23)) + (127 << 23);
+ c._s32 [3] = (x3 & ~(255 << 23)) + (127 << 23);
+ Vf32 part { c._vf32 };
+
+#else // fstb_HAS_SIMD
+
+#if fstb_ARCHI == fstb_ARCHI_X86
+
+ // Extracts the exponent
+ __m128i xi = _mm_castps_si128 (_x);
+ xi = _mm_srli_epi32 (xi, 23);
+ const __m128i l2_sub = _mm_set1_epi32 (log2_sub);
+ xi = _mm_sub_epi32 (xi, l2_sub);
+ const auto log2_int = Vf32 { _mm_cvtepi32_ps (xi) };
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ int32x4_t xi = vreinterpretq_s32_f32 (_x);
+ xi = vshrq_n_s32 (xi, 23);
+ const int32x4_t l2_sub = vdupq_n_s32 (log2_sub);
+ xi -= l2_sub;
+ const auto log2_int = Vf32 { vcvtq_f32_s32 (xi) };
+
+#endif // fstb_ARCHI
+
+ // Extracts the multiplicative part in [1 ; 2[
+ const auto mask_mantissa = Vf32 (1.17549421e-38f); // Binary: (1 << 23) - 1
+ auto part = _x & mask_mantissa;
+ const auto bias = Vf32 (1.0f); // Binary: 127 << 23
+ part |= bias;
+
+#endif // fstb_HAS_SIMD
+
+ // Computes the log2 approximation [1 ; 2[ -> [0 ; 1[
+ part = poly (part);
+
+ // Sums the components
+ const auto total = log2_int + part;
+
+ return total;
+}
+
+
+
+// poly is a user-provided Vf32 exp2 approximation from [0 ; 1[ to [1 ; 2[
+template <typename P>
+Vf32 Vf32::exp2_base (P poly) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ const int32_t tx0 = floor_int (_x [0]);
+ const int32_t tx1 = floor_int (_x [1]);
+ const int32_t tx2 = floor_int (_x [2]);
+ const int32_t tx3 = floor_int (_x [3]);
+ const Vf32 frac {
+ _x [0] - static_cast <float> (tx0),
+ _x [1] - static_cast <float> (tx1),
+ _x [2] - static_cast <float> (tx2),
+ _x [3] - static_cast <float> (tx3)
+ };
+
+ Combo combo { poly (frac) };
+
+ combo._s32 [0] += tx0 << 23;
+ combo._s32 [1] += tx1 << 23;
+ combo._s32 [2] += tx2 << 23;
+ combo._s32 [3] += tx3 << 23;
+ assert (
+ combo._vf32 [0] >= 0
+ && combo._vf32 [1] >= 0
+ && combo._vf32 [2] >= 0
+ && combo._vf32 [3] >= 0
+ );
+ return combo._vf32;
+
+#else // fstb_HAS_SIMD
+
+ // Separates the integer and fractional parts
+# if fstb_ARCHI == fstb_ARCHI_X86
+ const auto round_toward_m_i = _mm_set1_ps (-0.5f);
+ auto xi = _mm_cvtps_epi32 (_mm_add_ps (_x, round_toward_m_i));
+ const auto val_floor = Vf32 { _mm_cvtepi32_ps (xi) };
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ const int round_ofs = 256;
+ int32x4_t xi = vcvtq_s32_f32 (_x + vdupq_n_f32 (float (round_ofs)));
+ xi -= vdupq_n_s32 (round_ofs);
+ const auto val_floor = Vf32 { vcvtq_f32_s32 (xi) };
+# endif // fstb_ARCHI
+
+ auto frac = *this - val_floor;
+
+ // Computes the exp2 approximation [0 ; 1] -> [1 ; 2]
+ frac = poly (frac);
+
+ // Integer part
+# if fstb_ARCHI == fstb_ARCHI_X86
+ xi = _mm_slli_epi32 (xi, 23);
+ xi = _mm_add_epi32 (xi, _mm_castps_si128 (frac));
+ return _mm_castsi128_ps (xi);
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ xi = vshlq_n_s32 (xi, 23);
+ xi = xi + vreinterpretq_s32_f32 (frac);
+ return vreinterpretq_f32_s32 (xi);
+# endif // fstb_ARCHI
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+Vf32 Vf32::signbit () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ copysignf (0.f, _x [0]),
+ copysignf (0.f, _x [1]),
+ copysignf (0.f, _x [2]),
+ copysignf (0.f, _x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_and_ps (signbit_mask (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vdupq_n_u32 (0x80000000U)
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::is_lt_0 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo r;
+ r._s32 [0] = (_x [0] < 0) ? -1 : 0;
+ r._s32 [1] = (_x [1] < 0) ? -1 : 0;
+ r._s32 [2] = (_x [2] < 0) ? -1 : 0;
+ r._s32 [3] = (_x [3] < 0) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_srai_epi32 (_mm_castps_si128 (_x), 31));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_s32 (vshrq_n_s32 (vreinterpretq_s32_f32 (_x), 31));
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <float, float, float, float> Vf32::explode () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1], _x [2], _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto tmp = _mm_movehl_ps (_x, _x);
+ return std::make_tuple (
+ _mm_cvtss_f32 (_x),
+ _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, (1<<0))),
+ _mm_cvtss_f32 (tmp),
+ _mm_cvtss_f32 (_mm_shuffle_ps (tmp, tmp, (1<<0)))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ vgetq_lane_f32 (_x, 0),
+ vgetq_lane_f32 (_x, 1),
+ vgetq_lane_f32 (_x, 2),
+ vgetq_lane_f32 (_x, 3)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <float, float> Vf32::extract_pair () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ _mm_cvtss_f32 (_x),
+ _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, 1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (vgetq_lane_f32 (_x, 0), vgetq_lane_f32 (_x, 1));
+#endif // fstb_ARCHI
+}
+
+
+
+// <0> = v0 | v1 | v0 | v1
+// <1> = v2 | v3 | v2 | v3
+std::tuple <Vf32, Vf32> Vf32::spread_pairs () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 { _x [0], _x [1], _x [0], _x [1] },
+ Vf32 { _x [2], _x [3], _x [2], _x [3] }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_shuffle_ps (_x, _x, (0<<0) + (1<<2) + (0<<4) + (1<<6)) },
+ Vf32 { _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (2<<4) + (3<<6)) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vget_low_f32 (_x);
+ const float32x2_t v23 = vget_high_f32 (_x);
+ return std::make_tuple (
+ Vf32 { vcombine_f32 (v01, v01) },
+ Vf32 { vcombine_f32 (v23, v23) }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_ps (_x, _x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_ps (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return _mm_cvtss_f32 (_mm_add_ss (v, _mm_movehl_ps (s, v)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_f32 (_x);
+ #else
+ float32x2_t v2 = vadd_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpadd_f32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v = _mm_min_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2));
+ return _mm_cvtss_f32 (_mm_min_ss (v, _mm_shuffle_ps (v, v, 1)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v2 = vmin_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpmin_f32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v = _mm_max_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2));
+ return _mm_cvtss_f32 (_mm_max_ss (v, _mm_shuffle_ps (v, v, 1)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v2 = vmax_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpmax_f32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vf32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const Combo c { _x };
+ const int32_t t = (c._s32 [0] & c._s32 [1]) & (c._s32 [2] & c._s32 [3]);
+ return (t == -1);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_ps (_x) == 15);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_f32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vf32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._vf32 = _x;
+ const int32_t t = (c._s32 [0] | c._s32 [1]) | (c._s32 [2] | c._s32 [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_ps (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_f32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits of the
+// return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vf32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._vf32 = _x;
+ return
+ (c._u32 [0] >> 31)
+ | ((c._u32 [1] >> 30) & 2)
+ | ((c._u32 [2] >> 29) & 4)
+ | ((c._u32 [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return static_cast (_mm_movemask_ps (_x));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_f32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_ps ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_f32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._s32 [0] = -1;
+ c._s32 [1] = -1;
+ c._s32 [2] = -1;
+ c._s32 [3] = -1;
+ return Vf32 { c._vf32 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_s32 (vdupq_n_s32 (-1));
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a0 | a1 | ? | ?
+Vf32 Vf32::set_pair (float a0, float a1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a0, a1, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_unpacklo_ps (_mm_set_ss (a0), _mm_set_ss (a1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_f32 (a1, vdupq_n_f32 (a0), 1);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a02 | a13 | a02 | a13
+Vf32 Vf32::set_pair_fill (float a02, float a13) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a02, a13, a02, a13 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_unpacklo_ps (_mm_set1_ps (a02), _mm_set1_ps (a13));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vset_lane_f32 (a13, vdup_n_f32 (a02), 1);
+ return vcombine_f32 (v01, v01);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a01 | a01 | a23 | a23
+Vf32 Vf32::set_pair_dbl (float a01, float a23) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a01, a01, a23, a23 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_mm_set_ss (a01), _mm_set_ss (a23), 0x00);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcombine_f32 (vdup_n_f32 (a01), vdup_n_f32 (a23));
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vf32 Vf32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._s32 [0] = -int32_t (m0);
+ c._s32 [1] = -int32_t (m1);
+ c._s32 [2] = -int32_t (m2);
+ c._s32 [3] = -int32_t (m3);
+ return c._vf32;
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ ));
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vreinterpretq_f32_s32 (vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ )));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ ));
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_s32 (1);
+ return vreinterpretq_f32_s32 (vsubq_s32 (
+ vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)),
+ one
+ ));
+# endif // fstb_ARCHI
+#endif // Versions
+}
+
+
+
+Vf32Native Vf32::signbit_mask () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._u32 [0] = 0x80000000U;
+ c._u32 [1] = 0x80000000U;
+ c._u32 [2] = 0x80000000U;
+ c._u32 [3] = 0x80000000U;
+ return c._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+// return _mm_set1_ps (-0.f);
+ return _mm_castsi128_ps (_mm_set1_epi32 (0x80000000));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vdupq_n_u32 (0x80000000U));
+#endif // fstb_ARCHI
+}
+
+
+
+// returns { p0 [0 1], p1 [0 1] }
+Vf32 Vf32::interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { p0._x [0], p0._x [1], p1._x [0], p1._x [1] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (p0._x, p1._x, (0<<0) + (1<<2) + (0<<4) + (1<<6));
+ // return _mm_movelh_ps (p0, p1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t p0x = vget_low_f32 (p0._x);
+ const float32x2_t p1x = vget_low_f32 (p1._x);
+ return vcombine_f32 (p0x, p1x);
+#endif // fstb_ARCHI
+}
+
+
+
+// returns { p0 [2 3], p1 [2 3] }
+Vf32 Vf32::interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { p0._x [2], p0._x [3], p1._x [2], p1._x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (p0._x, p1._x, (2<<0) + (3<<2) + (2<<4) + (3<<6));
+ // return _mm_movehl_ps (p1, p0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t p0x = vget_high_f32 (p0._x);
+ const float32x2_t p1x = vget_high_f32 (p1._x);
+ return vcombine_f32 (p0x, p1x);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> Vf32::interleave (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 {
+ p0._x [0],
+ p1._x [0],
+ p0._x [1],
+ p1._x [1]
+ }, Vf32 {
+ p0._x [2],
+ p1._x [2],
+ p0._x [3],
+ p1._x [3]
+ }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_unpacklo_ps (p0._x, p1._x) },
+ Vf32 { _mm_unpackhi_ps (p0._x, p1._x) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x4x2_t tmp = vzipq_f32 (p0._x, p1._x);
+ return std::make_tuple (
+ Vf32 { tmp.val [0] },
+ Vf32 { tmp.val [1] }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> Vf32::deinterleave (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 {
+ i0._x [0],
+ i0._x [2],
+ i1._x [0],
+ i1._x [2]
+ }, Vf32 {
+ i0._x [1],
+ i0._x [3],
+ i1._x [1],
+ i1._x [3]
+ }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6)) },
+ Vf32 { _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6)) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x4x2_t tmp = vuzpq_f32 (i0._x, i1._x);
+ return std::make_tuple (
+ Vf32 { tmp.val [0] },
+ Vf32 { tmp.val [1] }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::deinterleave_lo (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { i0._x [0], i0._x [2], i1._x [0], i1._x [2] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (i0._x, i1._x).val [0];
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::deinterleave_hi (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { i0._x [1], i0._x [3], i1._x [1], i1._x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (i0._x, i1._x).val [1];
+#endif // fstb_ARCHI
+}
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vf32 Vf32::compose (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vf32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vf32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vf32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_move_ss (a._x, b._x);
+ return _mm_shuffle_ps (tmp, tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_shuffle_ps (a._x, b._x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3:
+ return _mm_move_ss (
+ _mm_shuffle_ps (b._x, b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)),
+ _mm_shuffle_ps (a._x, a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ );
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ const auto aa = vreinterpretq_s32_f32 (a._x);
+ const auto bb = vreinterpretq_s32_f32 (b._x);
+ return vreinterpretq_f32_s32 (vextq_s32 (aa, bb, POS));
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vf32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_load_ps (reinterpret_cast <const float *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	return vld1q_f32 (reinterpret_cast <const float32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vf32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_loadu_ps (reinterpret_cast <const float *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u8 (
+		vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::loadu_part (const MEM *ptr, int n) noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ return loadu (ptr);
+ }
+
+	const float * f_ptr = reinterpret_cast <const float *> (ptr);
+#if ! defined (fstb_HAS_SIMD)
+ Vf32 v;
+ v._x [0] = f_ptr [0];
+ for (int i = 1; i < n; ++i)
+ {
+ v._x [i] = f_ptr [i];
+ }
+ return v;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (n)
+ {
+ case 1:
+ return _mm_load_ss (f_ptr);
+ case 2:
+# if 1
+ return _mm_castsi128_ps (_mm_loadl_epi64 (
+			reinterpret_cast <const __m128i *> (ptr)
+ ));
+# else // Higher latency from Skylake
+ return _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1));
+# endif
+ case 3:
+ return _mm_shuffle_ps (
+# if 1
+ _mm_castsi128_ps (_mm_loadl_epi64 (
+				reinterpret_cast <const __m128i *> (ptr)
+ )),
+# else // Higher latency from Skylake
+ _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1)),
+# endif
+ _mm_load_ss (f_ptr + 2),
+ (0<<0) + (1<<2) + (2<<4)
+ );
+ default:
+ // Keeps the compiler happy with (un)initialisation
+ return loadu (ptr);
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto v = vmovq_n_f32 (f_ptr [0]);
+ if (n >= 2)
+ {
+ v = vld1q_lane_f32 (f_ptr + 1, v, 1);
+ if (n >= 3)
+ {
+ v = vld1q_lane_f32 (f_ptr + 2, v, 2);
+ }
+ }
+ return v;
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns: ptr [0] | ptr [1] | ? | ?
+template <typename MEM>
+Vf32 Vf32::loadu_pair (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	auto p = reinterpret_cast <const float *> (ptr);
+ return Vf32 { p [0], p [1], 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return _mm_castsi128_ps (_mm_loadl_epi64 (
+		reinterpret_cast <const __m128i *> (ptr)
+ ));
+# else // Higher latency from Skylake
+	const auto x0 = _mm_load_ss (reinterpret_cast <const float *> (ptr)    );
+	const auto x1 = _mm_load_ss (reinterpret_cast <const float *> (ptr) + 1);
+ return _mm_unpacklo_ps (x0, x1);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t x = vreinterpret_f32_u8 (
+		vld1_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+ return vcombine_f32 (x, x);
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::storeu_part_n13 (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+ assert (n < 4);
+
+	float * f_ptr = reinterpret_cast <float *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ _mm_store_ss (f_ptr, _x);
+ if (n >= 2)
+ {
+ _mm_store_ss (f_ptr + 1, _mm_shuffle_ps (_x, _x, 1 << 0));
+ if (n >= 3)
+ {
+ _mm_store_ss (f_ptr + 2, _mm_movehl_ps (_x, _x));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_f32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_f32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_f32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs /= rhs;
+ return lhs;
+}
+
+Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] == rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] == rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] == rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] == rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vceqq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] != rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] != rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] != rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] != rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpneq_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vmvnq_u32 (vceqq_f32 (lhs, rhs)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] < rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] < rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] < rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] < rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcltq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] <= rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] <= rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] <= rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] <= rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmple_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcleq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] > rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] > rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] > rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] > rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpgt_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcgtq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] >= rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] >= rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] >= rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] >= rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpge_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcgeq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 abs (const Vf32 &v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ fabsf (v._x [0]),
+ fabsf (v._x [1]),
+ fabsf (v._x [2]),
+ fabsf (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_andnot_ps (Vf32::signbit_mask (), v);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vabsq_f32 (v);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns x * a + b
+Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ x._x [0] * a._x [0] + b._x [0],
+ x._x [1] * a._x [1] + b._x [1],
+ x._x [2] * a._x [2] + b._x [2],
+ x._x [3] * a._x [3] + b._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_add_ps (_mm_mul_ps (x, a), b);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ return vfmaq_f32 (b, x, a);
+ #else
+ return vmlaq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns x * a - b
+Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ x._x [0] * a._x [0] - b._x [0],
+ x._x [1] * a._x [1] - b._x [1],
+ x._x [2] * a._x [2] - b._x [2],
+ x._x [3] * a._x [3] - b._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_ps (_mm_mul_ps (x, a), b);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
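+	// vfmsq/vmlsq compute b - x * a, hence the negation to obtain x * a - b.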
+ #if defined (__ARM_FEATURE_FMA)
+ return -vfmsq_f32 (b, x, a);
+ #else
+ return -vmlsq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns - x * a + b
+Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ b._x [0] - x._x [0] * a._x [0],
+ b._x [1] - x._x [1] * a._x [1],
+ b._x [2] - x._x [2] * a._x [2],
+ b._x [3] - x._x [3] * a._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_ps (b, _mm_mul_ps (x, a));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ return vfmsq_f32 (b, x, a);
+ #else
+ return vmlsq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 round (const Vf32 &v) noexcept
+{
+ return v.round ();
+}
+
+
+
+Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_min_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_f32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_max_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_f32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ const Vf32::Combo cc { cond };
+ Vf32::Combo ct { v_t };
+ Vf32::Combo cf { v_f };
+ Vf32::Combo r;
+ r._s32 [0] = (ct._s32 [0] & cc._s32 [0]) | (cf._s32 [0] & ~cc._s32 [0]);
+ r._s32 [1] = (ct._s32 [1] & cc._s32 [1]) | (cf._s32 [1] & ~cc._s32 [1]);
+ r._s32 [2] = (ct._s32 [2] & cc._s32 [2]) | (cf._s32 [2] & ~cc._s32 [2]);
+ r._s32 [3] = (ct._s32 [3] & cc._s32 [3]) | (cf._s32 [3] & ~cc._s32 [3]);
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
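+	// Bitwise blend: takes the v_t bits where cond is set, the v_f bits elsewhere.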
+ const auto cond_1 = _mm_and_ps ( cond, v_t);
+ const auto cond_0 = _mm_andnot_ps (cond, v_f);
+ return _mm_or_ps (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_f32 (vreinterpretq_u32_f32 (cond), v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const Vf32::Combo cc { cond };
+ if (cc._s32 [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cc._s32 [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cc._s32 [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cc._s32 [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_ps (_mm_xor_ps (lhs, rhs), cond);
+ return std::make_tuple (
+ _mm_xor_ps (lhs, inv),
+ _mm_xor_ps (rhs, inv)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto cu32 = vreinterpretq_u32_f32 (cond);
+ return std::make_tuple (
+ vbslq_f32 (cu32, rhs, lhs),
+ vbslq_f32 (cu32, lhs, rhs)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 sqrt (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ sqrtf (v._x [0]),
+ sqrtf (v._x [1]),
+ sqrtf (v._x [2]),
+ sqrtf (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sqrt_ps (v);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
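+	// No packed sqrt on 32-bit NEON: the reciprocal square root estimate is
+	// refined with Newton-Raphson steps and then multiplied by v. nz_flag
+	// zeroes the result for v == 0, where the estimate would be infinite.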
+ const uint32x4_t nz_flag = vtstq_u32 (
+ vreinterpretq_u32_f32 (v),
+ vreinterpretq_u32_f32 (v)
+ );
+ float32x4_t rs = vrsqrteq_f32 (v);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ const auto sqrt_a = rs * float32x4_t (v);
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (sqrt_a),
+ nz_flag
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+// Formula by 2DaT
+// 12-13 ulp
+// https://www.kvraudio.com/forum/viewtopic.php?f=33&t=532048
+Vf32 log2 (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ assert (v > Vf32 (0));
+ /*** To do: actual approximation matching the SIMD formula ***/
+ return Vf32 {
+ logf (v._x [0]) * float (LOG2_E),
+ logf (v._x [1]) * float (LOG2_E),
+ logf (v._x [2]) * float (LOG2_E),
+ logf (v._x [3]) * float (LOG2_E),
+ };
+
+#else // fstb_HAS_SIMD
+
+ // Rational fraction approximating log2 (x)
+ // [sqrt (0.5) ; sqrt (2)] -> [-0.5 ; 0.5]
+ // f: x -> (x - 1) * (x^2 + c1*x + c0) / (d2*x^2 + d1*x + d0)
+ // No analytic continuity on the full range, although this is "almost" C0
+ // (good enough for single precision).
+ const auto c0 = Vf32 (1.011593342e+01f);
+ const auto c1 = Vf32 (1.929443550e+01f);
+ const auto d0 = Vf32 (2.095932245e+00f);
+ const auto d1 = Vf32 (1.266638851e+01f);
+ const auto d2 = Vf32 (6.316540241e+00f);
+ const auto one = Vf32 (1.0f);
+ const auto multi = Vf32 (1.41421356237f);
+ const auto mmask = ~((1 << 23) - 1);
+
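+	// Splits v into exponent and mantissa. Multiplying by sqrt (2) before
+	// extracting the exponent centres the mantissa on 1, so it lands in
+	// [sqrt (0.5) ; sqrt (2)], the validity range of the fraction above.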
+#if fstb_ARCHI == fstb_ARCHI_X86
+
+ __m128i x_i = _mm_castps_si128 (v);
+ __m128i spl_exp = _mm_castps_si128 (v * multi);
+ spl_exp = _mm_sub_epi32 (spl_exp, _mm_castps_si128 (one));
+ spl_exp = _mm_and_si128 (spl_exp, _mm_set1_epi32 (mmask));
+ const auto spl_mantissa =
+ Vf32 { _mm_castsi128_ps (_mm_sub_epi32 (x_i, spl_exp)) };
+ spl_exp = _mm_srai_epi32 (spl_exp, 23);
+ const auto log2_exponent = Vf32 { _mm_cvtepi32_ps (spl_exp) };
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ const int32x4_t x_i = vreinterpretq_s32_f32 (v);
+ int32x4_t spl_exp = vreinterpretq_s32_f32 (v * multi);
+ spl_exp = spl_exp - vreinterpretq_s32_f32 (one);
+ spl_exp = vandq_s32 (spl_exp, vdupq_n_s32 (mmask));
+ const auto spl_mantissa = Vf32 { vreinterpretq_f32_s32 (x_i - spl_exp) };
+ spl_exp = vshrq_n_s32 (spl_exp, 23);
+ const auto log2_exponent = Vf32 { vcvtq_f32_s32 (spl_exp) };
+
+#endif // fstb_ARCHI
+
+ auto num = spl_mantissa + c1;
+ num = fma (num, spl_mantissa, c0);
+ num = fms (num, spl_mantissa, num);
+
+ auto den = d2;
+ den = fma (den, spl_mantissa, d1);
+ den = fma (den, spl_mantissa, d0);
+
+ auto res = num / den;
+ res += log2_exponent;
+
+ return res;
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+// Formula by 2DaT
+// Coefficients fixed by Andrew Simper to achieve true C0 continuity
+// 3-4 ulp
+// https://www.kvraudio.com/forum/viewtopic.php?p=7161124#p7161124
+// https://www.kvraudio.com/forum/viewtopic.php?p=7677266#p7677266
+Vf32 exp2 (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ /*** To do: actual approximation matching the SIMD formula ***/
+ return Vf32 {
+ exp2f (v._x [0]),
+ exp2f (v._x [1]),
+ exp2f (v._x [2]),
+ exp2f (v._x [3]),
+ };
+
+#else // fstb_HAS_SIMD
+
+ // [-0.5, 0.5] 2^x approx polynomial ~ 2.4 ulp
+ const auto c0 = Vf32 (1.000000088673463);
+ const auto c1 = Vf32 (0.69314693211407);
+ const auto c2 = Vf32 (0.24022037362574);
+ const auto c3 = Vf32 (0.0555072548370);
+ const auto c4 = Vf32 (0.0096798351988);
+ const auto c5 = Vf32 (0.0013285658116);
+
+ // Note: the following set of coefficients has a larger error (0.00043
+ // cents, maybe 7 ulp?) but ensures C2 continuity:
+ // c0 = 1.000000237
+ // c1 = 0.69314655
+ // c2 = 0.24021519
+ // c3 = 0.05550965
+ // c4 = 0.00969821
+ // c5 = 0.00132508
+
+ // i = round (v)
+ // v = v - i
+#if fstb_ARCHI == fstb_ARCHI_X86
+ auto i = _mm_cvtps_epi32 (v);
+ v -= _mm_cvtepi32_ps (i);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
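+	// vcvtq_s32_f32 truncates, so an offset of round_ofs + 0.5 is added first
+	// to get round-to-nearest for the expected input range, then round_ofs is
+	// subtracted from the integer result.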
+ const int round_ofs = 256;
+ const auto r = Vf32 (round_ofs + 0.5f);
+ auto i = vcvtq_s32_f32 (v + r);
+ i -= vdupq_n_s32 (round_ofs);
+ v -= vcvtq_f32_s32 (i);
+#endif // fstb_ARCHI
+
+ // Estrin-Horner evaluation scheme
+ const auto v2 = v * v;
+ const auto p23 = fma (c3, v, c2);
+ const auto p01 = fma (c1, v, c0);
+ auto p = fma (c5, v, c4);
+ p = fma (p, v2, p23);
+ p = fma (p, v2, p01);
+
+ // i << 23
+ // r = (2^i) * (2^v)
+ // directly in floating point exponent
+#if fstb_ARCHI == fstb_ARCHI_X86
+ i = _mm_slli_epi32 (i, 23);
+ return _mm_castsi128_ps (_mm_add_epi32 (i, _mm_castps_si128 (p)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ i = vshlq_n_s32 (i, 23);
+ return vreinterpretq_f32_s32 (i + vreinterpretq_s32_f32 (p));
+#endif // fstb_ARCHI
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vf32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vs32.h b/src/fstb/Vs32.h
new file mode 100644
index 0000000..234d4c0
--- /dev/null
+++ b/src/fstb/Vs32.h
@@ -0,0 +1,257 @@
+/*****************************************************************************
+
+ Vs32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vs32_HEADER_INCLUDED)
+#define fstb_Vs32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+	#include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+	#include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+	#include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <int32_t, 4> Vs32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128i Vs32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef int32x4_t Vs32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vs32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef int32_t Scalar;
+
+ Vs32 () = default;
+ fstb_FORCEINLINE
+ Vs32 (Vs32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vs32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+	               Vs32 (const std::tuple <int32_t, int32_t, int32_t, int32_t> &a) noexcept;
+ Vs32 (const Vs32 &other) = default;
+ Vs32 (Vs32 &&other) = default;
+ ~Vs32 () = default;
+ Vs32 & operator = (const Vs32 &other) = default;
+ Vs32 & operator = (Vs32 &&other) = default;
+
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vs32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator += (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator -= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator *= (const Vs32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator &= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator |= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator ^= (const Vs32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator <<= (int imm) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator >>= (int imm) noexcept;
+
+ fstb_FORCEINLINE Vs32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vs32
+ operator ~ () const noexcept;
+ fstb_FORCEINLINE Vs32
+ is_lt_0 () const noexcept;
+ fstb_FORCEINLINE Vs32
+ reverse () const noexcept;
+
+	template <int SHIFT>
+ fstb_FORCEINLINE Vs32
+ rotate () const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE int32_t
+ extract () const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE Vs32
+ insert (int32_t val) const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE Vs32
+ spread () const noexcept;
+
+	fstb_FORCEINLINE std::tuple <int32_t, int32_t, int32_t, int32_t>
+ explode () const noexcept;
+
+ fstb_FORCEINLINE int32_t
+ sum_h () const noexcept;
+ fstb_FORCEINLINE int32_t
+ min_h () const noexcept;
+ fstb_FORCEINLINE int32_t
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+ fstb_FORCEINLINE int
+ count_bits () const noexcept;
+
+ static fstb_FORCEINLINE Vs32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vs32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vs32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+	template <int POS>
+ static fstb_FORCEINLINE Vs32
+ compose (Vs32 a, Vs32 b) noexcept;
+
+	template <typename MEM>
+ static fstb_FORCEINLINE Vs32
+ load (const MEM *ptr) noexcept;
+	template <typename MEM>
+ static fstb_FORCEINLINE Vs32
+ loadu (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+#endif
+ Vs32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vs32
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept;
+
+template <typename T>
+fstb_FORCEINLINE Vs32 operator << (Vs32 lhs, T rhs) noexcept;
+template <typename T>
+fstb_FORCEINLINE Vs32 operator >> (Vs32 lhs, T rhs) noexcept;
+
+fstb_FORCEINLINE Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vs32 abs (const Vs32 &v) noexcept;
+fstb_FORCEINLINE Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept;
+fstb_FORCEINLINE Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vs32, Vs32> swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vs32.hpp"
+
+
+
+#endif // fstb_Vs32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vs32.hpp b/src/fstb/Vs32.hpp
new file mode 100644
index 0000000..e6b2510
--- /dev/null
+++ b/src/fstb/Vs32.hpp
@@ -0,0 +1,1142 @@
+/*****************************************************************************
+
+ Vs32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vs32_CODEHEADER_INCLUDED)
+#define fstb_Vs32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/fnc.h"
+
+#include <algorithm>
+
+#include <cstdlib>
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// Returns a0 | a0 | a0 | a0
+Vs32::Vs32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_epi32 (a) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_s32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vs32::Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vs32::Vs32 (const std::tuple <int32_t, int32_t, int32_t, int32_t> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vs32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	*reinterpret_cast <Vs32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	vst1q_s32 (reinterpret_cast <int32_t *> (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+void Vs32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	*reinterpret_cast <Vs32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	vst1q_u8 (reinterpret_cast <uint8_t *> (ptr), vreinterpretq_u8_s32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vs32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= _length)
+ {
+ storeu (ptr);
+ return;
+ }
+
+	int32_t * f_ptr = reinterpret_cast <int32_t *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ f_ptr [0] = _mm_cvtsi128_si32 (_x);
+ if (n >= 2)
+ {
+ f_ptr [1] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0));
+ if (n >= 3)
+ {
+			f_ptr [2] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_s32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_s32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_s32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vs32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vs32 & Vs32::operator += (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator -= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator *= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Emulation of _mm_mullo_epi32 (SSE4.1)
+# if fstb_COMPILER == fstb_COMPILER_MSVC
+ // For some reason this code is slightly faster on MSVC
+ auto p02_64 = _mm_mul_epu32 (_x, other);
+ auto p13_64 = _mm_mul_epu32 (
+ _mm_srli_si128 (_x , 4),
+ _mm_srli_si128 (other, 4)
+ );
+ p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2));
+ p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2));
+ _x = _mm_unpacklo_epi32 (p02_64, p13_64);
+# else
+ // Code of this function shamelessly borrowed from tp7
+ // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h
+ // This code is faster on GCC/Clang
+ const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1)
+ const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1)
+ const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0)
+ const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1)
+ const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0)
+ const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2)
+ _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0)
+# endif // fstb_COMPILER
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator &= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] &= other [0];
+ _x [1] &= other [1];
+ _x [2] &= other [2];
+ _x [3] &= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vandq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator |= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] |= other [0];
+ _x [1] |= other [1];
+ _x [2] |= other [2];
+ _x [3] |= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vorrq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator ^= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] ^= other [0];
+ _x [1] ^= other [1];
+ _x [2] ^= other [2];
+ _x [3] ^= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = veorq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator <<= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] <<= imm;
+ _x [1] <<= imm;
+ _x [2] <<= imm;
+ _x [3] <<= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_slli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x <<= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator >>= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] >>= imm;
+ _x [1] >>= imm;
+ _x [2] >>= imm;
+ _x [3] >>= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_srai_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x >>= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// -(1<<31) stays constant
+Vs32 Vs32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (_mm_setzero_si128 (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vnegq_s32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::operator ~ () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ ~(_x [0]),
+ ~(_x [1]),
+ ~(_x [2]),
+ ~(_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_si128 (_x, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_s32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::is_lt_0 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (_x [0] < 0) ? -1 : 0,
+ (_x [1] < 0) ? -1 : 0,
+ (_x [2] < 0) ? -1 : 0,
+ (_x [3] < 0) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_epi32 (_x, _mm_setzero_si128 ());
+#elif fstb_ARCHI == fstb_ARCHI_ARM
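+	// Arithmetic shift right by 31 replicates the sign bit over the whole lane.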
+ return vshrq_n_s32 (_x, 31);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_s32 (vcombine_s32 (vget_high_s32 (_x), vget_low_s32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+// Positive = left
+template <int SHIFT>
+Vs32 Vs32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ switch (SHIFT & 3)
+ {
+ case 1: return vextq_s32 (_x, _x, 3);
+ case 2: return vextq_s32 (_x, _x, 2);
+ case 3: return vextq_s32 (_x, _x, 1);
+ default: return *this;
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+int32_t Vs32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_epi32 (a, 1); break;
+ case 2: a = _mm_shuffle_epi32 (a, 2); break;
+ case 3: a = _mm_shuffle_epi32 (a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return _mm_cvtsi128_si32 (a);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_s32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vs32 Vs32::insert (int32_t val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (_mm_set1_epi32 (val))
+ ));
+	return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_s32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vs32 Vs32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+	return Vs32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (vgetq_lane_s32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_s32 (_x);
+ #else
+ int32x2_t v2 = vadd_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpadd_s32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::min (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x2_t v2 = vmin_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpmin_s32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::max (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x2_t v2 = vmax_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpmax_s32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vs32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const int32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]);
+ return (t == -1);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) == 0xFFFF);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_s32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vs32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const int32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_s32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits of the
+// return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vs32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return
+ (_x [0] >> 31)
+ | ((_x [1] >> 30) & 2)
+ | ((_x [2] >> 29) & 4)
+ | ((_x [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return static_cast <unsigned int> (_mm_movemask_ps (_mm_castsi128_ps (_x)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_s32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
+
+
+
+int Vs32::count_bits () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555);
+ uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555);
+ uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555);
+ uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555);
+ v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333);
+ v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
+ v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
+ v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333);
+ const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ return (c0 + c2) + (c1 + c3);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
+ static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77);
+ static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F);
+ // Count bits in each 4-bit field.
+ auto x = _x;
+ auto n = _mm_srli_epi64 (x, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ n = _mm_sub_epi8 (x, n);
+ n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4));
+ n = _mm_and_si128 (popcount_mask2, n);
+ // Counts the number of bits in the low and high 64-bit parts
+ n = _mm_sad_epu8 (n, _mm_setzero_si128 ());
+ // Counts the number of bits in the whole 128-bit register
+ n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n));
+ return _mm_cvtsi128_si32 (n);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_s32 (_x));
+ const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8);
+ const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16);
+ const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32);
+ const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64);
+ return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_si128 ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { -1, -1, -1, -1 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_set1_epi32 (-1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (-1);
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vs32 Vs32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ -int32_t (m0),
+ -int32_t (m1),
+ -int32_t (m2),
+ -int32_t (m3),
+ };
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ ));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_s32 (1);
+ return vsubq_s32 (
+ vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)),
+ one
+ );
+# endif // fstb_ARCHI
+#endif // Versions
+}
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vs32 Vs32::compose (Vs32 a, Vs32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vs32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vs32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vs32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x)
+ ));
+ return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (b._x),
+ (1<<6) | (0<<4) | (3<<2) | (2<<0)
+ ));
+ case 3:
+ return _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ ),
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ )
+ ));
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ return vextq_s32 (a._x, b._x, POS);
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vs32 Vs32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vs32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_load_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	return vld1q_s32 (reinterpret_cast <const int32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vs32 Vs32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vs32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_loadu_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u8 (
+		vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+template <typename T>
+Vs32 operator << (Vs32 lhs, T rhs) noexcept
+{
+ lhs <<= rhs;
+ return lhs;
+}
+
+template <typename T>
+Vs32 operator >> (Vs32 lhs, T rhs) noexcept
+{
+ lhs >>= rhs;
+ return lhs;
+}
+
+
+
+Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] == rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] == rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] == rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] == rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vceqq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] != rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] != rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] != rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] != rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto eq = _mm_cmpeq_epi32 (lhs, rhs);
+ return _mm_xor_si128 (eq, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vmvnq_u32 (vceqq_s32 (lhs, rhs)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] < rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] < rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] < rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] < rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcltq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] <= rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] <= rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] <= rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] <= rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs < rhs) | (lhs == rhs);
+# else
+ return ~(lhs > rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcleq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] > rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] > rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] > rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] > rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpgt_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcgtq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] >= rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] >= rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] >= rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] >= rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs > rhs) | (lhs == rhs);
+# else
+ return ~(lhs < rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcgeq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+// Result is undefined for -(1<<31).
+Vs32 abs (const Vs32 &v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::abs (v._x [0]),
+ std::abs (v._x [1]),
+ std::abs (v._x [2]),
+ std::abs (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v_neg = _mm_sub_epi32 (_mm_setzero_si128 (), v);
+ return max (v, v_neg);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vqabsq_s32 (v);
+#endif // fstb_ARCHI
+}
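+
+// Rationale for the restriction above: for -(1<<31) the scalar std::abs call
+// is undefined behaviour, the x86 path returns the value unchanged (the
+// negation wraps around) and the ARM vqabsq_s32 path saturates to 0x7FFFFFFF,
+// so the three implementations would disagree anyway.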
+
+
+
+Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto gt = (lhs > rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( gt, rhs),
+ _mm_andnot_si128 (gt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_s32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto lt = (lhs < rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( lt, rhs),
+ _mm_andnot_si128 (lt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_s32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ return Vs32 {
+ (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]),
+ (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]),
+ (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]),
+ (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto cond_1 = _mm_and_si128 (cond, v_t);
+ const auto cond_0 = _mm_andnot_si128 (cond, v_f);
+ return _mm_or_si128 (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_s32 (vreinterpretq_u32_s32 (cond), v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vs32, Vs32> swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond);
+ return std::make_tuple (
+ Vs32 (_mm_xor_si128 (lhs, inv)),
+ Vs32 (_mm_xor_si128 (rhs, inv))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto cond_u = vreinterpretq_u32_s32 (cond);
+ return std::make_tuple (
+ Vs32 (vbslq_s32 (cond_u, rhs, lhs)),
+ Vs32 (vbslq_s32 (cond_u, lhs, rhs))
+ );
+#endif // fstb_ARCHI
+}
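+
+// Note on the x86 path above: it is a masked XOR swap. inv = (lhs ^ rhs) & cond
+// is zero in the lanes where cond is all-zero, so lhs ^ inv and rhs ^ inv
+// exchange exactly the lanes where cond is all-ones and leave the others
+// untouched.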
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vs32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vu32.h b/src/fstb/Vu32.h
new file mode 100644
index 0000000..2341621
--- /dev/null
+++ b/src/fstb/Vu32.h
@@ -0,0 +1,259 @@
+/*****************************************************************************
+
+ Vu32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vu32_HEADER_INCLUDED)
+#define fstb_Vu32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+ #include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+ #include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+ #include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <uint32_t, 4> Vu32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128i Vu32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef uint32x4_t Vu32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vu32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef uint32_t Scalar;
+
+ Vu32 () = default;
+ fstb_FORCEINLINE
+ Vu32 (Vu32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vu32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+ Vu32 (const std::tuple <Scalar, Scalar, Scalar, Scalar> &a) noexcept;
+ Vu32 (const Vu32 &other) = default;
+ Vu32 (Vu32 &&other) = default;
+ ~Vu32 () = default;
+ Vu32 & operator = (const Vu32 &other) = default;
+ Vu32 & operator = (Vu32 &&other) = default;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vu32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator += (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator -= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator *= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator *= (const Scalar &other) noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator &= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator |= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator ^= (const Vu32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator <<= (int imm) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator >>= (int imm) noexcept;
+
+ fstb_FORCEINLINE Vu32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vu32
+ operator ~ () const noexcept;
+ fstb_FORCEINLINE Vu32
+ reverse () const noexcept;
+
+ template <int SHIFT>
+ fstb_FORCEINLINE Vu32
+ rotate () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE uint32_t
+ extract () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vu32
+ insert (uint32_t val) const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vu32
+ spread () const noexcept;
+
+ fstb_FORCEINLINE std::tuple <uint32_t, uint32_t, uint32_t, uint32_t>
+ explode () const noexcept;
+
+ fstb_FORCEINLINE uint32_t
+ sum_h () const noexcept;
+ fstb_FORCEINLINE uint32_t
+ min_h () const noexcept;
+ fstb_FORCEINLINE uint32_t
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+ fstb_FORCEINLINE int
+ count_bits () const noexcept;
+
+ static fstb_FORCEINLINE Vu32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vu32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vu32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+ template <int POS>
+ static fstb_FORCEINLINE Vu32
+ compose (Vu32 a, Vu32 b) noexcept;
+ static fstb_FORCEINLINE Vu32
+ flip_msb (Vu32 x) noexcept;
+
+ template <typename MEM>
+ static fstb_FORCEINLINE Vu32
+ load (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vu32
+ loadu (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+#endif
+ Vu32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vu32
+
+
+
+/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept;
+
+template <typename T>
+fstb_FORCEINLINE Vu32 operator << (Vu32 lhs, T rhs) noexcept;
+template <typename T>
+fstb_FORCEINLINE Vu32 operator >> (Vu32 lhs, T rhs) noexcept;
+
+fstb_FORCEINLINE Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept;
+fstb_FORCEINLINE Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vu32, Vu32> swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vu32.hpp"
+
+
+
+#endif // fstb_Vu32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vu32.hpp b/src/fstb/Vu32.hpp
new file mode 100644
index 0000000..a1f851d
--- /dev/null
+++ b/src/fstb/Vu32.hpp
@@ -0,0 +1,1139 @@
+/*****************************************************************************
+
+ Vu32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vu32_CODEHEADER_INCLUDED)
+#define fstb_Vu32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include <algorithm>
+
+#include <cassert>
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vu32::Vu32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_epi32 (int32_t (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_u32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vu32::Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vu32::Vu32 (const std::tuple <Scalar, Scalar, Scalar, Scalar> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vu32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast <Vu32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u32 (reinterpret_cast <uint32_t *> (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+void Vu32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast <Vu32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u8 (reinterpret_cast <uint8_t *> (ptr), vreinterpretq_u8_u32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vu32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= _length)
+ {
+ storeu (ptr);
+ return;
+ }
+
+ uint32_t * f_ptr = reinterpret_cast <uint32_t *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ f_ptr [0] = uint32_t (_mm_cvtsi128_si32 (_x));
+ if (n >= 2)
+ {
+ f_ptr [1] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0)));
+ if (n >= 3)
+ {
+ f_ptr [2] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0)));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_u32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_u32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_u32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
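+
+// Usage example for storeu_part: if buf points to at least 3 writable
+// uint32_t, Vu32 (11, 22, 33, 44).storeu_part (buf, 3) writes 11, 22 and 33
+// to buf [0], buf [1] and buf [2] and leaves the following memory untouched.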
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vu32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vu32 & Vu32::operator += (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator -= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator *= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Emulation of _mm_mullo_epi32 (SSE4.1)
+# if fstb_COMPILER == fstb_COMPILER_MSVC
+ // For some reason this code is slightly faster on MSVC
+ auto p02_64 = _mm_mul_epu32 (_x, other);
+ auto p13_64 = _mm_mul_epu32 (
+ _mm_srli_si128 (_x , 4),
+ _mm_srli_si128 (other, 4)
+ );
+ p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2));
+ p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2));
+ _x = _mm_unpacklo_epi32 (p02_64, p13_64);
+# else
+ // Code of this function shamelessly borrowed from tp7
+ // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h
+ // This code is faster on GCC/Clang
+ const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1)
+ const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1)
+ const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0)
+ const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1)
+ const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0)
+ const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2)
+ _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0)
+# endif // fstb_COMPILER
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator *= (const Scalar &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other;
+ _x [1] *= other;
+ _x [2] *= other;
+ _x [3] *= other;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto vb = _mm_set1_epi32 (int32_t (other));
+ auto v0 = _mm_shuffle_epi32 (_x, (0<<0) | (1<<4));
+ auto v1 = _mm_shuffle_epi32 (_x, (2<<0) | (3<<4));
+ v0 = _mm_mul_epu32 (v0, vb);
+ v1 = _mm_mul_epu32 (v1, vb);
+ _x = _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (v0),
+ _mm_castsi128_ps (v1),
+ (0<<0) | (2<<2) | (0<<4) | (2<<6)
+ ));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_u32 (_x, vdupq_n_u32 (other));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator &= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] &= other [0];
+ _x [1] &= other [1];
+ _x [2] &= other [2];
+ _x [3] &= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vandq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator |= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] |= other [0];
+ _x [1] |= other [1];
+ _x [2] |= other [2];
+ _x [3] |= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vorrq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator ^= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] ^= other [0];
+ _x [1] ^= other [1];
+ _x [2] ^= other [2];
+ _x [3] ^= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = veorq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator <<= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] <<= imm;
+ _x [1] <<= imm;
+ _x [2] <<= imm;
+ _x [3] <<= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_slli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x <<= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator >>= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] >>= imm;
+ _x [1] >>= imm;
+ _x [2] >>= imm;
+ _x [3] >>= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_srli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x >>= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 Vu32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (_mm_setzero_si128 (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_u32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::operator ~ () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ ~(_x [0]),
+ ~(_x [1]),
+ ~(_x [2]),
+ ~(_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_si128 (_x, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_u32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_u32 (vcombine_u32 (vget_high_u32 (_x), vget_low_u32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+// Positive = left
+template <int SHIFT>
+Vu32 Vu32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ switch (SHIFT & 3)
+ {
+ case 1: return vextq_u32 (_x, _x, 3);
+ case 2: return vextq_u32 (_x, _x, 2);
+ case 3: return vextq_u32 (_x, _x, 1);
+ default: return *this;
+ }
+#endif // fstb_ARCHI
+}
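+
+// Example for rotate: Vu32 (a0, a1, a2, a3).rotate <1> () gives
+// Vu32 (a3, a0, a1, a2): each element moves up one lane and the top lane
+// wraps around to lane 0.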
+
+
+
+template <int POS>
+uint32_t Vu32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_epi32 (a, 1); break;
+ case 2: a = _mm_shuffle_epi32 (a, 2); break;
+ case 3: a = _mm_shuffle_epi32 (a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return Scalar (_mm_cvtsi128_si32 (a));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_u32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vu32 Vu32::insert (uint32_t val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (_mm_set1_epi32 (int32_t (val)))
+ ));
+ return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_u32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vu32 Vu32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (vgetq_lane_u32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return uint32_t (
+ _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0)))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_u32 (_x);
+ #else
+ uint32x2_t v2 = vadd_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpadd_u32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::min (
+ uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint32x2_t v2 = vmin_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpmin_u32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::max (
+ uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint32x2_t v2 = vmax_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpmax_u32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
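+
+// Example for the horizontal reductions: Vu32 (1, 2, 3, 4) gives
+// sum_h () == 10, min_h () == 1 and max_h () == 4.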
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vu32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const uint32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]);
+ return (t == uint32_t (-1));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) == 0xFFFF);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x));
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vu32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const uint32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x));
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits
+// of the return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vu32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return
+ (_x [0] >> 31)
+ | ((_x [1] >> 30) & 2)
+ | ((_x [2] >> 29) & 4)
+ | ((_x [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return static_cast <unsigned int> (_mm_movemask_ps (_mm_castsi128_ps (_x)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_u32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
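+
+// Example for movemask: Vu32::set_mask (true, false, true, false).movemask ()
+// returns 0b0101; bit i of the result holds the condition of lane i.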
+
+
+
+int Vu32::count_bits () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555);
+ uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555);
+ uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555);
+ uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555);
+ v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333);
+ v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
+ v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
+ v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333);
+ const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ return (c0 + c2) + (c1 + c3);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
+ static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77);
+ static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F);
+ // Count bits in each 4-bit field.
+ auto x = _x;
+ auto n = _mm_srli_epi64 (x, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ n = _mm_sub_epi8 (x, n);
+ n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4));
+ n = _mm_and_si128 (popcount_mask2, n);
+ // Counts the number of bits in the low and high 64-bit parts
+ n = _mm_sad_epu8 (n, _mm_setzero_si128 ());
+ // Counts the number of bits in the whole 128-bit register
+ n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n));
+ return _mm_cvtsi128_si32 (n);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_u32 (_x));
+ const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8);
+ const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16);
+ const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32);
+ const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64);
+ return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2);
+#endif // fstb_ARCHI
+}
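+
+// Example for count_bits: Vu32 (0xF, 0x3, 0x1, 0x0).count_bits () == 7
+// (4 + 2 + 1 + 0 bits set across the four lanes).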
+
+
+
+std::tuple <uint32_t, uint32_t, uint32_t, uint32_t> Vu32::explode () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1], _x [2], _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ uint32_t (_mm_cvtsi128_si32 (_x )),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (1<<0)))),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (2<<0)))),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (3<<0))))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ vgetq_lane_u32 (_x, 0),
+ vgetq_lane_u32 (_x, 1),
+ vgetq_lane_u32 (_x, 2),
+ vgetq_lane_u32 (_x, 3)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_si128 ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { ~Scalar (0), ~Scalar (0), ~Scalar (0), ~Scalar (0) };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_set1_epi32 (-1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (~Scalar (0));
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vu32 Vu32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ -uint32_t (m0),
+ -uint32_t (m1),
+ -uint32_t (m2),
+ -uint32_t (m3),
+ };
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ )));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_u32 (1);
+ return vsubq_u32 (
+ vreinterpretq_u32_f32 (vcombine_f32 (v01, v23)),
+ one
+ );
+# endif // fstb_ARCHI
+#endif // Versions
+}
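+
+// Example for set_mask: set_mask (true, false, false, true) builds
+// { 0xFFFFFFFF, 0, 0, 0xFFFFFFFF }, a mask directly usable with select () or
+// swap_if ().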
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vu32 Vu32::compose (Vu32 a, Vu32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vu32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vu32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vu32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x)
+ ));
+ return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (b._x),
+ (1<<6) | (0<<4) | (3<<2) | (2<<0)
+ ));
+ case 3:
+ return _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ ),
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ )
+ ));
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ return vextq_u32 (a._x, b._x, POS);
+ }
+#endif // fstb_ARCHI
+}
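+
+// Example for compose: Vu32::compose <1> (Vu32 (a0, a1, a2, a3),
+// Vu32 (b0, b1, b2, b3)) gives Vu32 (a1, a2, a3, b0).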
+
+
+
+Vu32 Vu32::flip_msb (Vu32 x) noexcept
+{
+ return x ^ Vu32 (0x80000000U);
+}
+
+
+
+template <typename MEM>
+Vu32 Vu32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ return *reinterpret_cast <const Vu32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_load_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vld1q_u32 (reinterpret_cast <const uint32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vu32 Vu32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ return *reinterpret_cast <const Vu32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_loadu_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_u32_u8 (
+ vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+template <typename T>
+Vu32 operator << (Vu32 lhs, T rhs) noexcept
+{
+ lhs <<= rhs;
+ return lhs;
+}
+
+template <typename T>
+Vu32 operator >> (Vu32 lhs, T rhs) noexcept
+{
+ lhs >>= rhs;
+ return lhs;
+}
+
+
+
+Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] == rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] == rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] == rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] == rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vceqq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] != rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] != rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] != rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] != rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto eq = _mm_cmpeq_epi32 (lhs, rhs);
+ return _mm_xor_si128 (eq, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_u32 (vceqq_u32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] < rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] < rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] < rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] < rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return Vu32::flip_msb (_mm_cmplt_epi32 (
+ Vu32::flip_msb (lhs), Vu32::flip_msb (rhs)
+ ));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcltq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
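+
+// Note on the x86 path above: SSE2 has no unsigned 32-bit comparison, so the
+// sign bit of both operands is flipped (flip_msb) and the signed comparison
+// _mm_cmplt_epi32 is used instead; flipping the MSB maps the unsigned order
+// onto the signed order.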
+
+
+
+Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] <= rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] <= rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] <= rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] <= rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs < rhs) | (lhs == rhs);
+# else
+ return ~(lhs > rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcleq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+ return (rhs < lhs);
+}
+
+
+
+Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+ return (rhs <= lhs);
+}
+
+
+
+Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto gt = (lhs > rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( gt, rhs),
+ _mm_andnot_si128 (gt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto lt = (lhs < rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( lt, rhs),
+ _mm_andnot_si128 (lt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ return Vu32 {
+ (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]),
+ (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]),
+ (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]),
+ (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto cond_1 = _mm_and_si128 (cond, v_t);
+ const auto cond_0 = _mm_andnot_si128 (cond, v_f);
+ return _mm_or_si128 (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_u32 (cond, v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vu32, Vu32> swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond);
+ return std::make_tuple (
+ Vu32 (_mm_xor_si128 (lhs, inv)),
+ Vu32 (_mm_xor_si128 (rhs, inv))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ Vu32 (vbslq_u32 (cond, rhs, lhs)),
+ Vu32 (vbslq_u32 (cond, lhs, rhs))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vu32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/fnc.h b/src/fstb/fnc.h
index 1eee4ae..438ac12 100644
--- a/src/fstb/fnc.h
+++ b/src/fstb/fnc.h
@@ -72,6 +72,7 @@ template <class T>
inline constexpr bool is_eq (T v1, T v2, T eps = T (1e-9)) noexcept;
template <class T>
inline constexpr bool is_eq_rel (T v1, T v2, T tol = T (1e-6)) noexcept;
+inline constexpr bool is_eq_ulp (float v1, float v2, int32_t tol = 1) noexcept;
inline int get_prev_pow_2 (uint32_t x) noexcept;
inline int get_next_pow_2 (uint32_t x) noexcept;
inline constexpr double sinc (double x) noexcept;
diff --git a/src/fstb/fnc.hpp b/src/fstb/fnc.hpp
index 9ecb26b..c8b1b4c 100644
--- a/src/fstb/fnc.hpp
+++ b/src/fstb/fnc.hpp
@@ -565,6 +565,32 @@ constexpr bool is_eq_rel (T v1, T v2, T tol) noexcept
+// Equality test with a tolerance in ULP.
+// Numbers of opposite sign (excepted 0) are always evaluated as different.
+// https://en.wikipedia.org/wiki/Unit_in_the_last_place
+constexpr bool is_eq_ulp (float v1, float v2, int32_t tol) noexcept
+{
+ assert (tol >= 0);
+
+ if ((v1 < 0) != (v2 < 0))
+ {
+ return (v1 == v2);
+ }
+
+ union Combo
+ {
+ float _f;
+ int32_t _i;
+ };
+ const Combo c1 { v1 };
+ const Combo c2 { v2 };
+ const auto dif = std::abs (c2._i - c1._i);
+
+ return (dif <= tol);
+}
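+
+// Example: with the default tolerance of 1 ULP,
+// is_eq_ulp (1.0f, std::nextafter (1.0f, 2.0f)) is true, whereas
+// is_eq_ulp (1.0f, 1.0f + 1e-6f) is false (the values are about 8 ULP apart).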
+
+
+
/*
==============================================================================
Name: get_prev_pow2
@@ -1000,6 +1026,7 @@ constexpr T lerp (T v0, T v1, T p) noexcept
// f(x) = ((r3 + r1) / 2 - r2) * x^2 + ((r3 - r1) / 2) * x + r2
// The points must not be aligned so the extremum exists.
// It is not necessariy located between -1 and 1.
+// The value at this point is y = r2 + 0.25 * x * (r3 - r1)
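+// (With f(x) = a*x^2 + b*x + r2 and b = (r3 - r1) / 2, the extremum at
+// x = -b / (2*a) has value f(x) = r2 - b^2 / (4*a) = r2 + x * b / 2.)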
template <class T>
constexpr T find_extremum_pos_parabolic (T r1, T r2, T r3) noexcept
{
diff --git a/src/main-avs.cpp b/src/main-avs.cpp
index dd70871..b8350c0 100644
--- a/src/main-avs.cpp
+++ b/src/main-avs.cpp
@@ -1,7 +1,8 @@
-
+#if defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#define NOGDI
+#endif
#include "avsutl/fnc.h"
#include "fmtcavs/Bitdepth.h"
@@ -13,13 +14,24 @@
#include "fmtcavs/Transfer.h"
#include "fstb/def.h"
+#if defined (_WIN32)
+#include <windows.h>
+#else
+#include "avs/posix.h"
+#endif
#include "avisynth.h"
#if defined (_MSC_VER) && ! defined (NDEBUG) && defined (_DEBUG)
#include <crtdbg.h>
#endif
+#if defined (_WIN32)
+ #define AVS_EXPORT __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
+ #define AVS_EXPORT __attribute__((visibility("default")))
+#else
+ #define AVS_EXPORT
+#endif
template <class T>
@@ -34,7 +46,7 @@ ::AVSValue __cdecl main_avs_create (::AVSValue args, void *user_data_ptr, ::IScr
const ::AVS_Linkage * AVS_linkage = nullptr;
-extern "C" __declspec (dllexport)
+extern "C" AVS_EXPORT
const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const ::AVS_Linkage * const vectors_ptr)
{
AVS_linkage = vectors_ptr;
@@ -60,7 +72,8 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
env_ptr->AddFunction (fmtcavs_PRIMARIES,
"c" "[rs].+" "[gs].+" "[bs].+" // 0
"[ws].+" "[rd].+" "[gd].+" "[bd].+" // 4
- "[wd].+" "[prims]s" "[primd]s" "[cpuopt]i" // 8
+ "[wd].+" "[prims]s" "[primd]s" "[wconv]b" // 8
+ "[cpuopt]i" // 12
, &main_avs_create , nullptr
);
env_ptr->AddFunction (fmtcavs_RESAMPLE,
@@ -94,7 +107,7 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
}
-
+#if defined (_WIN32)
static void main_avs_dll_load (::HINSTANCE hinst)
{
fstb::unused (hinst);
@@ -156,3 +169,4 @@ BOOL WINAPI DllMain (::HINSTANCE hinst, ::DWORD reason, ::LPVOID reserved_ptr)
return TRUE;
}
+#endif
diff --git a/src/main-vs.cpp b/src/main-vs.cpp
index 2de215c..3283c3a 100644
--- a/src/main-vs.cpp
+++ b/src/main-vs.cpp
@@ -386,6 +386,7 @@ VS_EXTERNAL_API (void) VapourSynthPluginInit2 (::VSPlugin *plugin_ptr, const ::V
"wd:float[]:opt;"
"prims:data:opt;"
"primd:data:opt;"
+ "wconv:int:opt;"
"cpuopt:int:opt;"
, "clip:vnode;"
, &vsutl::Redirect ::create, nullptr, plugin_ptr