diff --git a/doc/fmtconv.html b/doc/fmtconv.html
index f9c4fb8..1be2df5 100644
--- a/doc/fmtconv.html
+++ b/doc/fmtconv.html
@@ -15,7 +15,7 @@ Abstract
Authors: | | Firesledge (aka Cretindesalpes) |
-Version: | | r29 |
+Version: | | r30 |
Download: | | http://ldesoras.free.fr/prod.html |
Category: | | Format tools |
Requirements: | | Vapoursynth r55 or Avisynth+ 3.7.0 |
@@ -129,7 +129,7 @@ Resizing and chroma subsampling conversions
Bobbing an interlaced stream (here, Top Field First):
c = c.std.SeparateFields (tff=True)
-c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=True, interlacedd=False)
+c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=1, interlacedd=0)
Converting a progressive stream from YUV 4:2:2 to 4:2:0 and back to 8 bits:
@@ -140,7 +140,7 @@ Resizing and chroma subsampling conversions
tff = True
c = c.std.SeparateFields (tff=tff)
-c = c.fmtc.resample (css="420", interlaced=True)
+c = c.fmtc.resample (css="420", interlaced=1)
c = c.fmtc.bitdepth (bits=8)
c = c.std.DoubleWeave (tff=tff)
c = c.std.SelectEvery (cycle=2, offsets=0)
@@ -623,7 +623,7 @@ matrix
primaries to perform the intermediary conversion.
The _ColorRange
frame property is set if the fulld
-parameter has been explicitely defined.
+parameter has been explicitly defined or if a preset is used.
If the destination colorspace is a standardized one (as deduced from the
specified matrix), the _Matrix
and _ColorSpace
properties are set, otherwise they are deleted from the frame.
@@ -959,6 +959,7 @@ primaries
wd : float[]: opt;
prims : data : opt;
primd : data : opt;
+ wconv : int : opt; (False)
cpuopt: int : opt; (-1)
)
fmtc_primaries (
@@ -973,6 +974,7 @@ primaries
arrayf wd (undefined),
string prims (undefined),
string primd (undefined),
+ bool wconv (False),
int cpuopt (-1)
) |
@@ -1070,6 +1072,21 @@ Parameters
"redwide" | R G B W (D65) | 0.780308, 0.121595, 0.095612, 0.3217, | 0.304253 1.493994 −0.084589 0.3290 | REDWideGamutRGB |
+wconv
+Indicates we want a full conversion for the white point.
+If set to False, chromatic adaptation will be used, so the
+white will stay white on the destination illuminant and colors will be adapted
+to implement a real illuminant change.
+This is generally what you want when converting between gamuts: the eye adapts
+to the new white and colors should be matched accordingly.
+If set to True, the chromatic adaptation is bypassed.
+The white from the source colorspace will appear with a tint if the target
+colorspace has a different white point.
+Use this if you want to emulate what a picture displayed with a monitor using
+the source illuminant looks like on a display using the target illuminant.
+This is also what you want when converting to and from XYZ for further
+operations in this colorspace.
+
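A usage sketch (illustration only, not part of the patch; the preset names are assumptions, wconv is the parameter described above):
# Default: chromatic adaptation, white stays white on the destination illuminant
c = c.fmtc.primaries (prims="p3dci", primd="709")
# Full conversion: the DCI white point keeps its tint on the D65 destination
c = c.fmtc.primaries (prims="p3dci", primd="709", wconv=1)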
cpuopt
Limits the CPU instruction set.
−1: automatic (no limitation),
@@ -1275,7 +1292,7 @@
Parameters
Clip to be resized. Mandatory.
Supported input formats:
-- 8-, 9-, 10-, 12-, 16- and 16-bit integer.
+- 8-, 9-, 10-, 12-, 14- and 16-bit integer.
- 32-bit floating point.
- Any planar colorspace.
@@ -1872,9 +1889,15 @@ Parameters
Indicate the peak white levels in cd/m2.
lws is for the source transfer function, and lwd
for the destination one.
-These parameters are taken into account when display-referred transfer
-functions are used.
-Minimum lw value is 0.1 cd/m2.
+These parameters are taken into account to scale the luminance when the
+following conditions are met:
+
+- display-referred transfer functions are used,
+
+- match is set to 2 (display luminance matching) and
+- the EOTF shouldn’t specify any scale for the luminance (the cd/m² value
+for F’ = 1.0).
+
+Minimum lw value is 0.1 cd/m2.
System gamma may be changed according to the lw parameter.
Unless specified, HDR functions use a peak white of 1000 cd/m2.
Similarly, SDR and other functions use 100 cd/m2 by default.
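For example (a sketch, not from the patch; the transfer preset names are assumptions, only lws, lwd and match come from the description above):
# BT.1886 content graded on a 100 cd/m2 display, rescaled for a 250 cd/m2 sRGB display
c = c.fmtc.transfer (transs="1886", transd="srgb", lws=100, lwd=250, match=2)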
@@ -1973,11 +1996,19 @@
IV) Troubleshooting
V) Changelog
-r30, 2022-xx-xx
+r31, 202x-xx-xx
+
+resample: fixed 14 to 16 bit AVX2 conversion path, thanks to NSQY for the report.
+
+
+r30, 2022-08-29
+matrix: The _ColorRange frame property is now set when a matrix preset is used.
transfer: Added ACEScct transfer function.
primaries: Added DCI P3+ and Cinema Gamut presets.
-- Changed the configure options to compile with Clang.
+primaries: Added wconv parameter for full conversion.
+- Changed the configure options to compile with Clang.
+- Updated datatypes in the examples.
r29, 2022-04-11
diff --git a/src/avisynth.h b/src/avisynth.h
index 679ac2a..b779c49 100644
--- a/src/avisynth.h
+++ b/src/avisynth.h
@@ -442,17 +442,17 @@ extern const AVS_Linkage* AVS_linkage;
# endif
# define AVS_BakedCode(arg) { arg ; }
-# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg))
-# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg))
-# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg))
+# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg))
+# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg))
+# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg))
// Helper macros for fallback option when a function does not exists
#define CALL_MEMBER_FN(object,ptrToMember) ((object)->*(ptrToMember))
#define AVS_LinkCallOpt(arg, argOpt) !AVS_linkage ? 0 : \
- ( offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? \
- (offsetof(AVS_Linkage, argOpt) >= AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \
+ ( offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? \
+ (offsetof(AVS_Linkage, argOpt) >= (size_t)AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \
CALL_MEMBER_FN(this, AVS_linkage->arg)() )
// AVS_LinkCallOptDefault puts automatically () only after arg
-# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))())
+# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))())
#endif
@@ -1299,7 +1299,7 @@ class GenericVideoFilter : public IClip {
void __stdcall GetAudio(void* buf, int64_t start, int64_t count, IScriptEnvironment* env) { child->GetAudio(buf, start, count, env); }
const VideoInfo& __stdcall GetVideoInfo() { return vi; }
bool __stdcall GetParity(int n) { return child->GetParity(n); }
- int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; }; // We do not pass cache requests upwards, only to the next filter.
+ int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; } // We do not pass cache requests upwards, only to the next filter.
};
@@ -1864,7 +1864,7 @@ struct PNeoEnv {
#if defined(BUILDING_AVSCORE) || defined(AVS_STATIC_LIB)
;
#else
- : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= AVS_linkage->Size ? 0 : AVS_linkage->GetNeoEnv(env)) { }
+ : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= (size_t)AVS_linkage->Size ? 0 : AVS_linkage->GetNeoEnv(env)) { }
#endif
int operator!() const { return !p; }
diff --git a/src/fmtc/Matrix_vs.cpp b/src/fmtc/Matrix_vs.cpp
index 07c8c8c..2c08360 100644
--- a/src/fmtc/Matrix_vs.cpp
+++ b/src/fmtc/Matrix_vs.cpp
@@ -146,6 +146,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
const int nbr_expected_coef = _nbr_planes * (_nbr_planes + 1);
bool mat_init_flag = false;
+ bool preset_flag = false;
// Matrix presets
std::string mat (get_arg_str (in, out, "mat", ""));
@@ -182,6 +183,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
_mat_main = m2d * m2s;
mat_init_flag = true;
+ preset_flag = true;
}
// Custom coefficients
@@ -309,6 +311,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC
vsutl::is_full_range_default (fmt_dst) ? 1 : 0,
0, &_range_set_dst_flag
) != 0);
+ _range_set_dst_flag |= preset_flag;
prepare_matrix_coef (
*this, *_proc_uptr, _mat_main,
diff --git a/src/fmtc/Primaries_vs.cpp b/src/fmtc/Primaries_vs.cpp
index ced609e..e5e2089 100644
--- a/src/fmtc/Primaries_vs.cpp
+++ b/src/fmtc/Primaries_vs.cpp
@@ -105,8 +105,10 @@ Primaries::Primaries (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VS
init (_prim_d, *this, in, out, "rd", "gd", "bd", "wd");
assert (_prim_d.is_ready ());
+ const auto conv_flag = (get_arg_int (in, out, "wconv", 0) != 0);
+
const fmtcl::Mat3 mat_conv =
- fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d);
+ fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag);
_mat_main.insert3 (mat_conv);
_mat_main.clean3 (1);
diff --git a/src/fmtc/Resample_vs.cpp b/src/fmtc/Resample_vs.cpp
index 3d7e649..971343f 100644
--- a/src/fmtc/Resample_vs.cpp
+++ b/src/fmtc/Resample_vs.cpp
@@ -628,7 +628,7 @@ const ::VSFrame * Resample::get_frame (int n, int activation_reason, void * &fra
if (ret_val != 0)
{
_vsapi.freeFrame (dst_ptr);
- dst_ptr = 0;
+ dst_ptr = nullptr;
}
}
@@ -680,7 +680,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr
{
const Ru::FrameInfo & frame_info =
 *reinterpret_cast <const Ru::FrameInfo *> (frame_data_ptr);
- process_plane_proc (
+ ret_val = process_plane_proc (
dst, n, plane_index, frame_ctx, src_node1_sptr, frame_info
);
}
@@ -688,7 +688,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr
// Copy (and convert)
else if (proc_mode == vsutl::PlaneProcMode_COPY1)
{
- process_plane_copy (
+ ret_val = process_plane_copy (
dst, n, plane_index, frame_ctx, src_node1_sptr
);
}
diff --git a/src/fmtc/version.h b/src/fmtc/version.h
index 4a4d319..a209cf7 100644
--- a/src/fmtc/version.h
+++ b/src/fmtc/version.h
@@ -1,5 +1,5 @@
#pragma once
-#define fmtc_VERSION 29
+#define fmtc_VERSION 30
#define fmtc_PLUGIN_NAME "fmtconv"
#define fmtc_NAMESPACE "fmtc"
diff --git a/src/fmtcavs/Matrix_avs.cpp b/src/fmtcavs/Matrix_avs.cpp
index db75fcb..b673a79 100644
--- a/src/fmtcavs/Matrix_avs.cpp
+++ b/src/fmtcavs/Matrix_avs.cpp
@@ -73,7 +73,6 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
{
env.ThrowError (fmtcavs_MATRIX ": input must be 4:4:4.");
}
- const int nbr_planes_src = _vi_src.NumComponents ();
if (fmt_src.get_nbr_comp_non_alpha () != _nbr_planes_proc)
{
env.ThrowError (
@@ -129,6 +128,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
const int nbr_expected_coef = _nbr_planes_proc * (_nbr_planes_proc + 1);
bool mat_init_flag = false;
+ bool preset_flag = false;
fmtcl::Mat4 mat_main; // Main matrix, float input, float output
// Matrix presets
@@ -169,6 +169,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
mat_main = m2d * m2s;
mat_init_flag = true;
+ preset_flag = true;
}
// Alpha plane processing, if any
@@ -267,7 +268,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args)
_fulld_flag = args [Param_FULLD].AsBool (
fmtcl::is_full_range_default (fmt_dst.get_col_fam ())
);
- _range_def_flag = args [Param_FULLD].Defined ();
+ _range_def_flag = (args [Param_FULLD].Defined () || preset_flag);
prepare_matrix_coef (
env, *_proc_uptr, mat_main,
diff --git a/src/fmtcavs/Primaries.h b/src/fmtcavs/Primaries.h
index e1d067b..e3d248e 100644
--- a/src/fmtcavs/Primaries.h
+++ b/src/fmtcavs/Primaries.h
@@ -62,6 +62,7 @@ class Primaries
Param_WD,
Param_PRIMS,
Param_PRIMD,
+ Param_WCONV,
Param_CPUOPT,
Param_NBR_ELT
diff --git a/src/fmtcavs/Primaries_avs.cpp b/src/fmtcavs/Primaries_avs.cpp
index 85ae49f..989148a 100644
--- a/src/fmtcavs/Primaries_avs.cpp
+++ b/src/fmtcavs/Primaries_avs.cpp
@@ -110,8 +110,10 @@ Primaries::Primaries (::IScriptEnvironment &env, const ::AVSValue &args)
init (_prim_d, env, args, Param_RD, Param_GD, Param_BD, Param_WD);
assert (_prim_d.is_ready ());
+ const auto conv_flag = args [Param_WCONV].AsBool (false);
+
const fmtcl::Mat3 mat_conv =
- fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d);
+ fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag);
_mat_main.insert3 (mat_conv);
_mat_main.clean3 (1);
diff --git a/src/fmtcl/BitBltConv_avx2.cpp b/src/fmtcl/BitBltConv_avx2.cpp
index 3dd2e7c..cad5821 100644
--- a/src/fmtcl/BitBltConv_avx2.cpp
+++ b/src/fmtcl/BitBltConv_avx2.cpp
@@ -75,11 +75,13 @@ void BitBltConv::bitblt_int_to_flt_avx2_switch (uint8_t *dst_ptr, ptrdiff_t dst_
switch ((scale_flag << 16) + (src_fmt << 8) + src_res)
{
fmtcl_BitBltConv_CASE (false, INT16 , 16, i16)
+ fmtcl_BitBltConv_CASE (false, INT16 , 14, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 12, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 10, i16)
fmtcl_BitBltConv_CASE (false, INT16 , 9, i16)
fmtcl_BitBltConv_CASE (false, INT8 , 8, i08)
fmtcl_BitBltConv_CASE (true , INT16 , 16, i16)
+ fmtcl_BitBltConv_CASE (true , INT16 , 14, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 12, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 10, i16)
fmtcl_BitBltConv_CASE (true , INT16 , 9, i16)
@@ -154,10 +156,15 @@ void BitBltConv::bitblt_int_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_r
switch ((dst_fmt << 20) + (src_fmt << 16) + (dst_res << 8) + src_res)
{
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 14, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 12, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 10, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 9, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT8 , 16, 8, i16, i08)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 12, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 10, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 9, i16, i16)
+ fmtcl_BitBltConv_CASE (INT16 , INT8 , 14, 8, i16, i08)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 10, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 9, i16, i16)
fmtcl_BitBltConv_CASE (INT16 , INT8 , 12, 8, i16, i08)
diff --git a/src/fmtcl/KernelData.cpp b/src/fmtcl/KernelData.cpp
index be4fc13..bf6a83e 100644
--- a/src/fmtcl/KernelData.cpp
+++ b/src/fmtcl/KernelData.cpp
@@ -46,6 +46,7 @@ To Public License, Version 2, as published by Sam Hocevar. See
#include "fstb/def.h"
#include "fstb/fnc.h"
+#include <algorithm>
#include
#include
@@ -296,6 +297,7 @@ void KernelData::invert_kernel (int taps)
assert (ovr_f * support >= taps);
int len = fstb::ceil_int (ovr_s * ovr_f * support) * 2;
len = 1 << (fstb::get_prev_pow_2 (len - 1) + 1); // Next power of 2
+ len = std::max (len, 1); // Shouldn't happen but GCC emits a warning later
const int h_len = len / 2;
std::vector x (len);
diff --git a/src/fmtcl/PrimUtil.cpp b/src/fmtcl/PrimUtil.cpp
index 88ab707..30c6b0a 100644
--- a/src/fmtcl/PrimUtil.cpp
+++ b/src/fmtcl/PrimUtil.cpp
@@ -45,13 +45,20 @@ constexpr int PrimUtil::_nbr_planes;
-Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d)
+// conv_flag indicates we want a full conversion, not a chromatic adaptation
+Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag)
{
assert (prim_s.is_ready ());
assert (prim_d.is_ready ());
const Mat3 rgb2xyz = compute_rgb2xyz (prim_s);
const Mat3 xyz2rgb = compute_rgb2xyz (prim_d).invert ();
+
+ if (conv_flag)
+ {
+ return xyz2rgb * rgb2xyz;
+ }
+
const Mat3 adapt = compute_chroma_adapt (prim_s, prim_d);
return xyz2rgb * adapt * rgb2xyz;
diff --git a/src/fmtcl/PrimUtil.h b/src/fmtcl/PrimUtil.h
index 72cf11d..2cbf773 100644
--- a/src/fmtcl/PrimUtil.h
+++ b/src/fmtcl/PrimUtil.h
@@ -44,7 +44,7 @@ class PrimUtil
static constexpr int _nbr_planes = RgbSystem::_nbr_planes;
- static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d);
+ static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag);
static Mat3 compute_rgb2xyz (const RgbSystem &prim);
static Mat3 compute_chroma_adapt (const RgbSystem &prim_s, const RgbSystem &prim_d);
static Vec3 conv_xy_to_xyz (const RgbSystem::Vec2 &xy);
diff --git a/src/fstb/Hash.h b/src/fstb/Hash.h
index b18d527..ed281d8 100644
--- a/src/fstb/Hash.h
+++ b/src/fstb/Hash.h
@@ -45,6 +45,7 @@ To Public License, Version 2, as published by Sam Hocevar. See
/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
#include "fstb/def.h"
+#include "fstb/Vu32.h"
#include
@@ -64,8 +65,12 @@ class Hash
static fstb_FORCEINLINE constexpr uint32_t
hash (uint32_t x) noexcept;
+ static fstb_FORCEINLINE Vu32
+ hash (Vu32 x) noexcept;
static fstb_FORCEINLINE constexpr uint32_t
hash_inv (uint32_t x) noexcept;
+ static fstb_FORCEINLINE Vu32
+ hash_inv (Vu32 x) noexcept;
static fstb_FORCEINLINE constexpr uint64_t
hash (uint64_t x) noexcept;
diff --git a/src/fstb/Hash.hpp b/src/fstb/Hash.hpp
index 3a4a3e7..c6644fc 100644
--- a/src/fstb/Hash.hpp
+++ b/src/fstb/Hash.hpp
@@ -122,6 +122,19 @@ constexpr uint32_t Hash::hash (uint32_t x) noexcept
+Vu32 Hash::hash (Vu32 x) noexcept
+{
+ x ^= x >> 16;
+ x *= uint32_t (0x7FEB352Dlu);
+ x ^= x >> 15;
+ x *= uint32_t (0x846CA68Blu);
+ x ^= x >> 16;
+
+ return x;
+}
+
+
+
constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept
{
#if 0
@@ -143,6 +156,19 @@ constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept
+Vu32 Hash::hash_inv (Vu32 x) noexcept
+{
+ x ^= x >> 16;
+ x *= uint32_t (0x43021123lu);
+ x ^= x >> 15 ^ x >> 30;
+ x *= uint32_t (0x1D69E2A5lu);
+ x ^= x >> 16;
+
+ return x;
+}
+
+
+
// SplittableRandom / SplitMix64
constexpr uint64_t Hash::hash (uint64_t x) noexcept
{
diff --git a/src/fstb/Vf32.h b/src/fstb/Vf32.h
new file mode 100644
index 0000000..1a7203d
--- /dev/null
+++ b/src/fstb/Vf32.h
@@ -0,0 +1,356 @@
+/*****************************************************************************
+
+ Vf32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vf32_HEADER_INCLUDED)
+#define fstb_Vf32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+ #include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+ #include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+ #include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <float, 4> Vf32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128 Vf32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef float32x4_t Vf32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vf32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef float Scalar;
+
+ Vf32 () = default;
+ fstb_FORCEINLINE
+ Vf32 (Vf32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vf32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (double a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (int a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+ Vf32 (const std::tuple <float, float, float, float> &a) noexcept;
+ Vf32 (const Vf32 &other) = default;
+ Vf32 (Vf32 &&other) = default;
+ ~Vf32 () = default;
+ Vf32 & operator = (const Vf32 &other) = default;
+ Vf32 & operator = (Vf32 &&other) = default;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store_part (MEM *ptr, int n) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_pair (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_scalar (MEM *ptr) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vf32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ operator += (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator -= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator *= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator /= (const Vf32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ operator &= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator |= (const Vf32Native &other) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ operator ^= (const Vf32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vf32 &
+ mac (Vf32 a, Vf32 b) noexcept;
+ fstb_FORCEINLINE Vf32 &
+ msu (Vf32 a, Vf32 b) noexcept;
+
+ fstb_FORCEINLINE Vf32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vf32
+ reverse () const noexcept;
+ fstb_FORCEINLINE Vf32
+ swap_pairs () const noexcept;
+ fstb_FORCEINLINE Vf32
+ monofy_pairs_lo () const noexcept;
+ fstb_FORCEINLINE Vf32
+ monofy_pairs_hi () const noexcept;
+
+ fstb_FORCEINLINE Vf32
+ butterfly_w64 () const noexcept;
+ fstb_FORCEINLINE Vf32
+ butterfly_w32 () const noexcept;
+
+ template <int SHIFT>
+ fstb_FORCEINLINE Vf32
+ rotate () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE float
+ extract () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vf32
+ insert (float val) const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vf32
+ spread () const noexcept;
+
+ fstb_FORCEINLINE Vf32
+ round () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rcp_approx () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rcp_approx2 () const noexcept;
+ fstb_FORCEINLINE Vf32
+ div_approx (const Vf32 &d) const noexcept;
+ fstb_FORCEINLINE Vf32
+ sqrt_approx () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rsqrt () const noexcept;
+ fstb_FORCEINLINE Vf32
+ rsqrt_approx () const noexcept;
+ template <typename P>
+ fstb_FORCEINLINE Vf32
+ log2_base (P poly) const noexcept;
+ template <typename P>
+ fstb_FORCEINLINE Vf32
+ exp2_base (P poly) const noexcept;
+ fstb_FORCEINLINE Vf32
+ signbit () const noexcept;
+ fstb_FORCEINLINE Vf32
+ is_lt_0 () const noexcept;
+
+ fstb_FORCEINLINE std::tuple <float, float, float, float>
+ explode () const noexcept;
+ fstb_FORCEINLINE std::tuple <float, float>
+ extract_pair () const noexcept;
+ fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ spread_pairs () const noexcept;
+
+ fstb_FORCEINLINE float
+ sum_h () const noexcept;
+ fstb_FORCEINLINE float
+ min_h () const noexcept;
+ fstb_FORCEINLINE float
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+
+ static fstb_FORCEINLINE Vf32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vf32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair (float a0, float a1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair_fill (float a02, float a13) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_pair_dbl (float a01, float a23) noexcept;
+ static fstb_FORCEINLINE Vf32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+ static fstb_FORCEINLINE Vf32Native
+ signbit_mask () noexcept;
+ static fstb_FORCEINLINE Vf32
+ interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ interleave (Vf32 p0, Vf32 p1) noexcept;
+ static fstb_FORCEINLINE std::tuple <Vf32, Vf32>
+ deinterleave (Vf32 i0, Vf32 i1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ deinterleave_lo (Vf32 i0, Vf32 i1) noexcept;
+ static fstb_FORCEINLINE Vf32
+ deinterleave_hi (Vf32 i0, Vf32 i1) noexcept;
+ template
+ static fstb_FORCEINLINE Vf32
+ compose (Vf32 a, Vf32 b) noexcept;
+
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ load (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu_part (const MEM *ptr, int n) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vf32
+ loadu_pair (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+ static constexpr int32_t _sign32 = INT32_MIN;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part_n13 (MEM *ptr, int n) const noexcept;
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+ union Combo
+ {
+ Vf32Native _vf32;
+ int32_t _s32 [_length];
+ uint32_t _u32 [_length];
+ };
+ static_assert (
+ sizeof (Combo) == sizeof (Vf32Native),
+ "Wrong size for the wrapping combo structure"
+ );
+#endif
+ Vf32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vf32
+
+static_assert (
+ sizeof (Vf32) == sizeof (Vf32Native),
+ "Wrong size for the wrapping structure"
+);
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vf32 abs (const Vf32 &v) noexcept;
+fstb_FORCEINLINE Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept;
+fstb_FORCEINLINE Vf32 round (const Vf32 &v) noexcept;
+fstb_FORCEINLINE Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept;
+fstb_FORCEINLINE Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept;
+fstb_FORCEINLINE Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vf32, Vf32> swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept;
+fstb_FORCEINLINE Vf32 sqrt (Vf32 v) noexcept;
+fstb_FORCEINLINE Vf32 log2 (Vf32 v) noexcept;
+fstb_FORCEINLINE Vf32 exp2 (Vf32 v) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vf32.hpp"
+
+
+
+#endif // fstb_Vf32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vf32.hpp b/src/fstb/Vf32.hpp
new file mode 100644
index 0000000..385cb0b
--- /dev/null
+++ b/src/fstb/Vf32.hpp
@@ -0,0 +1,2181 @@
+/*****************************************************************************
+
+ Vf32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vf32_CODEHEADER_INCLUDED)
+#define fstb_Vf32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/fnc.h"
+
+#include
+
+#include
+#include
+#include
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (a) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (double a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (Scalar (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (Scalar (a)) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a | a | a | a
+Vf32::Vf32 (int a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_ps (Scalar (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_f32 (Scalar (a)) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a0 | a1 | a2 | a3
+Vf32::Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_ps (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Initialises with a0 | a1 | a2 | a3
+Vf32::Vf32 (const std::tuple <float, float, float, float> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_ps (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vf32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ps (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_f32 (reinterpret_cast (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::store_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ store (ptr);
+ }
+ else
+ {
+ storeu_part_n13 (ptr, n);
+ }
+}
+
+
+
+template <typename MEM>
+void Vf32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_ps (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u8 (reinterpret_cast (ptr), vreinterpretq_u8_f32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ storeu (ptr);
+ return;
+ }
+
+ storeu_part_n13 (ptr, n);
+}
+
+
+
+// ptr [0] = v0
+// ptr [1] = v1
+template <typename MEM>
+void Vf32::storeu_pair (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ auto p = reinterpret_cast (ptr);
+ p [0] = _x [0];
+ p [1] = _x [1];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ss (reinterpret_cast (ptr) , _x );
+ const auto v1 = _mm_shuffle_ps (_x, _x, 1 << 0);
+ _mm_store_ss (reinterpret_cast (ptr) + 1, v1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1_u8 (
+ reinterpret_cast (ptr),
+ vreinterpret_u8_f32 (vget_low_f32 (_x))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+// *ptr = v0
+template <typename MEM>
+void Vf32::storeu_scalar (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ reinterpret_cast (ptr) [0] = _x [0];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_ss (reinterpret_cast (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_lane_f32 (reinterpret_cast (ptr), _x, 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vf32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vf32 & Vf32::operator += (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator -= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator *= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_mul_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_f32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator /= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] /= other [0];
+ _x [1] /= other [1];
+ _x [2] /= other [2];
+ _x [3] /= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_div_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = _x * (Vf32 { other }.rcp_approx2 ())._x;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator &= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] &= ar._s32 [0];
+ al._s32 [1] &= ar._s32 [1];
+ al._s32 [2] &= ar._s32 [2];
+ al._s32 [3] &= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator |= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] |= ar._s32 [0];
+ al._s32 [1] |= ar._s32 [1];
+ al._s32 [2] |= ar._s32 [2];
+ al._s32 [3] |= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (vorrq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 & Vf32::operator ^= (const Vf32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo al { _x };
+ Combo ar { other };
+ al._s32 [0] ^= ar._s32 [0];
+ al._s32 [1] ^= ar._s32 [1];
+ al._s32 [2] ^= ar._s32 [2];
+ al._s32 [3] ^= ar._s32 [3];
+ _x = al._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_ps (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vreinterpretq_f32_u32 (veorq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (other)
+ ));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// *this += a * b
+Vf32 & Vf32::mac (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += a._x [0] * b._x [0];
+ _x [1] += a._x [1] * b._x [1];
+ _x [2] += a._x [2] * b._x [2];
+ _x [3] += a._x [3] * b._x [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_ps (_x, _mm_mul_ps (a, b));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ _x = vfmaq_f32 (_x, a, b);
+ #else
+ _x = vmlaq_f32 (_x, a, b);
+ #endif
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// *this -= a * b
+Vf32 & Vf32::msu (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= a._x [0] * b._x [0];
+ _x [1] -= a._x [1] * b._x [1];
+ _x [2] -= a._x [2] * b._x [2];
+ _x [3] -= a._x [3] * b._x [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_ps (_x, _mm_mul_ps (a, b));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ _x = vfmsq_f32 (_x, a, b);
+ #else
+ _x = vmlsq_f32 (_x, a, b);
+ #endif
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vf32 Vf32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_ps (_x, signbit_mask ());
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vnegq_f32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_f32 (vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::swap_pairs () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [2], _x [3], _x [0], _x [1] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vget_low_f32 (_x);
+ const float32x2_t v23 = vget_high_f32 (_x);
+ return vcombine_f32 (v23, v01);
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> a, a, c, c
+Vf32 Vf32::monofy_pairs_lo () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [0], _x [0], _x [2], _x [2] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0xA0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (_x, _x).val [0];
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> b, b, d, d
+Vf32 Vf32::monofy_pairs_hi () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { _x [1], _x [1], _x [3], _x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0xF5);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (_x, _x).val [1];
+#endif // fstb_ARCHI
+}
+
+
+
+// a, b, c, d -> a+c, b+d, a-c, b-d
+Vf32 Vf32::butterfly_w64 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] + _x [2],
+ _x [1] + _x [3],
+ _x [0] - _x [2],
+ _x [1] - _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, 0, _sign32, _sign32));
+ const auto x0 = _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6)); // c, d, a, b
+ const auto x1 = _mm_xor_ps (_x, sign); // a, b, -c, -d
+ return x0 + x1;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto sign = int32x4_t { 0, 0, _sign32, _sign32 };
+ const auto x0 = vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); // c, d, a, b
+ const auto x1 = // a, b, -c, -d
+ vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign));
+ return x0 + x1;
+#endif
+}
+
+
+
+// a, b, c, d -> a+b, a-b, c+d, c-d
+Vf32 Vf32::butterfly_w32 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] + _x [1],
+ _x [0] - _x [1],
+ _x [2] + _x [3],
+ _x [2] - _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, _sign32, 0, _sign32));
+ const auto x0 = _mm_shuffle_ps (_x, _x, (1<<0) + (0<<2) + (3<<4) + (2<<6)); // b, a, d, c
+ const auto x1 = _mm_xor_ps (_x, sign); // a, -b, c, -d
+ return x0 + x1;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto sign = int32x4_t { 0, _sign32, 0, _sign32 };
+ const auto x0 = vrev64q_f32 (_x); // b, a, d, c
+ const auto x1 = // a, -b, c, -d
+ vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign));
+ return x0 + x1;
+#endif
+}
+
+
+
+// Positive = to the left, rotates towards the higher indexes
+template <int SHIFT>
+Vf32 Vf32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_ps (_x, _x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_ps (_x, _x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_ps (_x, _x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x4_t aa = vreinterpretq_s32_f32 (_x);
+ switch (SHIFT & 3)
+ {
+ case 1: aa = vextq_s32 (aa, aa, 3); break;
+ case 2: aa = vextq_s32 (aa, aa, 2); break;
+ case 3: aa = vextq_s32 (aa, aa, 1); break;
+ default: /* Nothing */ break;
+ }
+ return vreinterpretq_f32_s32 (aa);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+float Vf32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_ps (a, a, 1); break;
+ case 2: a = _mm_shuffle_ps (a, a, 2); break;
+ case 3: a = _mm_shuffle_ps (a, a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return _mm_cvtss_f32 (a);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_f32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vf32 Vf32::insert (float val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_move_ss (a._x, _mm_set_ss (val));
+ return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_f32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vf32 Vf32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_x, _x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_f32 (vgetq_lane_f32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+// Assumes "to nearest" rounding mode on x86
+Vf32 Vf32::round () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ roundf (_x [0]),
+ roundf (_x [1]),
+ roundf (_x [2]),
+ roundf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cvtepi32_ps (_mm_cvtps_epi32 (_x));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto zero = vdupq_n_f32 ( 0.0f);
+ const auto m = vdupq_n_f32 (-0.5f);
+ const auto p = vdupq_n_f32 (+0.5f);
+ const auto gt0 = vcgtq_f32 (_x, zero);
+ const auto u = vbslq_f32 (gt0, p, m);
+ return vcvtq_f32_s32 (vcvtq_s32_f32 (vaddq_f32 (_x, u)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rcp_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ 1.f / _x [0],
+ 1.f / _x [1],
+ 1.f / _x [2],
+ 1.f / _x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_rcp_ps (_x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto r = vrecpeq_f32 (_x);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ return r;
+#endif // fstb_ARCHI
+}
+
+
+
+// With more accuracy
+Vf32 Vf32::rcp_approx2 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return rcp_approx ();
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto r = _mm_rcp_ps (_x);
+ r = r * (_mm_set1_ps (2.f) - r * _x);
+ return r;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto r = vrecpeq_f32 (_x);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ r = vmulq_f32 (vrecpsq_f32 (_x, r), r);
+ return r;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::div_approx (const Vf32 &d) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ _x [0] / d._x [0],
+ _x [1] / d._x [1],
+ _x [2] / d._x [2],
+ _x [3] / d._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_div_ps (_x, d._x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return _x * d.rcp_approx ()._x;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::sqrt_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ sqrtf (_x [0]),
+ sqrtf (_x [1]),
+ sqrtf (_x [2]),
+ sqrtf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Zero and denormal values will produce INF with _mm_rsqrt_ps(), so
+ // we need a mask.
+ const __m128 z_flag = _mm_cmplt_ps (_x, _mm_set1_ps (FLT_MIN));
+ const __m128 rsqrt_a = _mm_rsqrt_ps (_x);
+ const __m128 sqrt_a = _mm_mul_ps (_x, rsqrt_a);
+ const __m128 sqrt_m = _mm_andnot_ps (z_flag, sqrt_a);
+ return sqrt_m;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x4_t nz_flag = vtstq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vreinterpretq_u32_f32 (_x)
+ );
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ const auto sqrt_a = rs * float32x4_t (_x);
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (sqrt_a),
+ nz_flag
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rsqrt () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ 1.f / sqrtf (_x [0]),
+ 1.f / sqrtf (_x [1]),
+ 1.f / sqrtf (_x [2]),
+ 1.f / sqrtf (_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ __m128 rs = _mm_rsqrt_ps (_x);
+ rs = _mm_set1_ps (0.5f) * rs * (_mm_set1_ps (3) - __m128 (_x) * rs * rs);
+ return rs;
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ return rs;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::rsqrt_approx () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // Ref:
+ // Jan Kadlec, http://rrrola.wz.cz/inv_sqrt.html, 2010
+ const auto xh = (*this) * Vf32 (0.703952253f);
+ Combo c { _x };
+ c._s32 [0] = 0x5F1FFFF9 - (c._s32 [0] >> 1);
+ c._s32 [1] = 0x5F1FFFF9 - (c._s32 [1] >> 1);
+ c._s32 [2] = 0x5F1FFFF9 - (c._s32 [2] >> 1);
+ c._s32 [3] = 0x5F1FFFF9 - (c._s32 [3] >> 1);
+ auto rs = Vf32 { c._vf32 };
+ rs *= Vf32 (1.681914091f) - xh * rs * rs;
+ return rs;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_rsqrt_ps (_x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto rs = vrsqrteq_f32 (_x);
+ rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs);
+ return rs;
+#endif // fstb_ARCHI
+}
+
+
+
+// poly is a user-provided Vf32 log2 approximation from [1 ; 2[ to [0 ; 1[
+template <typename P>
+Vf32 Vf32::log2_base (P poly) const noexcept
+{
+ const int32_t log2_sub = 127;
+
+#if ! defined (fstb_HAS_SIMD)
+
+ assert (
+ _x [0] > 0
+ && _x [1] > 0
+ && _x [2] > 0
+ && _x [3] > 0
+ );
+ Combo c { _x };
+ const int x0 = c._s32 [0];
+ const int x1 = c._s32 [1];
+ const int x2 = c._s32 [2];
+ const int x3 = c._s32 [3];
+ const Vf32 log2_int {
+ float (((x0 >> 23) & 255) - log2_sub),
+ float (((x1 >> 23) & 255) - log2_sub),
+ float (((x2 >> 23) & 255) - log2_sub),
+ float (((x3 >> 23) & 255) - log2_sub)
+ };
+ c._s32 [0] = (x0 & ~(255 << 23)) + (127 << 23);
+ c._s32 [1] = (x1 & ~(255 << 23)) + (127 << 23);
+ c._s32 [2] = (x2 & ~(255 << 23)) + (127 << 23);
+ c._s32 [3] = (x3 & ~(255 << 23)) + (127 << 23);
+ Vf32 part { c._vf32 };
+
+#else // fstb_HAS_SIMD
+
+#if fstb_ARCHI == fstb_ARCHI_X86
+
+ // Extracts the exponent
+ __m128i xi = _mm_castps_si128 (_x);
+ xi = _mm_srli_epi32 (xi, 23);
+ const __m128i l2_sub = _mm_set1_epi32 (log2_sub);
+ xi = _mm_sub_epi32 (xi, l2_sub);
+ const auto log2_int = Vf32 { _mm_cvtepi32_ps (xi) };
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ int32x4_t xi = vreinterpretq_s32_f32 (_x);
+ xi = vshrq_n_s32 (xi, 23);
+ const int32x4_t l2_sub = vdupq_n_s32 (log2_sub);
+ xi -= l2_sub;
+ const auto log2_int = Vf32 { vcvtq_f32_s32 (xi) };
+
+#endif // fstb_ARCHI
+
+ // Extracts the multiplicative part in [1 ; 2[
+ const auto mask_mantissa = Vf32 (1.17549421e-38f); // Binary: (1 << 23) - 1
+ auto part = _x & mask_mantissa;
+ const auto bias = Vf32 (1.0f); // Binary: 127 << 23
+ part |= bias;
+
+#endif // fstb_HAS_SIMD
+
+ // Computes the log2 approximation [1 ; 2[ -> [0 ; 1[
+ part = poly (part);
+
+ // Sums the components
+ const auto total = log2_int + part;
+
+ return total;
+}
+
+
+
+// poly is a user-provided Vf32 exp2 approximation from [0 ; 1[ to [1 ; 2[
+template <typename P>
+Vf32 Vf32::exp2_base (P poly) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ const int32_t tx0 = floor_int (_x [0]);
+ const int32_t tx1 = floor_int (_x [1]);
+ const int32_t tx2 = floor_int (_x [2]);
+ const int32_t tx3 = floor_int (_x [3]);
+ const Vf32 frac {
+ _x [0] - static_cast <float> (tx0),
+ _x [1] - static_cast <float> (tx1),
+ _x [2] - static_cast <float> (tx2),
+ _x [3] - static_cast <float> (tx3)
+ };
+
+ Combo combo { poly (frac) };
+
+ combo._s32 [0] += tx0 << 23;
+ combo._s32 [1] += tx1 << 23;
+ combo._s32 [2] += tx2 << 23;
+ combo._s32 [3] += tx3 << 23;
+ assert (
+ combo._vf32 [0] >= 0
+ && combo._vf32 [1] >= 0
+ && combo._vf32 [2] >= 0
+ && combo._vf32 [3] >= 0
+ );
+ return combo._vf32;
+
+#else // fstb_HAS_SIMD
+
+ // Separates the integer and fractional parts
+# if fstb_ARCHI == fstb_ARCHI_X86
+ const auto round_toward_m_i = _mm_set1_ps (-0.5f);
+ auto xi = _mm_cvtps_epi32 (_mm_add_ps (_x, round_toward_m_i));
+ const auto val_floor = Vf32 { _mm_cvtepi32_ps (xi) };
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ const int round_ofs = 256;
+ int32x4_t xi = vcvtq_s32_f32 (_x + vdupq_n_f32 (float (round_ofs)));
+ xi -= vdupq_n_s32 (round_ofs);
+ const auto val_floor = Vf32 { vcvtq_f32_s32 (xi) };
+# endif // fstb_ARCHI
+
+ auto frac = *this - val_floor;
+
+ // Computes the exp2 approximation [0 ; 1] -> [1 ; 2]
+ frac = poly (frac);
+
+ // Integer part
+# if fstb_ARCHI == fstb_ARCHI_X86
+ xi = _mm_slli_epi32 (xi, 23);
+ xi = _mm_add_epi32 (xi, _mm_castps_si128 (frac));
+ return _mm_castsi128_ps (xi);
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ xi = vshlq_n_s32 (xi, 23);
+ xi = xi + vreinterpretq_s32_f32 (frac);
+ return vreinterpretq_f32_s32 (xi);
+# endif // fstb_ARCHI
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+Vf32 Vf32::signbit () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ copysignf (0.f, _x [0]),
+ copysignf (0.f, _x [1]),
+ copysignf (0.f, _x [2]),
+ copysignf (0.f, _x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_and_ps (signbit_mask (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (_x),
+ vdupq_n_u32 (0x80000000U)
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::is_lt_0 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo r;
+ r._s32 [0] = (_x [0] < 0) ? -1 : 0;
+ r._s32 [1] = (_x [1] < 0) ? -1 : 0;
+ r._s32 [2] = (_x [2] < 0) ? -1 : 0;
+ r._s32 [3] = (_x [3] < 0) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_srai_epi32 (_mm_castps_si128 (_x), 31));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_s32 (vshrq_n_s32 (vreinterpretq_s32_f32 (_x), 31));
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <float, float, float, float> Vf32::explode () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1], _x [2], _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto tmp = _mm_movehl_ps (_x, _x);
+ return std::make_tuple (
+ _mm_cvtss_f32 (_x),
+ _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, (1<<0))),
+ _mm_cvtss_f32 (tmp),
+ _mm_cvtss_f32 (_mm_shuffle_ps (tmp, tmp, (1<<0)))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ vgetq_lane_f32 (_x, 0),
+ vgetq_lane_f32 (_x, 1),
+ vgetq_lane_f32 (_x, 2),
+ vgetq_lane_f32 (_x, 3)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <float, float> Vf32::extract_pair () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ _mm_cvtss_f32 (_x),
+ _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, 1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (vgetq_lane_f32 (_x, 0), vgetq_lane_f32 (_x, 1));
+#endif // fstb_ARCHI
+}
+
+
+
+// <0> = v0 | v1 | v0 | v1
+// <1> = v2 | v3 | v2 | v3
+std::tuple <Vf32, Vf32> Vf32::spread_pairs () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 { _x [0], _x [1], _x [0], _x [1] },
+ Vf32 { _x [2], _x [3], _x [2], _x [3] }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_shuffle_ps (_x, _x, (0<<0) + (1<<2) + (0<<4) + (1<<6)) },
+ Vf32 { _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (2<<4) + (3<<6)) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vget_low_f32 (_x);
+ const float32x2_t v23 = vget_high_f32 (_x);
+ return std::make_tuple (
+ Vf32 { vcombine_f32 (v01, v01) },
+ Vf32 { vcombine_f32 (v23, v23) }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_ps (_x, _x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_ps (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return _mm_cvtss_f32 (_mm_add_ss (v, _mm_movehl_ps (s, v)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_f32 (_x);
+ #else
+ float32x2_t v2 = vadd_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpadd_f32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v = _mm_min_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2));
+ return _mm_cvtss_f32 (_mm_min_ss (v, _mm_shuffle_ps (v, v, 1)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v2 = vmin_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpmin_f32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+float Vf32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v = _mm_max_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2));
+ return _mm_cvtss_f32 (_mm_max_ss (v, _mm_shuffle_ps (v, v, 1)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v2 = vmax_f32 (vget_high_f32 (_x), vget_low_f32 (_x));
+ return vget_lane_f32 (vpmax_f32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vf32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const Combo c { _x };
+ const int32_t t = (c._s32 [0] & c._s32 [1]) & (c._s32 [2] & c._s32 [3]);
+ return (t == -1);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_ps (_x) == 15);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_f32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vf32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._vf32 = _x;
+ const int32_t t = (c._s32 [0] | c._s32 [1]) | (c._s32 [2] | c._s32 [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_ps (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_f32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits of the
+// return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vf32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._vf32 = _x;
+ return
+ (c._u32 [0] >> 31)
+ | ((c._u32 [1] >> 30) & 2)
+ | ((c._u32 [2] >> 29) & 4)
+ | ((c._u32 [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return static_cast (_mm_movemask_ps (_x));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_f32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_ps ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_f32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._s32 [0] = -1;
+ c._s32 [1] = -1;
+ c._s32 [2] = -1;
+ c._s32 [3] = -1;
+ return Vf32 { c._vf32 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_s32 (vdupq_n_s32 (-1));
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a0 | a1 | ? | ?
+Vf32 Vf32::set_pair (float a0, float a1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a0, a1, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_unpacklo_ps (_mm_set_ss (a0), _mm_set_ss (a1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_f32 (a1, vdupq_n_f32 (a0), 1);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a02 | a13 | a02 | a13
+Vf32 Vf32::set_pair_fill (float a02, float a13) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a02, a13, a02, a13 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_unpacklo_ps (_mm_set1_ps (a02), _mm_set1_ps (a13));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t v01 = vset_lane_f32 (a13, vdup_n_f32 (a02), 1);
+ return vcombine_f32 (v01, v01);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns a01 | a01 | a23 | a23
+Vf32 Vf32::set_pair_dbl (float a01, float a23) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { a01, a01, a23, a23 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (_mm_set_ss (a01), _mm_set_ss (a23), 0x00);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcombine_f32 (vdup_n_f32 (a01), vdup_n_f32 (a23));
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vf32 Vf32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._s32 [0] = -int32_t (m0);
+ c._s32 [1] = -int32_t (m1);
+ c._s32 [2] = -int32_t (m2);
+ c._s32 [3] = -int32_t (m3);
+ return c._vf32;
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ ));
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vreinterpretq_f32_s32 (vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ )));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_castsi128_ps (_mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ ));
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_s32 (1);
+ return vreinterpretq_f32_s32 (vsubq_s32 (
+ vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)),
+ one
+ ));
+# endif // fstb_ARCHI
+#endif // Versions
+}
+
+
+
+Vf32Native Vf32::signbit_mask () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Combo c;
+ c._u32 [0] = 0x80000000U;
+ c._u32 [1] = 0x80000000U;
+ c._u32 [2] = 0x80000000U;
+ c._u32 [3] = 0x80000000U;
+ return c._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+// return _mm_set1_ps (-0.f);
+ return _mm_castsi128_ps (_mm_set1_epi32 (0x80000000));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vdupq_n_u32 (0x80000000U));
+#endif // fstb_ARCHI
+}
+
+
+
+// returns { p0 [0 1], p1 [0 1] }
+Vf32 Vf32::interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { p0._x [0], p0._x [1], p1._x [0], p1._x [1] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (p0._x, p1._x, (0<<0) + (1<<2) + (0<<4) + (1<<6));
+ // return _mm_movelh_ps (p0, p1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t p0x = vget_low_f32 (p0._x);
+ const float32x2_t p1x = vget_low_f32 (p1._x);
+ return vcombine_f32 (p0x, p1x);
+#endif // fstb_ARCHI
+}
+
+
+
+// returns { p0 [2 3], p1 [2 3] }
+Vf32 Vf32::interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { p0._x [2], p0._x [3], p1._x [2], p1._x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (p0._x, p1._x, (2<<0) + (3<<2) + (2<<4) + (3<<6));
+ // return _mm_movehl_ps (p1, p0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t p0x = vget_high_f32 (p0._x);
+ const float32x2_t p1x = vget_high_f32 (p1._x);
+ return vcombine_f32 (p0x, p1x);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> Vf32::interleave (Vf32 p0, Vf32 p1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 {
+ p0._x [0],
+ p1._x [0],
+ p0._x [1],
+ p1._x [1]
+ }, Vf32 {
+ p0._x [2],
+ p1._x [2],
+ p0._x [3],
+ p1._x [3]
+ }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_unpacklo_ps (p0._x, p1._x) },
+ Vf32 { _mm_unpackhi_ps (p0._x, p1._x) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x4x2_t tmp = vzipq_f32 (p0._x, p1._x);
+ return std::make_tuple (
+ Vf32 { tmp.val [0] },
+ Vf32 { tmp.val [1] }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> Vf32::deinterleave (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (
+ Vf32 {
+ i0._x [0],
+ i0._x [2],
+ i1._x [0],
+ i1._x [2]
+ }, Vf32 {
+ i0._x [1],
+ i0._x [3],
+ i1._x [1],
+ i1._x [3]
+ }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ Vf32 { _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6)) },
+ Vf32 { _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6)) }
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x4x2_t tmp = vuzpq_f32 (i0._x, i1._x);
+ return std::make_tuple (
+ Vf32 { tmp.val [0] },
+ Vf32 { tmp.val [1] }
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::deinterleave_lo (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { i0._x [0], i0._x [2], i1._x [0], i1._x [2] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (i0._x, i1._x).val [0];
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 Vf32::deinterleave_hi (Vf32 i0, Vf32 i1) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 { i0._x [1], i0._x [3], i1._x [1], i1._x [3] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vuzpq_f32 (i0._x, i1._x).val [1];
+#endif // fstb_ARCHI
+}
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vf32 Vf32::compose (Vf32 a, Vf32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vf32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vf32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vf32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_move_ss (a._x, b._x);
+ return _mm_shuffle_ps (tmp, tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_shuffle_ps (a._x, b._x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3:
+ return _mm_move_ss (
+ _mm_shuffle_ps (b._x, b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)),
+ _mm_shuffle_ps (a._x, a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ );
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ const auto aa = vreinterpretq_s32_f32 (a._x);
+ const auto bb = vreinterpretq_s32_f32 (b._x);
+ return vreinterpretq_f32_s32 (vextq_s32 (aa, bb, POS));
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vf32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_load_ps (reinterpret_cast <const float *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	return vld1q_f32 (reinterpret_cast <const float32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vf32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_loadu_ps (reinterpret_cast <const float *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u8 (
+		vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vf32 Vf32::loadu_part (const MEM *ptr, int n) noexcept
+{
+ assert (n > 0);
+
+ if (n >= 4)
+ {
+ return loadu (ptr);
+ }
+
+	const float * f_ptr = reinterpret_cast <const float *> (ptr);
+#if ! defined (fstb_HAS_SIMD)
+ Vf32 v;
+ v._x [0] = f_ptr [0];
+ for (int i = 1; i < n; ++i)
+ {
+ v._x [i] = f_ptr [i];
+ }
+ return v;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (n)
+ {
+ case 1:
+ return _mm_load_ss (f_ptr);
+ case 2:
+# if 1
+ return _mm_castsi128_ps (_mm_loadl_epi64 (
+			reinterpret_cast <const __m128i *> (ptr)
+ ));
+# else // Higher latency from Skylake
+ return _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1));
+# endif
+ case 3:
+ return _mm_shuffle_ps (
+# if 1
+ _mm_castsi128_ps (_mm_loadl_epi64 (
+				reinterpret_cast <const __m128i *> (ptr)
+ )),
+# else // Higher latency from Skylake
+ _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1)),
+# endif
+ _mm_load_ss (f_ptr + 2),
+ (0<<0) + (1<<2) + (2<<4)
+ );
+ default:
+ // Keeps the compiler happy with (un)initialisation
+ return loadu (ptr);
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ auto v = vmovq_n_f32 (f_ptr [0]);
+ if (n >= 2)
+ {
+ v = vld1q_lane_f32 (f_ptr + 1, v, 1);
+ if (n >= 3)
+ {
+ v = vld1q_lane_f32 (f_ptr + 2, v, 2);
+ }
+ }
+ return v;
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns: ptr [0] | ptr [1] | ? | ?
+template <typename MEM>
+Vf32 Vf32::loadu_pair (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	auto p = reinterpret_cast <const float *> (ptr);
+ return Vf32 { p [0], p [1], 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return _mm_castsi128_ps (_mm_loadl_epi64 (
+		reinterpret_cast <const __m128i *> (ptr)
+ ));
+# else // Higher latency from Skylake
+	const auto x0 = _mm_load_ss (reinterpret_cast <const float *> (ptr)    );
+	const auto x1 = _mm_load_ss (reinterpret_cast <const float *> (ptr) + 1);
+ return _mm_unpacklo_ps (x0, x1);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const float32x2_t x = vreinterpret_f32_u8 (
+		vld1_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+ return vcombine_f32 (x, x);
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vf32::storeu_part_n13 (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+ assert (n < 4);
+
+	float * f_ptr = reinterpret_cast <float *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ _mm_store_ss (f_ptr, _x);
+ if (n >= 2)
+ {
+ _mm_store_ss (f_ptr + 1, _mm_shuffle_ps (_x, _x, 1 << 0));
+ if (n >= 3)
+ {
+ _mm_store_ss (f_ptr + 2, _mm_movehl_ps (_x, _x));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_f32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_f32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_f32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs /= rhs;
+ return lhs;
+}
+
+Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] == rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] == rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] == rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] == rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vceqq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] != rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] != rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] != rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] != rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpneq_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vmvnq_u32 (vceqq_f32 (lhs, rhs)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] < rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] < rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] < rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] < rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcltq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] <= rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] <= rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] <= rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] <= rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmple_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcleq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] > rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] > rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] > rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] > rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpgt_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcgtq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ Vf32::Combo r;
+ r._s32 [0] = (lhs._x [0] >= rhs._x [0]) ? -1 : 0;
+ r._s32 [1] = (lhs._x [1] >= rhs._x [1]) ? -1 : 0;
+ r._s32 [2] = (lhs._x [2] >= rhs._x [2]) ? -1 : 0;
+ r._s32 [3] = (lhs._x [3] >= rhs._x [3]) ? -1 : 0;
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpge_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_f32_u32 (vcgeq_f32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 abs (const Vf32 &v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ fabsf (v._x [0]),
+ fabsf (v._x [1]),
+ fabsf (v._x [2]),
+ fabsf (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_andnot_ps (Vf32::signbit_mask (), v);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vabsq_f32 (v);
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns x * a + b
+Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ x._x [0] * a._x [0] + b._x [0],
+ x._x [1] * a._x [1] + b._x [1],
+ x._x [2] * a._x [2] + b._x [2],
+ x._x [3] * a._x [3] + b._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_add_ps (_mm_mul_ps (x, a), b);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ return vfmaq_f32 (b, x, a);
+ #else
+ return vmlaq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns x * a - b
+Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ x._x [0] * a._x [0] - b._x [0],
+ x._x [1] * a._x [1] - b._x [1],
+ x._x [2] * a._x [2] - b._x [2],
+ x._x [3] * a._x [3] - b._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_ps (_mm_mul_ps (x, a), b);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
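+	// vfmsq/vmlsq compute b - x * a, hence the negation to obtain x * a - b.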
+ #if defined (__ARM_FEATURE_FMA)
+ return -vfmsq_f32 (b, x, a);
+ #else
+ return -vmlsq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+// Returns - x * a + b
+Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ b._x [0] - x._x [0] * a._x [0],
+ b._x [1] - x._x [1] * a._x [1],
+ b._x [2] - x._x [2] * a._x [2],
+ b._x [3] - x._x [3] * a._x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_ps (b, _mm_mul_ps (x, a));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if defined (__ARM_FEATURE_FMA)
+ return vfmsq_f32 (b, x, a);
+ #else
+ return vmlsq_f32 (b, x, a);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 round (const Vf32 &v) noexcept
+{
+ return v.round ();
+}
+
+
+
+Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_min_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_f32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_max_ps (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_f32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ const Vf32::Combo cc { cond };
+ Vf32::Combo ct { v_t };
+ Vf32::Combo cf { v_f };
+ Vf32::Combo r;
+ r._s32 [0] = (ct._s32 [0] & cc._s32 [0]) | (cf._s32 [0] & ~cc._s32 [0]);
+ r._s32 [1] = (ct._s32 [1] & cc._s32 [1]) | (cf._s32 [1] & ~cc._s32 [1]);
+ r._s32 [2] = (ct._s32 [2] & cc._s32 [2]) | (cf._s32 [2] & ~cc._s32 [2]);
+ r._s32 [3] = (ct._s32 [3] & cc._s32 [3]) | (cf._s32 [3] & ~cc._s32 [3]);
+ return r._vf32;
+#elif fstb_ARCHI == fstb_ARCHI_X86
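+	// Bitwise blend: takes the v_t bits where cond is set, the v_f bits elsewhere.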
+ const auto cond_1 = _mm_and_ps ( cond, v_t);
+ const auto cond_0 = _mm_andnot_ps (cond, v_f);
+ return _mm_or_ps (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_f32 (vreinterpretq_u32_f32 (cond), v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vf32, Vf32> swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const Vf32::Combo cc { cond };
+ if (cc._s32 [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cc._s32 [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cc._s32 [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cc._s32 [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_ps (_mm_xor_ps (lhs, rhs), cond);
+ return std::make_tuple (
+ _mm_xor_ps (lhs, inv),
+ _mm_xor_ps (rhs, inv)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto cu32 = vreinterpretq_u32_f32 (cond);
+ return std::make_tuple (
+ vbslq_f32 (cu32, rhs, lhs),
+ vbslq_f32 (cu32, lhs, rhs)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vf32 sqrt (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vf32 {
+ sqrtf (v._x [0]),
+ sqrtf (v._x [1]),
+ sqrtf (v._x [2]),
+ sqrtf (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sqrt_ps (v);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
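+	// No packed sqrt on 32-bit NEON: the reciprocal square root estimate is
+	// refined with Newton-Raphson steps and then multiplied by v. nz_flag
+	// zeroes the result for v == 0, where the estimate would be infinite.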
+ const uint32x4_t nz_flag = vtstq_u32 (
+ vreinterpretq_u32_f32 (v),
+ vreinterpretq_u32_f32 (v)
+ );
+ float32x4_t rs = vrsqrteq_f32 (v);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ rs *= vrsqrtsq_f32 (v, rs * rs);
+ const auto sqrt_a = rs * float32x4_t (v);
+ return vreinterpretq_f32_u32 (vandq_u32 (
+ vreinterpretq_u32_f32 (sqrt_a),
+ nz_flag
+ ));
+#endif // fstb_ARCHI
+}
+
+
+
+// Formula by 2DaT
+// 12-13 ulp
+// https://www.kvraudio.com/forum/viewtopic.php?f=33&t=532048
+Vf32 log2 (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ assert (v > Vf32 (0));
+ /*** To do: actual approximation matching the SIMD formula ***/
+ return Vf32 {
+ logf (v._x [0]) * float (LOG2_E),
+ logf (v._x [1]) * float (LOG2_E),
+ logf (v._x [2]) * float (LOG2_E),
+ logf (v._x [3]) * float (LOG2_E),
+ };
+
+#else // fstb_HAS_SIMD
+
+ // Rational fraction approximating log2 (x)
+ // [sqrt (0.5) ; sqrt (2)] -> [-0.5 ; 0.5]
+ // f: x -> (x - 1) * (x^2 + c1*x + c0) / (d2*x^2 + d1*x + d0)
+ // No analytic continuity on the full range, although this is "almost" C0
+ // (good enough for single precision).
+ const auto c0 = Vf32 (1.011593342e+01f);
+ const auto c1 = Vf32 (1.929443550e+01f);
+ const auto d0 = Vf32 (2.095932245e+00f);
+ const auto d1 = Vf32 (1.266638851e+01f);
+ const auto d2 = Vf32 (6.316540241e+00f);
+ const auto one = Vf32 (1.0f);
+ const auto multi = Vf32 (1.41421356237f);
+ const auto mmask = ~((1 << 23) - 1);
+
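+	// Splits v into exponent and mantissa. Multiplying by sqrt (2) before
+	// extracting the exponent centres the mantissa on 1, so it lands in
+	// [sqrt (0.5) ; sqrt (2)], the validity range of the fraction above.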
+#if fstb_ARCHI == fstb_ARCHI_X86
+
+ __m128i x_i = _mm_castps_si128 (v);
+ __m128i spl_exp = _mm_castps_si128 (v * multi);
+ spl_exp = _mm_sub_epi32 (spl_exp, _mm_castps_si128 (one));
+ spl_exp = _mm_and_si128 (spl_exp, _mm_set1_epi32 (mmask));
+ const auto spl_mantissa =
+ Vf32 { _mm_castsi128_ps (_mm_sub_epi32 (x_i, spl_exp)) };
+ spl_exp = _mm_srai_epi32 (spl_exp, 23);
+ const auto log2_exponent = Vf32 { _mm_cvtepi32_ps (spl_exp) };
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ const int32x4_t x_i = vreinterpretq_s32_f32 (v);
+ int32x4_t spl_exp = vreinterpretq_s32_f32 (v * multi);
+ spl_exp = spl_exp - vreinterpretq_s32_f32 (one);
+ spl_exp = vandq_s32 (spl_exp, vdupq_n_s32 (mmask));
+ const auto spl_mantissa = Vf32 { vreinterpretq_f32_s32 (x_i - spl_exp) };
+ spl_exp = vshrq_n_s32 (spl_exp, 23);
+ const auto log2_exponent = Vf32 { vcvtq_f32_s32 (spl_exp) };
+
+#endif // fstb_ARCHI
+
+ auto num = spl_mantissa + c1;
+ num = fma (num, spl_mantissa, c0);
+ num = fms (num, spl_mantissa, num);
+
+ auto den = d2;
+ den = fma (den, spl_mantissa, d1);
+ den = fma (den, spl_mantissa, d0);
+
+ auto res = num / den;
+ res += log2_exponent;
+
+ return res;
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+// Formula by 2DaT
+// Coefficients fixed by Andrew Simper to achieve true C0 continuity
+// 3-4 ulp
+// https://www.kvraudio.com/forum/viewtopic.php?p=7161124#p7161124
+// https://www.kvraudio.com/forum/viewtopic.php?p=7677266#p7677266
+Vf32 exp2 (Vf32 v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+
+ /*** To do: actual approximation matching the SIMD formula ***/
+ return Vf32 {
+ exp2f (v._x [0]),
+ exp2f (v._x [1]),
+ exp2f (v._x [2]),
+ exp2f (v._x [3]),
+ };
+
+#else // fstb_HAS_SIMD
+
+ // [-0.5, 0.5] 2^x approx polynomial ~ 2.4 ulp
+ const auto c0 = Vf32 (1.000000088673463);
+ const auto c1 = Vf32 (0.69314693211407);
+ const auto c2 = Vf32 (0.24022037362574);
+ const auto c3 = Vf32 (0.0555072548370);
+ const auto c4 = Vf32 (0.0096798351988);
+ const auto c5 = Vf32 (0.0013285658116);
+
+ // Note: the following set of coefficients has a larger error (0.00043
+ // cents, maybe 7 ulp?) but ensures C2 continuity:
+ // c0 = 1.000000237
+ // c1 = 0.69314655
+ // c2 = 0.24021519
+ // c3 = 0.05550965
+ // c4 = 0.00969821
+ // c5 = 0.00132508
+
+ // i = round (v)
+ // v = v - i
+#if fstb_ARCHI == fstb_ARCHI_X86
+ auto i = _mm_cvtps_epi32 (v);
+ v -= _mm_cvtepi32_ps (i);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
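+	// vcvtq_s32_f32 truncates, so an offset of round_ofs + 0.5 is added first
+	// to get round-to-nearest for the expected input range, then round_ofs is
+	// subtracted from the integer result.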
+ const int round_ofs = 256;
+ const auto r = Vf32 (round_ofs + 0.5f);
+ auto i = vcvtq_s32_f32 (v + r);
+ i -= vdupq_n_s32 (round_ofs);
+ v -= vcvtq_f32_s32 (i);
+#endif // fstb_ARCHI
+
+ // Estrin-Horner evaluation scheme
+ const auto v2 = v * v;
+ const auto p23 = fma (c3, v, c2);
+ const auto p01 = fma (c1, v, c0);
+ auto p = fma (c5, v, c4);
+ p = fma (p, v2, p23);
+ p = fma (p, v2, p01);
+
+ // i << 23
+ // r = (2^i) * (2^v)
+ // directly in floating point exponent
+#if fstb_ARCHI == fstb_ARCHI_X86
+ i = _mm_slli_epi32 (i, 23);
+ return _mm_castsi128_ps (_mm_add_epi32 (i, _mm_castps_si128 (p)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ i = vshlq_n_s32 (i, 23);
+ return vreinterpretq_f32_s32 (i + vreinterpretq_s32_f32 (p));
+#endif // fstb_ARCHI
+
+#endif // fstb_HAS_SIMD
+}
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vf32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vs32.h b/src/fstb/Vs32.h
new file mode 100644
index 0000000..234d4c0
--- /dev/null
+++ b/src/fstb/Vs32.h
@@ -0,0 +1,257 @@
+/*****************************************************************************
+
+ Vs32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vs32_HEADER_INCLUDED)
+#define fstb_Vs32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+	#include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+	#include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+	#include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <int32_t, 4> Vs32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128i Vs32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef int32x4_t Vs32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vs32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef int32_t Scalar;
+
+ Vs32 () = default;
+ fstb_FORCEINLINE
+ Vs32 (Vs32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vs32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+	               Vs32 (const std::tuple <int32_t, int32_t, int32_t, int32_t> &a) noexcept;
+ Vs32 (const Vs32 &other) = default;
+ Vs32 (Vs32 &&other) = default;
+ ~Vs32 () = default;
+ Vs32 & operator = (const Vs32 &other) = default;
+ Vs32 & operator = (Vs32 &&other) = default;
+
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+	template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vs32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator += (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator -= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator *= (const Vs32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator &= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator |= (const Vs32Native &other) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator ^= (const Vs32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vs32 &
+ operator <<= (int imm) noexcept;
+ fstb_FORCEINLINE Vs32 &
+ operator >>= (int imm) noexcept;
+
+ fstb_FORCEINLINE Vs32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vs32
+ operator ~ () const noexcept;
+ fstb_FORCEINLINE Vs32
+ is_lt_0 () const noexcept;
+ fstb_FORCEINLINE Vs32
+ reverse () const noexcept;
+
+	template <int SHIFT>
+ fstb_FORCEINLINE Vs32
+ rotate () const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE int32_t
+ extract () const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE Vs32
+ insert (int32_t val) const noexcept;
+	template <int POS>
+ fstb_FORCEINLINE Vs32
+ spread () const noexcept;
+
+	fstb_FORCEINLINE std::tuple <int32_t, int32_t, int32_t, int32_t>
+ explode () const noexcept;
+
+ fstb_FORCEINLINE int32_t
+ sum_h () const noexcept;
+ fstb_FORCEINLINE int32_t
+ min_h () const noexcept;
+ fstb_FORCEINLINE int32_t
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+ fstb_FORCEINLINE int
+ count_bits () const noexcept;
+
+ static fstb_FORCEINLINE Vs32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vs32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vs32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+	template <int POS>
+ static fstb_FORCEINLINE Vs32
+ compose (Vs32 a, Vs32 b) noexcept;
+
+	template <typename MEM>
+ static fstb_FORCEINLINE Vs32
+ load (const MEM *ptr) noexcept;
+	template <typename MEM>
+ static fstb_FORCEINLINE Vs32
+ loadu (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+#endif
+ Vs32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vs32
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept;
+
+template <typename T>
+fstb_FORCEINLINE Vs32 operator << (Vs32 lhs, T rhs) noexcept;
+template <typename T>
+fstb_FORCEINLINE Vs32 operator >> (Vs32 lhs, T rhs) noexcept;
+
+fstb_FORCEINLINE Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vs32 abs (const Vs32 &v) noexcept;
+fstb_FORCEINLINE Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept;
+fstb_FORCEINLINE Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept;
+fstb_FORCEINLINE Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vs32, Vs32> swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vs32.hpp"
+
+
+
+#endif // fstb_Vs32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vs32.hpp b/src/fstb/Vs32.hpp
new file mode 100644
index 0000000..e6b2510
--- /dev/null
+++ b/src/fstb/Vs32.hpp
@@ -0,0 +1,1142 @@
+/*****************************************************************************
+
+ Vs32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vs32_CODEHEADER_INCLUDED)
+#define fstb_Vs32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/fnc.h"
+
+#include <algorithm>
+
+#include <cstdlib>
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+// Returns a0 | a0 | a0 | a0
+Vs32::Vs32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_epi32 (a) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_s32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vs32::Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vs32::Vs32 (const std::tuple <int32_t, int32_t, int32_t, int32_t> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vs32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	*reinterpret_cast <Vs32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	vst1q_s32 (reinterpret_cast <int32_t *> (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+void Vs32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	*reinterpret_cast <Vs32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	vst1q_u8 (reinterpret_cast <uint8_t *> (ptr), vreinterpretq_u8_s32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vs32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= _length)
+ {
+ storeu (ptr);
+ return;
+ }
+
+	int32_t * f_ptr = reinterpret_cast <int32_t *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ f_ptr [0] = _mm_cvtsi128_si32 (_x);
+ if (n >= 2)
+ {
+ f_ptr [1] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0));
+ if (n >= 3)
+ {
+			f_ptr [2] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_s32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_s32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_s32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vs32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vs32 & Vs32::operator += (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator -= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator *= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Emulation of _mm_mullo_epi32 (SSE4.1)
+# if fstb_COMPILER == fstb_COMPILER_MSVC
+ // For some reason this code is slightly faster on MSVC
+ auto p02_64 = _mm_mul_epu32 (_x, other);
+ auto p13_64 = _mm_mul_epu32 (
+ _mm_srli_si128 (_x , 4),
+ _mm_srli_si128 (other, 4)
+ );
+ p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2));
+ p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2));
+ _x = _mm_unpacklo_epi32 (p02_64, p13_64);
+# else
+ // Code of this function shamelessly borrowed from tp7
+ // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h
+ // This code is faster on GCC/Clang
+ const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1)
+ const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1)
+ const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0)
+ const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1)
+ const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0)
+ const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2)
+ _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0)
+# endif // fstb_COMPILER
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator &= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] &= other [0];
+ _x [1] &= other [1];
+ _x [2] &= other [2];
+ _x [3] &= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vandq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator |= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] |= other [0];
+ _x [1] |= other [1];
+ _x [2] |= other [2];
+ _x [3] |= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vorrq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator ^= (const Vs32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] ^= other [0];
+ _x [1] ^= other [1];
+ _x [2] ^= other [2];
+ _x [3] ^= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = veorq_s32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator <<= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] <<= imm;
+ _x [1] <<= imm;
+ _x [2] <<= imm;
+ _x [3] <<= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_slli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x <<= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vs32 & Vs32::operator >>= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] >>= imm;
+ _x [1] >>= imm;
+ _x [2] >>= imm;
+ _x [3] >>= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_srai_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x >>= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+// -(1<<31) stays constant
+Vs32 Vs32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (_mm_setzero_si128 (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vnegq_s32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::operator ~ () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ ~(_x [0]),
+ ~(_x [1]),
+ ~(_x [2]),
+ ~(_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_si128 (_x, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_s32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::is_lt_0 () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (_x [0] < 0) ? -1 : 0,
+ (_x [1] < 0) ? -1 : 0,
+ (_x [2] < 0) ? -1 : 0,
+ (_x [3] < 0) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_epi32 (_x, _mm_setzero_si128 ());
+#elif fstb_ARCHI == fstb_ARCHI_ARM
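+	// Arithmetic shift right by 31 replicates the sign bit over the whole lane.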
+ return vshrq_n_s32 (_x, 31);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_s32 (vcombine_s32 (vget_high_s32 (_x), vget_low_s32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+// Positive = left
+template <int SHIFT>
+Vs32 Vs32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ switch (SHIFT & 3)
+ {
+ case 1: return vextq_s32 (_x, _x, 3);
+ case 2: return vextq_s32 (_x, _x, 2);
+ case 3: return vextq_s32 (_x, _x, 1);
+ default: return *this;
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+int32_t Vs32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_epi32 (a, 1); break;
+ case 2: a = _mm_shuffle_epi32 (a, 2); break;
+ case 3: a = _mm_shuffle_epi32 (a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return _mm_cvtsi128_si32 (a);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_s32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vs32 Vs32::insert (int32_t val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (_mm_set1_epi32 (val))
+ ));
+	return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_s32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vs32 Vs32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+	return Vs32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (vgetq_lane_s32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_s32 (_x);
+ #else
+ int32x2_t v2 = vadd_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpadd_s32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::min (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x2_t v2 = vmin_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpmin_s32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+int32_t Vs32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::max (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ int32x2_t v2 = vmax_s32 (vget_high_s32 (_x), vget_low_s32 (_x));
+ return vget_lane_s32 (vpmax_s32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vs32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const int32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]);
+ return (t == -1);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) == 0xFFFF);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_s32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vs32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const int32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (
+ vqmovn_u32 (vreinterpretq_u32_s32 (_x))
+ );
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits of the
+// return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vs32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return
+ (_x [0] >> 31)
+ | ((_x [1] >> 30) & 2)
+ | ((_x [2] >> 29) & 4)
+ | ((_x [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return static_cast <unsigned int> (_mm_movemask_ps (_mm_castsi128_ps (_x)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_s32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
+
+
+
+int Vs32::count_bits () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555);
+ uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555);
+ uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555);
+ uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555);
+ v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333);
+ v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
+ v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
+ v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333);
+ const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ return (c0 + c2) + (c1 + c3);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
+ static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77);
+ static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F);
+ // Count bits in each 4-bit field.
+ auto x = _x;
+ auto n = _mm_srli_epi64 (x, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ n = _mm_sub_epi8 (x, n);
+ n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4));
+ n = _mm_and_si128 (popcount_mask2, n);
+ // Counts the number of bits in the low and high 64-bit parts
+ n = _mm_sad_epu8 (n, _mm_setzero_si128 ());
+ // Counts the number of bits in the whole 128-bit register
+ n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n));
+ return _mm_cvtsi128_si32 (n);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_s32 (_x));
+ const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8);
+ const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16);
+ const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32);
+ const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64);
+ return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_si128 ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 Vs32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 { -1, -1, -1, -1 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_set1_epi32 (-1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_s32 (-1);
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vs32 Vs32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ -int32_t (m0),
+ -int32_t (m1),
+ -int32_t (m2),
+ -int32_t (m3),
+ };
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ ));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_s32 (1);
+ return vsubq_s32 (
+ vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)),
+ one
+ );
+# endif // fstb_ARCHI
+#endif // Versions
+}
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vs32 Vs32::compose (Vs32 a, Vs32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vs32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vs32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vs32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x)
+ ));
+ return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (b._x),
+ (1<<6) | (0<<4) | (3<<2) | (2<<0)
+ ));
+ case 3:
+ return _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ ),
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ )
+ ));
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ return vextq_s32 (a._x, b._x, POS);
+ }
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vs32 Vs32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vs32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_load_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+	return vld1q_s32 (reinterpret_cast <const int32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vs32 Vs32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+	return *reinterpret_cast <const Vs32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+	return _mm_loadu_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u8 (
+		vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+template <typename T>
+Vs32 operator << (Vs32 lhs, T rhs) noexcept
+{
+ lhs <<= rhs;
+ return lhs;
+}
+
+template <typename T>
+Vs32 operator >> (Vs32 lhs, T rhs) noexcept
+{
+ lhs >>= rhs;
+ return lhs;
+}
+
+
+
+Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] == rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] == rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] == rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] == rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vceqq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] != rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] != rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] != rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] != rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto eq = _mm_cmpeq_epi32 (lhs, rhs);
+ return _mm_xor_si128 (eq, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vmvnq_u32 (vceqq_s32 (lhs, rhs)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] < rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] < rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] < rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] < rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmplt_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcltq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] <= rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] <= rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] <= rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] <= rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs < rhs) | (lhs == rhs);
+# else
+ return ~(lhs > rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcleq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] > rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] > rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] > rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] > rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpgt_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcgtq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ (lhs._x [0] >= rhs._x [0]) ? -1 : 0,
+ (lhs._x [1] >= rhs._x [1]) ? -1 : 0,
+ (lhs._x [2] >= rhs._x [2]) ? -1 : 0,
+ (lhs._x [3] >= rhs._x [3]) ? -1 : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs > rhs) | (lhs == rhs);
+# else
+ return ~(lhs < rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_s32_u32 (vcgeq_s32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+// Result is undefined for -(1<<31).
+Vs32 abs (const Vs32 &v) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::abs (v._x [0]),
+ std::abs (v._x [1]),
+ std::abs (v._x [2]),
+ std::abs (v._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v_neg = _mm_sub_epi32 (_mm_setzero_si128 (), v);
+ return max (v, v_neg);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vqabsq_s32 (v);
+#endif // fstb_ARCHI
+}
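+
+// Rationale for the restriction above: for -(1<<31) the scalar std::abs call
+// is undefined behaviour, the x86 path returns the value unchanged (the
+// negation wraps around) and the ARM vqabsq_s32 path saturates to 0x7FFFFFFF,
+// so the three implementations would disagree anyway.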
+
+
+
+Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto gt = (lhs > rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( gt, rhs),
+ _mm_andnot_si128 (gt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_s32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vs32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto lt = (lhs < rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( lt, rhs),
+ _mm_andnot_si128 (lt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_s32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ return Vs32 {
+ (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]),
+ (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]),
+ (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]),
+ (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto cond_1 = _mm_and_si128 (cond, v_t);
+ const auto cond_0 = _mm_andnot_si128 (cond, v_f);
+ return _mm_or_si128 (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_s32 (vreinterpretq_u32_s32 (cond), v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vs32, Vs32> swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond);
+ return std::make_tuple (
+ Vs32 (_mm_xor_si128 (lhs, inv)),
+ Vs32 (_mm_xor_si128 (rhs, inv))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const auto cond_u = vreinterpretq_u32_s32 (cond);
+ return std::make_tuple (
+ Vs32 (vbslq_s32 (cond_u, rhs, lhs)),
+ Vs32 (vbslq_s32 (cond_u, lhs, rhs))
+ );
+#endif // fstb_ARCHI
+}
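+
+// Note on the x86 path above: it is a masked XOR swap. inv = (lhs ^ rhs) & cond
+// is zero in the lanes where cond is all-zero, so lhs ^ inv and rhs ^ inv
+// exchange exactly the lanes where cond is all-ones and leave the others
+// untouched.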
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vs32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vu32.h b/src/fstb/Vu32.h
new file mode 100644
index 0000000..2341621
--- /dev/null
+++ b/src/fstb/Vu32.h
@@ -0,0 +1,259 @@
+/*****************************************************************************
+
+ Vu32.h
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#pragma once
+#if ! defined (fstb_Vu32_HEADER_INCLUDED)
+#define fstb_Vu32_HEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include "fstb/def.h"
+
+#if ! defined (fstb_HAS_SIMD)
+ #include <array>
+#elif (fstb_ARCHI == fstb_ARCHI_X86)
+ #include <emmintrin.h>
+#elif (fstb_ARCHI == fstb_ARCHI_ARM)
+ #include <arm_neon.h>
+#else
+ #error
+#endif
+
+#include <tuple>
+
+#include <cstdint>
+
+
+
+namespace fstb
+{
+
+
+
+#if ! defined (fstb_HAS_SIMD)
+
+typedef std::array <uint32_t, 4> Vu32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+typedef __m128i Vu32Native;
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+typedef uint32x4_t Vu32Native;
+
+#else // fstb_ARCHI
+#error
+#endif // fstb_ARCHI
+
+
+
+class Vu32
+{
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+public:
+
+ static constexpr int _len_l2 = 2;
+ static constexpr int _length = 1 << _len_l2;
+ typedef uint32_t Scalar;
+
+ Vu32 () = default;
+ fstb_FORCEINLINE
+ Vu32 (Vu32Native a) noexcept : _x { a } {}
+ explicit fstb_FORCEINLINE
+ Vu32 (Scalar a) noexcept;
+ explicit fstb_FORCEINLINE
+ Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept;
+ explicit fstb_FORCEINLINE
+ Vu32 (const std::tuple <Scalar, Scalar, Scalar, Scalar> &a) noexcept;
+ Vu32 (const Vu32 &other) = default;
+ Vu32 (Vu32 &&other) = default;
+ ~Vu32 () = default;
+ Vu32 & operator = (const Vu32 &other) = default;
+ Vu32 & operator = (Vu32 &&other) = default;
+
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ store (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu (MEM *ptr) const noexcept;
+ template <typename MEM>
+ fstb_FORCEINLINE void
+ storeu_part (MEM *ptr, int n) const noexcept;
+
+ fstb_FORCEINLINE
+ operator Vu32Native () const noexcept { return _x; }
+ fstb_FORCEINLINE explicit
+ operator bool () const noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator += (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator -= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator *= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator *= (const Scalar &other) noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator &= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator |= (const Vu32Native &other) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator ^= (const Vu32Native &other) noexcept;
+
+ fstb_FORCEINLINE Vu32 &
+ operator <<= (int imm) noexcept;
+ fstb_FORCEINLINE Vu32 &
+ operator >>= (int imm) noexcept;
+
+ fstb_FORCEINLINE Vu32
+ operator - () const noexcept;
+ fstb_FORCEINLINE Vu32
+ operator ~ () const noexcept;
+ fstb_FORCEINLINE Vu32
+ reverse () const noexcept;
+
+ template <int SHIFT>
+ fstb_FORCEINLINE Vu32
+ rotate () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE uint32_t
+ extract () const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vu32
+ insert (uint32_t val) const noexcept;
+ template <int POS>
+ fstb_FORCEINLINE Vu32
+ spread () const noexcept;
+
+ fstb_FORCEINLINE std::tuple <uint32_t, uint32_t, uint32_t, uint32_t>
+ explode () const noexcept;
+
+ fstb_FORCEINLINE uint32_t
+ sum_h () const noexcept;
+ fstb_FORCEINLINE uint32_t
+ min_h () const noexcept;
+ fstb_FORCEINLINE uint32_t
+ max_h () const noexcept;
+
+ fstb_FORCEINLINE bool
+ and_h () const noexcept;
+ fstb_FORCEINLINE bool
+ or_h () const noexcept;
+ fstb_FORCEINLINE unsigned int
+ movemask () const noexcept;
+ fstb_FORCEINLINE int
+ count_bits () const noexcept;
+
+ static fstb_FORCEINLINE Vu32
+ zero () noexcept;
+ static fstb_FORCEINLINE Vu32
+ all1 () noexcept;
+ static fstb_FORCEINLINE Vu32
+ set_mask (bool m0, bool m1, bool m2, bool m3) noexcept;
+ template <int POS>
+ static fstb_FORCEINLINE Vu32
+ compose (Vu32 a, Vu32 b) noexcept;
+ static fstb_FORCEINLINE Vu32
+ flip_msb (Vu32 x) noexcept;
+
+ template <typename MEM>
+ static fstb_FORCEINLINE Vu32
+ load (const MEM *ptr) noexcept;
+ template <typename MEM>
+ static fstb_FORCEINLINE Vu32
+ loadu (const MEM *ptr) noexcept;
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+protected:
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+#if ! defined (fstb_HAS_SIMD)
+public:
+#endif
+ Vu32Native _x;
+private:
+
+
+
+/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+private:
+
+}; // class Vu32
+
+
+
+/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+fstb_FORCEINLINE Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept;
+
+template <typename T>
+fstb_FORCEINLINE Vu32 operator << (Vu32 lhs, T rhs) noexcept;
+template <typename T>
+fstb_FORCEINLINE Vu32 operator >> (Vu32 lhs, T rhs) noexcept;
+
+fstb_FORCEINLINE Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+
+fstb_FORCEINLINE Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept;
+fstb_FORCEINLINE Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept;
+fstb_FORCEINLINE Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept;
+fstb_FORCEINLINE std::tuple <Vu32, Vu32> swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept;
+
+
+
+} // namespace fstb
+
+
+
+#include "fstb/Vu32.hpp"
+
+
+
+#endif // fstb_Vu32_HEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/Vu32.hpp b/src/fstb/Vu32.hpp
new file mode 100644
index 0000000..a1f851d
--- /dev/null
+++ b/src/fstb/Vu32.hpp
@@ -0,0 +1,1139 @@
+/*****************************************************************************
+
+ Vu32.hpp
+ Author: Laurent de Soras, 2021
+
+--- Legal stuff ---
+
+This program is free software. It comes without any warranty, to
+the extent permitted by applicable law. You can redistribute it
+and/or modify it under the terms of the Do What The Fuck You Want
+To Public License, Version 2, as published by Sam Hocevar. See
+http://www.wtfpl.net/ for more details.
+
+*Tab=3***********************************************************************/
+
+
+
+#if ! defined (fstb_Vu32_CODEHEADER_INCLUDED)
+#define fstb_Vu32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+#include <algorithm>
+
+#include <cassert>
+
+
+
+namespace fstb
+{
+
+
+
+/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vu32::Vu32 (Scalar a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a, a, a, a }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set1_epi32 (int32_t (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { vdupq_n_u32 (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vu32::Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { a0, a1, a2, a3 }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (a3, a2, a1, a0) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { a0, a1, a2, a3 }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+// Returns a0 | a1 | a2 | a3
+Vu32::Vu32 (const std::tuple <Scalar, Scalar, Scalar, Scalar> &a) noexcept
+#if ! defined (fstb_HAS_SIMD)
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#elif fstb_ARCHI == fstb_ARCHI_X86
+: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) }
+#endif // fstb_ARCHI
+{
+ // Nothing
+}
+
+
+
+template <typename MEM>
+void Vu32::store (MEM *ptr) const noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast <Vu32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u32 (reinterpret_cast <uint32_t *> (ptr), _x);
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+void Vu32::storeu (MEM *ptr) const noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ *reinterpret_cast <Vu32Native *> (ptr) = _x;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ vst1q_u8 (reinterpret_cast <uint8_t *> (ptr), vreinterpretq_u8_u32 (_x));
+#endif // fstb_ARCHI
+}
+
+
+
+// n = number of scalars to store (from the LSB)
+template <typename MEM>
+void Vu32::storeu_part (MEM *ptr, int n) const noexcept
+{
+ assert (n > 0);
+
+ if (n >= _length)
+ {
+ storeu (ptr);
+ return;
+ }
+
+ uint32_t * f_ptr = reinterpret_cast <uint32_t *> (ptr);
+
+#if ! defined (fstb_HAS_SIMD)
+
+ for (int i = 0; i < n; ++i)
+ {
+ f_ptr [i] = _x [i];
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_X86
+
+ f_ptr [0] = uint32_t (_mm_cvtsi128_si32 (_x));
+ if (n >= 2)
+ {
+ f_ptr [1] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0)));
+ if (n >= 3)
+ {
+ f_ptr [2] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0)));
+ }
+ }
+
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+
+ vst1q_lane_u32 (f_ptr + 0, _x, 0);
+ if (n >= 2)
+ {
+ vst1q_lane_u32 (f_ptr + 1, _x, 1);
+ if (n >= 3)
+ {
+ vst1q_lane_u32 (f_ptr + 2, _x, 2);
+ }
+ }
+
+#endif
+}
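+
+// Usage example for storeu_part: if buf points to at least 3 writable
+// uint32_t, Vu32 (11, 22, 33, 44).storeu_part (buf, 3) writes 11, 22 and 33
+// to buf [0], buf [1] and buf [2] and leaves the following memory untouched.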
+
+
+
+// Works only with well-formed condition results (tested bits depend on the
+// implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+Vu32::operator bool () const noexcept
+{
+ return and_h ();
+}
+
+
+
+Vu32 & Vu32::operator += (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] += other [0];
+ _x [1] += other [1];
+ _x [2] += other [2];
+ _x [3] += other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_add_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vaddq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator -= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] -= other [0];
+ _x [1] -= other [1];
+ _x [2] -= other [2];
+ _x [3] -= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_sub_epi32 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vsubq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator *= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other [0];
+ _x [1] *= other [1];
+ _x [2] *= other [2];
+ _x [3] *= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // Emulation of _mm_mullo_epi32 (SSE4.1)
+# if fstb_COMPILER == fstb_COMPILER_MSVC
+ // For some reason this code is slightly faster on MSVC
+ auto p02_64 = _mm_mul_epu32 (_x, other);
+ auto p13_64 = _mm_mul_epu32 (
+ _mm_srli_si128 (_x , 4),
+ _mm_srli_si128 (other, 4)
+ );
+ p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2));
+ p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2));
+ _x = _mm_unpacklo_epi32 (p02_64, p13_64);
+# else
+ // Code of this function shamelessly borrowed from tp7
+ // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h
+ // This code is faster on GCC/Clang
+ const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1)
+ const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1)
+ const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0)
+ const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1)
+ const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0)
+ const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2)
+ _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0)
+# endif // fstb_COMPILER
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator *= (const Scalar &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] *= other;
+ _x [1] *= other;
+ _x [2] *= other;
+ _x [3] *= other;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto vb = _mm_set1_epi32 (int32_t (other));
+ auto v0 = _mm_shuffle_epi32 (_x, (0<<0) | (1<<4));
+ auto v1 = _mm_shuffle_epi32 (_x, (2<<0) | (3<<4));
+ v0 = _mm_mul_epu32 (v0, vb);
+ v1 = _mm_mul_epu32 (v1, vb);
+ _x = _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (v0),
+ _mm_castsi128_ps (v1),
+ (0<<0) | (2<<2) | (0<<4) | (2<<6)
+ ));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vmulq_u32 (_x, vdupq_n_u32 (other));
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator &= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] &= other [0];
+ _x [1] &= other [1];
+ _x [2] &= other [2];
+ _x [3] &= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_and_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vandq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator |= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] |= other [0];
+ _x [1] |= other [1];
+ _x [2] |= other [2];
+ _x [3] |= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_or_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = vorrq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator ^= (const Vu32Native &other) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] ^= other [0];
+ _x [1] ^= other [1];
+ _x [2] ^= other [2];
+ _x [3] ^= other [3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_xor_si128 (_x, other);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x = veorq_u32 (_x, other);
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator <<= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] <<= imm;
+ _x [1] <<= imm;
+ _x [2] <<= imm;
+ _x [3] <<= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_slli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x <<= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 & Vu32::operator >>= (int imm) noexcept
+{
+ assert (imm >= 0);
+ assert (imm <= 32);
+#if ! defined (fstb_HAS_SIMD)
+ _x [0] >>= imm;
+ _x [1] >>= imm;
+ _x [2] >>= imm;
+ _x [3] >>= imm;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ _x = _mm_srli_epi32 (_x, imm);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ _x >>= imm;
+#endif // fstb_ARCHI
+ return *this;
+}
+
+
+
+Vu32 Vu32::operator - () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ -_x [0],
+ -_x [1],
+ -_x [2],
+ -_x [3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (_mm_setzero_si128 (), _x);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_u32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::operator ~ () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ ~(_x [0]),
+ ~(_x [1]),
+ ~(_x [2]),
+ ~(_x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_xor_si128 (_x, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_u32 (_x);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::reverse () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { _x [3], _x [2], _x [1], _x [0] };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vrev64q_u32 (vcombine_u32 (vget_high_u32 (_x), vget_low_u32 (_x)));
+#endif // fstb_ARCHI
+}
+
+
+
+// Positive = left
+template <int SHIFT>
+Vu32 Vu32::rotate () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ _x [(0 - SHIFT) & 3],
+ _x [(1 - SHIFT) & 3],
+ _x [(2 - SHIFT) & 3],
+ _x [(3 - SHIFT) & 3]
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (SHIFT & 3)
+ {
+ case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0));
+ case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0));
+ case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ default: return *this;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ switch (SHIFT & 3)
+ {
+ case 1: return vextq_u32 (_x, _x, 3);
+ case 2: return vextq_u32 (_x, _x, 2);
+ case 3: return vextq_u32 (_x, _x, 1);
+ default: return *this;
+ }
+#endif // fstb_ARCHI
+}
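+
+// Example for rotate: Vu32 (a0, a1, a2, a3).rotate <1> () gives
+// Vu32 (a3, a0, a1, a2): each element moves up one lane and the top lane
+// wraps around to lane 0.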
+
+
+
+template <int POS>
+uint32_t Vu32::extract () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return _x [POS & 3];
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = _x;
+ switch (POS & 3)
+ {
+ case 1: a = _mm_shuffle_epi32 (a, 1); break;
+ case 2: a = _mm_shuffle_epi32 (a, 2); break;
+ case 3: a = _mm_shuffle_epi32 (a, 3); break;
+ default: /* Nothing */ break;
+ }
+ return Scalar (_mm_cvtsi128_si32 (a));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vgetq_lane_u32 (_x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vu32 Vu32::insert (uint32_t val) const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ auto a = *this;
+ a._x [POS & 3] = val;
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ auto a = rotate <(-POS) & 3> ();
+ a._x = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (_mm_set1_epi32 (int32_t (val)))
+ ));
+ return a.template rotate <POS> ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vsetq_lane_u32 (val, _x, POS & 3);
+#endif // fstb_ARCHI
+}
+
+
+
+template <int POS>
+Vu32 Vu32::spread () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 (extract <POS> ());
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (vgetq_lane_u32 (_x, POS & 3));
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::sum_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return (_x [0] + _x [2]) + (_x [1] + _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // s = v3,v2,v1,v0
+ const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6));
+ const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0
+ return uint32_t (
+ _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0)))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ #if fstb_WORD_SIZE == 64
+ return vaddvq_u32 (_x);
+ #else
+ uint32x2_t v2 = vadd_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpadd_u32 (v2, v2), 0);
+ #endif
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::min_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::min (
+ uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint32x2_t v2 = vmin_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpmin_u32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
+
+
+
+uint32_t Vu32::max_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3]));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2));
+ const auto v1 = _mm_shuffle_epi32 (v0, 1);
+ return std::max (
+ uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint32x2_t v2 = vmax_u32 (vget_high_u32 (_x), vget_low_u32 (_x));
+ return vget_lane_u32 (vpmax_u32 (v2, v2), 0);
+#endif // fstb_ARCHI
+}
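+
+// Example for the horizontal reductions: Vu32 (1, 2, 3, 4) gives
+// sum_h () == 10, min_h () == 1 and max_h () == 4.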
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vu32::and_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const uint32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]);
+ return (t == uint32_t (-1));
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) == 0xFFFF);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x));
+ return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU
+ && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU);
+#endif // fstb_ARCHI
+}
+
+
+
+// Works only with well-formed condition results (tested bits depend on the implementation).
+// For each scalar, true = all bits set, false = all bits cleared
+bool Vu32::or_h () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ const uint32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]);
+ return (t != 0);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return (_mm_movemask_epi8 (_x) != 0);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x));
+ return ( vget_lane_u32 (tmp, 0) != 0
+ || vget_lane_u32 (tmp, 1) != 0);
+#endif // fstb_ARCHI
+}
+
+
+
+// Moves the boolean content of each of the 4 scalars into the lower 4 bits
+// of the return value.
+// Assumes the object is a result of a comparison, with all bits the same
+// in each 32-bit element.
+unsigned int Vu32::movemask () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return
+ (_x [0] >> 31)
+ | ((_x [1] >> 30) & 2)
+ | ((_x [2] >> 29) & 4)
+ | ((_x [3] >> 28) & 8);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return static_cast <unsigned int> (_mm_movemask_ps (_mm_castsi128_ps (_x)));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ uint64x2_t tmp1 =
+ vreinterpretq_u64_u32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa
+ tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba
+ uint64x1_t tmp2 = vsli_n_u64 (
+ vget_high_u64 (tmp1),
+ vget_low_u64 (tmp1),
+ 2
+ );
+ return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF;
+#endif // fstb_ARCHI
+}
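+
+// Example for movemask: Vu32::set_mask (true, false, true, false).movemask ()
+// returns 0b0101; bit i of the result holds the condition of lane i.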
+
+
+
+int Vu32::count_bits () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555);
+ uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555);
+ uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555);
+ uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555);
+ v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333);
+ v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
+ v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
+ v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333);
+ const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24;
+ return (c0 + c2) + (c1 + c3);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
+ static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77);
+ static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F);
+ // Count bits in each 4-bit field.
+ auto x = _x;
+ auto n = _mm_srli_epi64 (x, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ x = _mm_sub_epi8 (x, n);
+ n = _mm_srli_epi64 (n, 1);
+ n = _mm_and_si128 (popcount_mask1, n);
+ n = _mm_sub_epi8 (x, n);
+ n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4));
+ n = _mm_and_si128 (popcount_mask2, n);
+ // Counts the number of bits in the low and high 64-bit parts
+ n = _mm_sad_epu8 (n, _mm_setzero_si128 ());
+ // Counts the number of bits in the whole 128-bit register
+ n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n));
+ return _mm_cvtsi128_si32 (n);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_u32 (_x));
+ const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8);
+ const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16);
+ const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32);
+ const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64);
+ return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2);
+#endif // fstb_ARCHI
+}
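+
+// Example for count_bits: Vu32 (0xF, 0x3, 0x1, 0x0).count_bits () == 7
+// (4 + 2 + 1 + 0 bits set across the four lanes).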
+
+
+
+std::tuple <uint32_t, uint32_t, uint32_t, uint32_t> Vu32::explode () const noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return std::make_tuple (_x [0], _x [1], _x [2], _x [3]);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return std::make_tuple (
+ uint32_t (_mm_cvtsi128_si32 (_x )),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (1<<0)))),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (2<<0)))),
+ uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (3<<0))))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ vgetq_lane_u32 (_x, 0),
+ vgetq_lane_u32 (_x, 1),
+ vgetq_lane_u32 (_x, 2),
+ vgetq_lane_u32 (_x, 3)
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::zero () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { 0, 0, 0, 0 };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_setzero_si128 ();
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (0);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 Vu32::all1 () noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 { ~Scalar (0), ~Scalar (0), ~Scalar (0), ~Scalar (0) };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_set1_epi32 (-1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vdupq_n_u32 (~Scalar (0));
+#endif // fstb_ARCHI
+}
+
+
+
+// "true" must be 1 and nothing else.
+Vu32 Vu32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ -uint32_t (m0),
+ -uint32_t (m1),
+ -uint32_t (m2),
+ -uint32_t (m3),
+ };
+#elif 1 // Fast version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_setzero_si128 (),
+ _mm_set_epi32 (m3, m2, m1, m0)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (m0);
+ float32x2_t v23 = vdup_n_f32 (m2);
+ v01 = vset_lane_f32 (m1, v01, 1);
+ v23 = vset_lane_f32 (m3, v23, 1);
+ return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_f32 (
+ vcombine_f32 (v01, v23)
+ )));
+# endif // fstb_ARCHI
+#else // Safer but slower version
+# if fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_sub_epi32 (
+ _mm_set_epi32 (!m3, !m2, !m1, !m0),
+ _mm_set1_epi32 (1)
+ );
+# elif fstb_ARCHI == fstb_ARCHI_ARM
+ float32x2_t v01 = vdup_n_f32 (!m0);
+ float32x2_t v23 = vdup_n_f32 (!m2);
+ v01 = vset_lane_f32 (!m1, v01, 1);
+ v23 = vset_lane_f32 (!m3, v23, 1);
+ const auto one = vdupq_n_u32 (1);
+ return vsubq_u32 (
+ vreinterpretq_u32_f32 (vcombine_f32 (v01, v23)),
+ one
+ );
+# endif // fstb_ARCHI
+#endif // Versions
+}
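+
+// Example for set_mask: set_mask (true, false, false, true) builds
+// { 0xFFFFFFFF, 0, 0, 0xFFFFFFFF }, a mask directly usable with select () or
+// swap_if ().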
+
+
+
+// Extracts the vector at the position POS from the double-width vector {a b}
+// Concatenates a [POS...3] with b [0...3-POS]
+template <int POS>
+Vu32 Vu32::compose (Vu32 a, Vu32 b) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ switch (POS & 3)
+ {
+ case 1: return Vu32 { a._x [1], a._x [2], a._x [3], b._x [0] };
+ case 2: return Vu32 { a._x [2], a._x [3], b._x [0], b._x [1] };
+ case 3: return Vu32 { a._x [3], b._x [0], b._x [1], b._x [2] };
+ default: return a;
+ }
+ return a;
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ switch (POS & 3)
+ {
+ case 1:
+ {
+ const auto tmp = _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x)
+ ));
+ return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0));
+ }
+ case 2:
+ return _mm_castps_si128 (_mm_shuffle_ps (
+ _mm_castsi128_ps (a._x),
+ _mm_castsi128_ps (b._x),
+ (1<<6) | (0<<4) | (3<<2) | (2<<0)
+ ));
+ case 3:
+ return _mm_castps_si128 (_mm_move_ss (
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ ),
+ _mm_castsi128_ps (
+ _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0))
+ )
+ ));
+ default:
+ return a;
+ }
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ if (POS == 0)
+ {
+ return a;
+ }
+ else
+ {
+ return vextq_u32 (a._x, b._x, POS);
+ }
+#endif // fstb_ARCHI
+}
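+
+// Example for compose: Vu32::compose <1> (Vu32 (a0, a1, a2, a3),
+// Vu32 (b0, b1, b2, b3)) gives Vu32 (a1, a2, a3, b0).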
+
+
+
+Vu32 Vu32::flip_msb (Vu32 x) noexcept
+{
+ return x ^ Vu32 (0x80000000U);
+}
+
+
+
+template <typename MEM>
+Vu32 Vu32::load (const MEM *ptr) noexcept
+{
+ assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN));
+
+#if ! defined (fstb_HAS_SIMD)
+ return *reinterpret_cast <const Vu32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_load_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vld1q_u32 (reinterpret_cast <const uint32_t *> (ptr));
+#endif // fstb_ARCHI
+}
+
+
+
+template <typename MEM>
+Vu32 Vu32::loadu (const MEM *ptr) noexcept
+{
+ assert (ptr != nullptr);
+
+#if ! defined (fstb_HAS_SIMD)
+ return *reinterpret_cast <const Vu32Native *> (ptr);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_loadu_si128 (reinterpret_cast <const __m128i *> (ptr));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vreinterpretq_u32_u8 (
+ vld1q_u8 (reinterpret_cast <const uint8_t *> (ptr))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
+
+
+
+Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs += rhs;
+ return lhs;
+}
+
+Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept
+{
+ lhs *= rhs;
+ return lhs;
+}
+
+Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs &= rhs;
+ return lhs;
+}
+
+Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs |= rhs;
+ return lhs;
+}
+
+Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept
+{
+ lhs ^= rhs;
+ return lhs;
+}
+
+
+
+template <typename T>
+Vu32 operator << (Vu32 lhs, T rhs) noexcept
+{
+ lhs <<= rhs;
+ return lhs;
+}
+
+template <typename T>
+Vu32 operator >> (Vu32 lhs, T rhs) noexcept
+{
+ lhs >>= rhs;
+ return lhs;
+}
+
+
+
+Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] == rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] == rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] == rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] == rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return _mm_cmpeq_epi32 (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vceqq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] != rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] != rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] != rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] != rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto eq = _mm_cmpeq_epi32 (lhs, rhs);
+ return _mm_xor_si128 (eq, _mm_set1_epi32 (-1));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmvnq_u32 (vceqq_u32 (lhs, rhs));
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] < rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] < rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] < rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] < rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ return Vu32::flip_msb (_mm_cmplt_epi32 (
+ Vu32::flip_msb (lhs), Vu32::flip_msb (rhs)
+ ));
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcltq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
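+
+// Note on the x86 path above: SSE2 has no unsigned 32-bit comparison, so the
+// sign bit of both operands is flipped (flip_msb) and the signed comparison
+// _mm_cmplt_epi32 is used instead; flipping the MSB maps the unsigned order
+// onto the signed order.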
+
+
+
+Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ (lhs._x [0] <= rhs._x [0]) ? uint32_t (-1) : 0,
+ (lhs._x [1] <= rhs._x [1]) ? uint32_t (-1) : 0,
+ (lhs._x [2] <= rhs._x [2]) ? uint32_t (-1) : 0,
+ (lhs._x [3] <= rhs._x [3]) ? uint32_t (-1) : 0
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+# if 1
+ return (lhs < rhs) | (lhs == rhs);
+# else
+ return ~(lhs > rhs);
+# endif
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vcleq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+ return (rhs < lhs);
+}
+
+
+
+Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+ return (rhs <= lhs);
+}
+
+
+
+Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ std::min (lhs._x [0], rhs._x [0]),
+ std::min (lhs._x [1], rhs._x [1]),
+ std::min (lhs._x [2], rhs._x [2]),
+ std::min (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto gt = (lhs > rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( gt, rhs),
+ _mm_andnot_si128 (gt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vminq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ return Vu32 {
+ std::max (lhs._x [0], rhs._x [0]),
+ std::max (lhs._x [1], rhs._x [1]),
+ std::max (lhs._x [2], rhs._x [2]),
+ std::max (lhs._x [3], rhs._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto lt = (lhs < rhs);
+ return _mm_or_si128 (
+ _mm_and_si128 ( lt, rhs),
+ _mm_andnot_si128 (lt, lhs)
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vmaxq_u32 (lhs, rhs);
+#endif // fstb_ARCHI
+}
+
+
+
+Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept
+{
+ return min (max (v, mi), ma);
+}
+
+
+
+Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/
+ return Vu32 {
+ (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]),
+ (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]),
+ (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]),
+ (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3])
+ };
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto cond_1 = _mm_and_si128 (cond, v_t);
+ const auto cond_0 = _mm_andnot_si128 (cond, v_f);
+ return _mm_or_si128 (cond_0, cond_1);
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return vbslq_u32 (cond, v_t, v_f);
+#endif // fstb_ARCHI
+}
+
+
+
+std::tuple <Vu32, Vu32> swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept
+{
+#if ! defined (fstb_HAS_SIMD)
+ if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); }
+ if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); }
+ if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); }
+ if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); }
+ return std::make_tuple (lhs, rhs);
+#elif fstb_ARCHI == fstb_ARCHI_X86
+ const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond);
+ return std::make_tuple (
+ Vu32 (_mm_xor_si128 (lhs, inv)),
+ Vu32 (_mm_xor_si128 (rhs, inv))
+ );
+#elif fstb_ARCHI == fstb_ARCHI_ARM
+ return std::make_tuple (
+ Vu32 (vbslq_u32 (cond, rhs, lhs)),
+ Vu32 (vbslq_u32 (cond, lhs, rhs))
+ );
+#endif // fstb_ARCHI
+}
+
+
+
+} // namespace fstb
+
+
+
+#endif // fstb_Vu32_CODEHEADER_INCLUDED
+
+
+
+/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/
diff --git a/src/fstb/fnc.h b/src/fstb/fnc.h
index 1eee4ae..438ac12 100644
--- a/src/fstb/fnc.h
+++ b/src/fstb/fnc.h
@@ -72,6 +72,7 @@ template <class T>
inline constexpr bool is_eq (T v1, T v2, T eps = T (1e-9)) noexcept;
template <class T>
inline constexpr bool is_eq_rel (T v1, T v2, T tol = T (1e-6)) noexcept;
+inline constexpr bool is_eq_ulp (float v1, float v2, int32_t tol = 1) noexcept;
inline int get_prev_pow_2 (uint32_t x) noexcept;
inline int get_next_pow_2 (uint32_t x) noexcept;
inline constexpr double sinc (double x) noexcept;
diff --git a/src/fstb/fnc.hpp b/src/fstb/fnc.hpp
index 9ecb26b..c8b1b4c 100644
--- a/src/fstb/fnc.hpp
+++ b/src/fstb/fnc.hpp
@@ -565,6 +565,32 @@ constexpr bool is_eq_rel (T v1, T v2, T tol) noexcept
+// Equality test with a tolerance in ULP.
+// Numbers of opposite sign (excepted 0) are always evaluated as different.
+// https://en.wikipedia.org/wiki/Unit_in_the_last_place
+constexpr bool is_eq_ulp (float v1, float v2, int32_t tol) noexcept
+{
+ assert (tol >= 0);
+
+ if ((v1 < 0) != (v2 < 0))
+ {
+ return (v1 == v2);
+ }
+
+ union Combo
+ {
+ float _f;
+ int32_t _i;
+ };
+ const Combo c1 { v1 };
+ const Combo c2 { v2 };
+ const auto dif = std::abs (c2._i - c1._i);
+
+ return (dif <= tol);
+}
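+
+// Example: with the default tolerance of 1 ULP,
+// is_eq_ulp (1.0f, std::nextafter (1.0f, 2.0f)) is true, whereas
+// is_eq_ulp (1.0f, 1.0f + 1e-6f) is false (the values are about 8 ULP apart).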
+
+
+
/*
==============================================================================
Name: get_prev_pow2
@@ -1000,6 +1026,7 @@ constexpr T lerp (T v0, T v1, T p) noexcept
// f(x) = ((r3 + r1) / 2 - r2) * x^2 + ((r3 - r1) / 2) * x + r2
// The points must not be aligned so the extremum exists.
// It is not necessariy located between -1 and 1.
+// The value at this point is y = r2 + 0.25 * x * (r3 - r1)
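+// (With f(x) = a*x^2 + b*x + r2 and b = (r3 - r1) / 2, the extremum at
+// x = -b / (2*a) has value f(x) = r2 - b^2 / (4*a) = r2 + x * b / 2.)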
template <class T>
constexpr T find_extremum_pos_parabolic (T r1, T r2, T r3) noexcept
{
diff --git a/src/main-avs.cpp b/src/main-avs.cpp
index dd70871..b8350c0 100644
--- a/src/main-avs.cpp
+++ b/src/main-avs.cpp
@@ -1,7 +1,8 @@
-
+#if defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#define NOGDI
+#endif
#include "avsutl/fnc.h"
#include "fmtcavs/Bitdepth.h"
@@ -13,13 +14,24 @@
#include "fmtcavs/Transfer.h"
#include "fstb/def.h"
+#if defined (_WIN32)
+#include <windows.h>
+#else
+#include "avs/posix.h"
+#endif
#include "avisynth.h"
#if defined (_MSC_VER) && ! defined (NDEBUG) && defined (_DEBUG)
#include <crtdbg.h>
#endif
+#if defined (_WIN32)
+ #define AVS_EXPORT __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
+ #define AVS_EXPORT __attribute__((visibility("default")))
+#else
+ #define AVS_EXPORT
+#endif
template <class T>
@@ -34,7 +46,7 @@ ::AVSValue __cdecl main_avs_create (::AVSValue args, void *user_data_ptr, ::IScr
const ::AVS_Linkage * AVS_linkage = nullptr;
-extern "C" __declspec (dllexport)
+extern "C" AVS_EXPORT
const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const ::AVS_Linkage * const vectors_ptr)
{
AVS_linkage = vectors_ptr;
@@ -60,7 +72,8 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
env_ptr->AddFunction (fmtcavs_PRIMARIES,
"c" "[rs].+" "[gs].+" "[bs].+" // 0
"[ws].+" "[rd].+" "[gd].+" "[bd].+" // 4
- "[wd].+" "[prims]s" "[primd]s" "[cpuopt]i" // 8
+ "[wd].+" "[prims]s" "[primd]s" "[wconv]b" // 8
+ "[cpuopt]i" // 12
, &main_avs_create , nullptr
);
env_ptr->AddFunction (fmtcavs_RESAMPLE,
@@ -94,7 +107,7 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
}
-
+#if defined (_WIN32)
static void main_avs_dll_load (::HINSTANCE hinst)
{
fstb::unused (hinst);
@@ -156,3 +169,4 @@ BOOL WINAPI DllMain (::HINSTANCE hinst, ::DWORD reason, ::LPVOID reserved_ptr)
return TRUE;
}
+#endif
diff --git a/src/main-vs.cpp b/src/main-vs.cpp
index 2de215c..3283c3a 100644
--- a/src/main-vs.cpp
+++ b/src/main-vs.cpp
@@ -386,6 +386,7 @@ VS_EXTERNAL_API (void) VapourSynthPluginInit2 (::VSPlugin *plugin_ptr, const ::V
"wd:float[]:opt;"
"prims:data:opt;"
"primd:data:opt;"
+ "wconv:int:opt;"
"cpuopt:int:opt;"
, "clip:vnode;"
, &vsutl::Redirect ::create, nullptr, plugin_ptr