diff --git a/build/unix/Makefile.am b/build/unix/Makefile.am index 1ede141..e6faebb 100644 --- a/build/unix/Makefile.am +++ b/build/unix/Makefile.am @@ -293,14 +293,31 @@ commonsrc = \ ../../src/fstb/Hash.hpp \ ../../src/fstb/SingleObj.h \ ../../src/fstb/SingleObj.hpp \ + ../../src/fstb/Vf32.h \ + ../../src/fstb/Vf32.hpp \ + ../../src/fstb/Vs32.h \ + ../../src/fstb/Vs32.hpp \ + ../../src/fstb/Vu32.h \ + ../../src/fstb/Vu32.hpp \ ../../src/avstp.h \ ../../src/AvstpWrapper.cpp \ ../../src/AvstpWrapper.h libfmtconv_la_SOURCES = $(commonsrc) \ + ../../src/avisynth.h \ + ../../src/main-avs.cpp \ ../../src/main-vs.cpp \ ../../src/types.h \ ../../src/VapourSynth4.h \ + ../../src/avs/alignment.h \ + ../../src/avs/capi.h \ + ../../src/avs/config.h \ + ../../src/avs/cpuid.h \ + ../../src/avs/filesystem.h \ + ../../src/avs/minmax.h \ + ../../src/avs/posix.h \ + ../../src/avs/types.h \ + ../../src/avs/win.h \ ../../src/fmtc/Bitdepth_vs.cpp \ ../../src/fmtc/Bitdepth.h \ ../../src/fmtc/Convert.cpp \ @@ -326,6 +343,40 @@ libfmtconv_la_SOURCES = $(commonsrc) \ ../../src/fmtc/Transfer_vs.cpp \ ../../src/fmtc/Transfer.h \ ../../src/fmtc/version.h \ + ../../src/fmtcavs/Bitdepth_avs.cpp \ + ../../src/fmtcavs/Bitdepth.h \ + ../../src/fmtcavs/CpuOpt_avs.cpp \ + ../../src/fmtcavs/CpuOpt.h \ + ../../src/fmtcavs/FmtAvs.cpp \ + ../../src/fmtcavs/FmtAvs.h \ + ../../src/fmtcavs/fnc_fmtcavs.cpp \ + ../../src/fmtcavs/fnc.h \ + ../../src/fmtcavs/function_names.h \ + ../../src/fmtcavs/Matrix2020CL_avs.cpp \ + ../../src/fmtcavs/Matrix2020CL.h \ + ../../src/fmtcavs/Matrix_avs.cpp \ + ../../src/fmtcavs/Matrix.h \ + ../../src/fmtcavs/Primaries_avs.cpp \ + ../../src/fmtcavs/Primaries.h \ + ../../src/fmtcavs/ProcAlpha.cpp \ + ../../src/fmtcavs/ProcAlpha.h \ + ../../src/fmtcavs/Resample_avs.cpp \ + ../../src/fmtcavs/Resample.h \ + ../../src/fmtcavs/Transfer_avs.cpp \ + ../../src/fmtcavs/Transfer.h \ + ../../src/avsutl/CsPlane.cpp \ + ../../src/avsutl/CsPlane.h \ + ../../src/avsutl/fnc_avsutl.cpp \ + ../../src/avsutl/fnc.h \ + ../../src/avsutl/fnc.hpp \ + ../../src/avsutl/PlaneProcCbInterface_avs.cpp \ + ../../src/avsutl/PlaneProcCbInterface.h \ + ../../src/avsutl/PlaneProcessor_avs.cpp \ + ../../src/avsutl/PlaneProcessor.h \ + ../../src/avsutl/PlaneProcMode.h \ + ../../src/avsutl/TFlag.h \ + ../../src/avsutl/VideoFilterBase.cpp \ + ../../src/avsutl/VideoFilterBase.h \ ../../src/vsutl/FilterBase.cpp \ ../../src/vsutl/FilterBase.h \ ../../src/vsutl/fnc_vsutl.cpp \ diff --git a/build/unix/configure.ac b/build/unix/configure.ac index 00b836b..54e060e 100644 --- a/build/unix/configure.ac +++ b/build/unix/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([fmtconv], [r29], [http://forum.doom9.org/showthread.php?t=166504], [fmtconv], [http://forum.doom9.org/showthread.php?t=166504]) +AC_INIT([fmtconv], [r30], [http://forum.doom9.org/showthread.php?t=166504], [fmtconv], [http://forum.doom9.org/showthread.php?t=166504]) AC_CONFIG_MACRO_DIR([m4]) AM_INIT_AUTOMAKE([foreign no-dist-gzip dist-xz subdir-objects no-define]) diff --git a/build/win/common/common.vcxproj b/build/win/common/common.vcxproj index 2914165..632e152 100644 --- a/build/win/common/common.vcxproj +++ b/build/win/common/common.vcxproj @@ -303,6 +303,12 @@ + + + + + + diff --git a/build/win/common/common.vcxproj.filters b/build/win/common/common.vcxproj.filters index d9a0c0e..3ab2a4f 100644 --- a/build/win/common/common.vcxproj.filters +++ b/build/win/common/common.vcxproj.filters @@ -749,6 +749,24 @@ fmtcl + + fstb + + + fstb + + + fstb + + + fstb + + + fstb + + + fstb + diff --git 
a/doc/fmtconv.html b/doc/fmtconv.html index f9c4fb8..1be2df5 100644 --- a/doc/fmtconv.html +++ b/doc/fmtconv.html @@ -15,7 +15,7 @@

Abstract

- + @@ -129,7 +129,7 @@

Resizing and chroma subsampling conversions

Bobbing an interlaced stream (here, Top Field First):

c = c.std.SeparateFields (tff=True)
-c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=True, interlacedd=False)
+c = c.fmtc.resample (scalev=2, kernel="cubic", interlaced=1, interlacedd=0)

Converting a progressive stream from YUV 4:2:2 to 4:2:0 and back to 8 bits:

@@ -140,7 +140,7 @@

Resizing and chroma subsampling conversions

tff = True
 c = c.std.SeparateFields (tff=tff)
-c = c.fmtc.resample (css="420", interlaced=True)
+c = c.fmtc.resample (css="420", interlaced=1)
 c = c.fmtc.bitdepth (bits=8)
 c = c.std.DoubleWeave (tff=tff)
 c = c.std.SelectEvery (cycle=2, offsets=0)
@@ -623,7 +623,7 @@

matrix

primaries to perform the intermediary conversion.

The _ColorRange frame property is set if the fulld -parameter has been explicitely defined. +parameter has been explicitly defined or if a preset is used. If the destination colorspace is a standardized one (as deduced from the specified matrix), the _Matrix and _ColorSpace properties are set, otherwise they are deleted from the frame.
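For instance, with r30 the following call (a minimal sketch; "601" is assumed here to be one of the available mat presets) is enough to get the property written:

c = c.fmtc.matrix (mat="601")

Because a preset is used, the output frames now carry _ColorRange even though fulld was left at its default.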

@@ -959,6 +959,7 @@

primaries

wd : float[]: opt; prims : data : opt; primd : data : opt; + wconv : int : opt; (False) cpuopt: int : opt; (-1) ) @@ -1070,6 +1072,21 @@

Parameters

Authors:  Firesledge (aka Cretindesalpes)
-Version:  r29
+Version:  r30
Download:  http://ldesoras.free.fr/prod.html
Category:  Format tools
Requirements: Vapoursynth r55 or Avisynth+ 3.7.0
fmtc_primaries (
@@ -973,6 +974,7 @@ 

primaries

arrayf wd (undefined), string prims (undefined), string primd (undefined), + bool wconv (False), int cpuopt (-1) )
"redwide"R
G
B
W (D65)
0.780308,
0.121595,
0.095612,
0.3217,
0.304253
1.493994
−0.084589
0.3290
REDWideGamutRGB
+

wconv

+

Indicates we want a full conversion for the white point.

+

If set to False, chromatic adaptation will be used, so the +white will stay white on the destination illuminant and colors will be adapted +to implement a real illuminant change. +This is generally what you want when converting between gamuts: the eye adapts +to the new white and colors should be matched accordingly.

+

If set to True, the chromatic adaptation is bypassed. +The white from the source colorspace will appear with a tint if the target +colorspace has a different white point. +Use this to emulate how a picture displayed on a monitor using the source +illuminant would look on a display using the target illuminant. +This is also what you want when converting to and from XYZ for further +operations in this colorspace.
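As a short script-level illustration (the preset names "709" and "dcip3" are assumptions here, picked because their white points differ, D65 versus the DCI white):

c = c.fmtc.primaries (prims="709", primd="dcip3")
c = c.fmtc.primaries (prims="709", primd="dcip3", wconv=1)

The first call performs the usual gamut conversion with chromatic adaptation, so white stays white; the second one bypasses the adaptation, so the D65 source white comes out with the tint of the DCI illuminant.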

+

cpuopt

Limits the CPU instruction set. −1: automatic (no limitation), @@ -1275,7 +1292,7 @@

Parameters

Clip to be resized. Mandatory. Supported input formats:

    -
  • 8-, 9-, 10-, 12-, 16- and 16-bit integer.
  • +
  • 8-, 9-, 10-, 12-, 14- and 16-bit integer.
  • 32-bit floating point.
  • Any planar colorspace.
@@ -1872,9 +1889,15 @@

Parameters

Indicate the peak white levels in cd/m2. lws is for the source transfer function, and lwd for the destination one. -These parameters are taken into account when display-referred transfer -functions are used. -Minimum lw value is 0.1 cd/m2. +These parameters are taken into account to scale the luminance when the +following conditions are met:

+
    +
  • display-referred transfer functions are used, +
  • match is set to 2 (display luminance matching) and
  • +
  • the EOTF doesn’t specify any scale for the luminance (the cd/m² value +for F’ = 1.0).
  • +
+

Minimum lw value is 0.1 cd/m2. System gamma may be changed according to the lw parameter. Unless specified, HDR functions use a peak white of 1000 cd/m2. Similarly, SDR and other functions use 100 cd/m2 by default. @@ -1973,11 +1996,19 @@
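For example (an illustrative sketch; "1886" and "srgb" are assumed to be valid presets for relative, display-referred transfer functions):

c = c.fmtc.transfer (transs="1886", transd="srgb", lws=100, lwd=203, match=2)

Here the three conditions above are met, so the luminance is rescaled from a 100 cd/m² peak white to a 203 cd/m² one. With an absolute EOTF such as PQ, which fixes the cd/m² value for F’ = 1.0, this particular rescaling would not be applied.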

IV) Troubleshooting

V) Changelog

-

r30, 2022-xx-xx

+

r31, 202x-xx-xx

+
    +
  • resample: fixed the 14- to 16-bit AVX2 conversion path, thanks to NSQY for the report.
  • +
+ +

r30, 2022-08-29

    +
  • matrix: The _ColorRange frame property is now set when a matrix preset is used.
  • transfer: Added ACEScct transfer function.
  • primaries: Added DCI P3+ and Cinema Gamut presets.
  • -
  • Changed the configure options to compile with Clang.
  • +
  • primaries: Added wconv parameter for full conversion.
  • +
  • Changed the configure options to compile with Clang.
  • +
  • Updated datatypes in the examples.

r29, 2022-04-11

diff --git a/src/avisynth.h b/src/avisynth.h index 679ac2a..b779c49 100644 --- a/src/avisynth.h +++ b/src/avisynth.h @@ -442,17 +442,17 @@ extern const AVS_Linkage* AVS_linkage; # endif # define AVS_BakedCode(arg) { arg ; } -# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg)) -# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg)) -# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg)) +# define AVS_LinkCall(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? 0 : (this->*(AVS_linkage->arg)) +# define AVS_LinkCall_Void(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (void)0 : (this->*(AVS_linkage->arg)) +# define AVS_LinkCallV(arg) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? *this : (this->*(AVS_linkage->arg)) // Helper macros for fallback option when a function does not exists #define CALL_MEMBER_FN(object,ptrToMember) ((object)->*(ptrToMember)) #define AVS_LinkCallOpt(arg, argOpt) !AVS_linkage ? 0 : \ - ( offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? \ - (offsetof(AVS_Linkage, argOpt) >= AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \ + ( offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? \ + (offsetof(AVS_Linkage, argOpt) >= (size_t)AVS_linkage->Size ? 0 : CALL_MEMBER_FN(this, AVS_linkage->argOpt)() ) : \ CALL_MEMBER_FN(this, AVS_linkage->arg)() ) // AVS_LinkCallOptDefault puts automatically () only after arg -# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))()) +# define AVS_LinkCallOptDefault(arg, argDefaultValue) !AVS_linkage || offsetof(AVS_Linkage, arg) >= (size_t)AVS_linkage->Size ? (argDefaultValue) : ((this->*(AVS_linkage->arg))()) #endif @@ -1299,7 +1299,7 @@ class GenericVideoFilter : public IClip { void __stdcall GetAudio(void* buf, int64_t start, int64_t count, IScriptEnvironment* env) { child->GetAudio(buf, start, count, env); } const VideoInfo& __stdcall GetVideoInfo() { return vi; } bool __stdcall GetParity(int n) { return child->GetParity(n); } - int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; }; // We do not pass cache requests upwards, only to the next filter. + int __stdcall SetCacheHints(int cachehints, int frame_range) { AVS_UNUSED(cachehints); AVS_UNUSED(frame_range); return 0; } // We do not pass cache requests upwards, only to the next filter. }; @@ -1864,7 +1864,7 @@ struct PNeoEnv { #if defined(BUILDING_AVSCORE) || defined(AVS_STATIC_LIB) ; #else - : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= AVS_linkage->Size ? 0 : AVS_linkage->GetNeoEnv(env)) { } + : p(!AVS_linkage || offsetof(AVS_Linkage, GetNeoEnv) >= (size_t)AVS_linkage->Size ? 
0 : AVS_linkage->GetNeoEnv(env)) { } #endif int operator!() const { return !p; } diff --git a/src/fmtc/Matrix_vs.cpp b/src/fmtc/Matrix_vs.cpp index 07c8c8c..2c08360 100644 --- a/src/fmtc/Matrix_vs.cpp +++ b/src/fmtc/Matrix_vs.cpp @@ -146,6 +146,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC const int nbr_expected_coef = _nbr_planes * (_nbr_planes + 1); bool mat_init_flag = false; + bool preset_flag = false; // Matrix presets std::string mat (get_arg_str (in, out, "mat", "")); @@ -182,6 +183,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC _mat_main = m2d * m2s; mat_init_flag = true; + preset_flag = true; } // Custom coefficients @@ -309,6 +311,7 @@ Matrix::Matrix (const ::VSMap &in, ::VSMap &out, void * /*user_data_ptr*/, ::VSC vsutl::is_full_range_default (fmt_dst) ? 1 : 0, 0, &_range_set_dst_flag ) != 0); + _range_set_dst_flag |= preset_flag; prepare_matrix_coef ( *this, *_proc_uptr, _mat_main, diff --git a/src/fmtc/Primaries_vs.cpp b/src/fmtc/Primaries_vs.cpp index ced609e..e5e2089 100644 --- a/src/fmtc/Primaries_vs.cpp +++ b/src/fmtc/Primaries_vs.cpp @@ -105,8 +105,10 @@ Primaries::Primaries (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VS init (_prim_d, *this, in, out, "rd", "gd", "bd", "wd"); assert (_prim_d.is_ready ()); + const auto conv_flag = (get_arg_int (in, out, "wconv", 0) != 0); + const fmtcl::Mat3 mat_conv = - fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d); + fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag); _mat_main.insert3 (mat_conv); _mat_main.clean3 (1); diff --git a/src/fmtc/Resample_vs.cpp b/src/fmtc/Resample_vs.cpp index 3d7e649..971343f 100644 --- a/src/fmtc/Resample_vs.cpp +++ b/src/fmtc/Resample_vs.cpp @@ -628,7 +628,7 @@ const ::VSFrame * Resample::get_frame (int n, int activation_reason, void * &fra if (ret_val != 0) { _vsapi.freeFrame (dst_ptr); - dst_ptr = 0; + dst_ptr = nullptr; } } @@ -680,7 +680,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr { const Ru::FrameInfo & frame_info = *reinterpret_cast (frame_data_ptr); - process_plane_proc ( + ret_val = process_plane_proc ( dst, n, plane_index, frame_ctx, src_node1_sptr, frame_info ); } @@ -688,7 +688,7 @@ int Resample::do_process_plane (::VSFrame &dst, int n, int plane_index, void *fr // Copy (and convert) else if (proc_mode == vsutl::PlaneProcMode_COPY1) { - process_plane_copy ( + ret_val = process_plane_copy ( dst, n, plane_index, frame_ctx, src_node1_sptr ); } diff --git a/src/fmtc/version.h b/src/fmtc/version.h index 4a4d319..a209cf7 100644 --- a/src/fmtc/version.h +++ b/src/fmtc/version.h @@ -1,5 +1,5 @@ #pragma once -#define fmtc_VERSION 29 +#define fmtc_VERSION 30 #define fmtc_PLUGIN_NAME "fmtconv" #define fmtc_NAMESPACE "fmtc" diff --git a/src/fmtcavs/Matrix_avs.cpp b/src/fmtcavs/Matrix_avs.cpp index db75fcb..b673a79 100644 --- a/src/fmtcavs/Matrix_avs.cpp +++ b/src/fmtcavs/Matrix_avs.cpp @@ -73,7 +73,6 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args) { env.ThrowError (fmtcavs_MATRIX ": input must be 4:4:4."); } - const int nbr_planes_src = _vi_src.NumComponents (); if (fmt_src.get_nbr_comp_non_alpha () != _nbr_planes_proc) { env.ThrowError ( @@ -129,6 +128,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args) const int nbr_expected_coef = _nbr_planes_proc * (_nbr_planes_proc + 1); bool mat_init_flag = false; + bool preset_flag = false; fmtcl::Mat4 mat_main; // Main matrix, float input, float output // 
Matrix presets @@ -169,6 +169,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args) mat_main = m2d * m2s; mat_init_flag = true; + preset_flag = true; } // Alpha plane processing, if any @@ -267,7 +268,7 @@ Matrix::Matrix (::IScriptEnvironment &env, const ::AVSValue &args) _fulld_flag = args [Param_FULLD].AsBool ( fmtcl::is_full_range_default (fmt_dst.get_col_fam ()) ); - _range_def_flag = args [Param_FULLD].Defined (); + _range_def_flag = (args [Param_FULLD].Defined () || preset_flag); prepare_matrix_coef ( env, *_proc_uptr, mat_main, diff --git a/src/fmtcavs/Primaries.h b/src/fmtcavs/Primaries.h index e1d067b..e3d248e 100644 --- a/src/fmtcavs/Primaries.h +++ b/src/fmtcavs/Primaries.h @@ -62,6 +62,7 @@ class Primaries Param_WD, Param_PRIMS, Param_PRIMD, + Param_WCONV, Param_CPUOPT, Param_NBR_ELT diff --git a/src/fmtcavs/Primaries_avs.cpp b/src/fmtcavs/Primaries_avs.cpp index 85ae49f..989148a 100644 --- a/src/fmtcavs/Primaries_avs.cpp +++ b/src/fmtcavs/Primaries_avs.cpp @@ -110,8 +110,10 @@ Primaries::Primaries (::IScriptEnvironment &env, const ::AVSValue &args) init (_prim_d, env, args, Param_RD, Param_GD, Param_BD, Param_WD); assert (_prim_d.is_ready ()); + const auto conv_flag = args [Param_WCONV].AsBool (false); + const fmtcl::Mat3 mat_conv = - fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d); + fmtcl::PrimUtil::compute_conversion_matrix (_prim_s, _prim_d, conv_flag); _mat_main.insert3 (mat_conv); _mat_main.clean3 (1); diff --git a/src/fmtcl/BitBltConv_avx2.cpp b/src/fmtcl/BitBltConv_avx2.cpp index 3dd2e7c..cad5821 100644 --- a/src/fmtcl/BitBltConv_avx2.cpp +++ b/src/fmtcl/BitBltConv_avx2.cpp @@ -75,11 +75,13 @@ void BitBltConv::bitblt_int_to_flt_avx2_switch (uint8_t *dst_ptr, ptrdiff_t dst_ switch ((scale_flag << 16) + (src_fmt << 8) + src_res) { fmtcl_BitBltConv_CASE (false, INT16 , 16, i16) + fmtcl_BitBltConv_CASE (false, INT16 , 14, i16) fmtcl_BitBltConv_CASE (false, INT16 , 12, i16) fmtcl_BitBltConv_CASE (false, INT16 , 10, i16) fmtcl_BitBltConv_CASE (false, INT16 , 9, i16) fmtcl_BitBltConv_CASE (false, INT8 , 8, i08) fmtcl_BitBltConv_CASE (true , INT16 , 16, i16) + fmtcl_BitBltConv_CASE (true , INT16 , 14, i16) fmtcl_BitBltConv_CASE (true , INT16 , 12, i16) fmtcl_BitBltConv_CASE (true , INT16 , 10, i16) fmtcl_BitBltConv_CASE (true , INT16 , 9, i16) @@ -154,10 +156,15 @@ void BitBltConv::bitblt_int_to_int_avx2_switch (fmtcl::SplFmt dst_fmt, int dst_r switch ((dst_fmt << 20) + (src_fmt << 16) + (dst_res << 8) + src_res) { + fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 14, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 12, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 10, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT16 , 16, 9, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT8 , 16, 8, i16, i08) + fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 12, i16, i16) + fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 10, i16, i16) + fmtcl_BitBltConv_CASE (INT16 , INT16 , 14, 9, i16, i16) + fmtcl_BitBltConv_CASE (INT16 , INT8 , 14, 8, i16, i08) fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 10, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT16 , 12, 9, i16, i16) fmtcl_BitBltConv_CASE (INT16 , INT8 , 12, 8, i16, i08) diff --git a/src/fmtcl/KernelData.cpp b/src/fmtcl/KernelData.cpp index be4fc13..bf6a83e 100644 --- a/src/fmtcl/KernelData.cpp +++ b/src/fmtcl/KernelData.cpp @@ -46,6 +46,7 @@ To Public License, Version 2, as published by Sam Hocevar. 
See #include "fstb/def.h" #include "fstb/fnc.h" +#include #include #include @@ -296,6 +297,7 @@ void KernelData::invert_kernel (int taps) assert (ovr_f * support >= taps); int len = fstb::ceil_int (ovr_s * ovr_f * support) * 2; len = 1 << (fstb::get_prev_pow_2 (len - 1) + 1); // Next power of 2 + len = std::max (len, 1); // Shouldn't happen but GCC emits a warning later const int h_len = len / 2; std::vector x (len); diff --git a/src/fmtcl/PrimUtil.cpp b/src/fmtcl/PrimUtil.cpp index 88ab707..30c6b0a 100644 --- a/src/fmtcl/PrimUtil.cpp +++ b/src/fmtcl/PrimUtil.cpp @@ -45,13 +45,20 @@ constexpr int PrimUtil::_nbr_planes; -Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d) +// conv_flag indicates we want a full conversion, not a chromatic adatpation +Mat3 PrimUtil::compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag) { assert (prim_s.is_ready ()); assert (prim_d.is_ready ()); const Mat3 rgb2xyz = compute_rgb2xyz (prim_s); const Mat3 xyz2rgb = compute_rgb2xyz (prim_d).invert (); + + if (conv_flag) + { + return xyz2rgb * rgb2xyz; + } + const Mat3 adapt = compute_chroma_adapt (prim_s, prim_d); return xyz2rgb * adapt * rgb2xyz; diff --git a/src/fmtcl/PrimUtil.h b/src/fmtcl/PrimUtil.h index 72cf11d..2cbf773 100644 --- a/src/fmtcl/PrimUtil.h +++ b/src/fmtcl/PrimUtil.h @@ -44,7 +44,7 @@ class PrimUtil static constexpr int _nbr_planes = RgbSystem::_nbr_planes; - static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d); + static Mat3 compute_conversion_matrix (const RgbSystem &prim_s, const RgbSystem &prim_d, bool conv_flag); static Mat3 compute_rgb2xyz (const RgbSystem &prim); static Mat3 compute_chroma_adapt (const RgbSystem &prim_s, const RgbSystem &prim_d); static Vec3 conv_xy_to_xyz (const RgbSystem::Vec2 &xy); diff --git a/src/fstb/Hash.h b/src/fstb/Hash.h index b18d527..ed281d8 100644 --- a/src/fstb/Hash.h +++ b/src/fstb/Hash.h @@ -45,6 +45,7 @@ To Public License, Version 2, as published by Sam Hocevar. 
See /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ #include "fstb/def.h" +#include "fstb/Vu32.h" #include @@ -64,8 +65,12 @@ class Hash static fstb_FORCEINLINE constexpr uint32_t hash (uint32_t x) noexcept; + static fstb_FORCEINLINE Vu32 + hash (Vu32 x) noexcept; static fstb_FORCEINLINE constexpr uint32_t hash_inv (uint32_t x) noexcept; + static fstb_FORCEINLINE Vu32 + hash_inv (Vu32 x) noexcept; static fstb_FORCEINLINE constexpr uint64_t hash (uint64_t x) noexcept; diff --git a/src/fstb/Hash.hpp b/src/fstb/Hash.hpp index 3a4a3e7..c6644fc 100644 --- a/src/fstb/Hash.hpp +++ b/src/fstb/Hash.hpp @@ -122,6 +122,19 @@ constexpr uint32_t Hash::hash (uint32_t x) noexcept +Vu32 Hash::hash (Vu32 x) noexcept +{ + x ^= x >> 16; + x *= uint32_t (0x7FEB352Dlu); + x ^= x >> 15; + x *= uint32_t (0x846CA68Blu); + x ^= x >> 16; + + return x; +} + + + constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept { #if 0 @@ -143,6 +156,19 @@ constexpr uint32_t Hash::hash_inv (uint32_t x) noexcept +Vu32 Hash::hash_inv (Vu32 x) noexcept +{ + x ^= x >> 16; + x *= uint32_t (0x43021123lu); + x ^= x >> 15 ^ x >> 30; + x *= uint32_t (0x1D69E2A5lu); + x ^= x >> 16; + + return x; +} + + + // SplittableRandom / SplitMix64 constexpr uint64_t Hash::hash (uint64_t x) noexcept { diff --git a/src/fstb/Vf32.h b/src/fstb/Vf32.h new file mode 100644 index 0000000..1a7203d --- /dev/null +++ b/src/fstb/Vf32.h @@ -0,0 +1,356 @@ +/***************************************************************************** + + Vf32.h + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#pragma once +#if ! defined (fstb_Vf32_HEADER_INCLUDED) +#define fstb_Vf32_HEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/def.h" + +#if ! defined (fstb_HAS_SIMD) + #include +#elif (fstb_ARCHI == fstb_ARCHI_X86) + #include +#elif (fstb_ARCHI == fstb_ARCHI_ARM) + #include +#else + #error +#endif + +#include + +#include + + + +namespace fstb +{ + + + +#if ! 
defined (fstb_HAS_SIMD) + +typedef std::array Vf32Native; + +#elif fstb_ARCHI == fstb_ARCHI_X86 + +typedef __m128 Vf32Native; + +#elif fstb_ARCHI == fstb_ARCHI_ARM + +typedef float32x4_t Vf32Native; + +#else // fstb_ARCHI +#error +#endif // fstb_ARCHI + + + +class Vf32 +{ + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +public: + + static constexpr int _len_l2 = 2; + static constexpr int _length = 1 << _len_l2; + typedef float Scalar; + + Vf32 () = default; + fstb_FORCEINLINE + Vf32 (Vf32Native a) noexcept : _x { a } {} + explicit fstb_FORCEINLINE + Vf32 (Scalar a) noexcept; + explicit fstb_FORCEINLINE + Vf32 (double a) noexcept; + explicit fstb_FORCEINLINE + Vf32 (int a) noexcept; + explicit fstb_FORCEINLINE + Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept; + explicit fstb_FORCEINLINE + Vf32 (const std::tuple &a) noexcept; + Vf32 (const Vf32 &other) = default; + Vf32 (Vf32 &&other) = default; + ~Vf32 () = default; + Vf32 & operator = (const Vf32 &other) = default; + Vf32 & operator = (Vf32 &&other) = default; + + template + fstb_FORCEINLINE void + store (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + store_part (MEM *ptr, int n) const noexcept; + template + fstb_FORCEINLINE void + storeu (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu_part (MEM *ptr, int n) const noexcept; + template + fstb_FORCEINLINE void + storeu_pair (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu_scalar (MEM *ptr) const noexcept; + + fstb_FORCEINLINE + operator Vf32Native () const noexcept { return _x; } + fstb_FORCEINLINE explicit + operator bool () const noexcept; + + fstb_FORCEINLINE Vf32 & + operator += (const Vf32Native &other) noexcept; + fstb_FORCEINLINE Vf32 & + operator -= (const Vf32Native &other) noexcept; + fstb_FORCEINLINE Vf32 & + operator *= (const Vf32Native &other) noexcept; + fstb_FORCEINLINE Vf32 & + operator /= (const Vf32Native &other) noexcept; + + fstb_FORCEINLINE Vf32 & + operator &= (const Vf32Native &other) noexcept; + fstb_FORCEINLINE Vf32 & + operator |= (const Vf32Native &other) noexcept; + fstb_FORCEINLINE Vf32 & + operator ^= (const Vf32Native &other) noexcept; + + fstb_FORCEINLINE Vf32 & + mac (Vf32 a, Vf32 b) noexcept; + fstb_FORCEINLINE Vf32 & + msu (Vf32 a, Vf32 b) noexcept; + + fstb_FORCEINLINE Vf32 + operator - () const noexcept; + fstb_FORCEINLINE Vf32 + reverse () const noexcept; + fstb_FORCEINLINE Vf32 + swap_pairs () const noexcept; + fstb_FORCEINLINE Vf32 + monofy_pairs_lo () const noexcept; + fstb_FORCEINLINE Vf32 + monofy_pairs_hi () const noexcept; + + fstb_FORCEINLINE Vf32 + butterfly_w64 () const noexcept; + fstb_FORCEINLINE Vf32 + butterfly_w32 () const noexcept; + + template + fstb_FORCEINLINE Vf32 + rotate () const noexcept; + template + fstb_FORCEINLINE float + extract () const noexcept; + template + fstb_FORCEINLINE Vf32 + insert (float val) const noexcept; + template + fstb_FORCEINLINE Vf32 + spread () const noexcept; + + fstb_FORCEINLINE Vf32 + round () const noexcept; + fstb_FORCEINLINE Vf32 + rcp_approx () const noexcept; + fstb_FORCEINLINE Vf32 + rcp_approx2 () const noexcept; + fstb_FORCEINLINE Vf32 + div_approx (const Vf32 &d) const noexcept; + fstb_FORCEINLINE Vf32 + sqrt_approx () const noexcept; + fstb_FORCEINLINE Vf32 + rsqrt () const noexcept; + fstb_FORCEINLINE Vf32 + rsqrt_approx () const noexcept; + template + fstb_FORCEINLINE Vf32 + log2_base (P poly) const noexcept; + template + fstb_FORCEINLINE Vf32 + exp2_base (P poly) const noexcept; + 
fstb_FORCEINLINE Vf32 + signbit () const noexcept; + fstb_FORCEINLINE Vf32 + is_lt_0 () const noexcept; + + fstb_FORCEINLINE std::tuple + explode () const noexcept; + fstb_FORCEINLINE std::tuple + extract_pair () const noexcept; + fstb_FORCEINLINE std::tuple + spread_pairs () const noexcept; + + fstb_FORCEINLINE float + sum_h () const noexcept; + fstb_FORCEINLINE float + min_h () const noexcept; + fstb_FORCEINLINE float + max_h () const noexcept; + + fstb_FORCEINLINE bool + and_h () const noexcept; + fstb_FORCEINLINE bool + or_h () const noexcept; + fstb_FORCEINLINE unsigned int + movemask () const noexcept; + + static fstb_FORCEINLINE Vf32 + zero () noexcept; + static fstb_FORCEINLINE Vf32 + all1 () noexcept; + static fstb_FORCEINLINE Vf32 + set_pair (float a0, float a1) noexcept; + static fstb_FORCEINLINE Vf32 + set_pair_fill (float a02, float a13) noexcept; + static fstb_FORCEINLINE Vf32 + set_pair_dbl (float a01, float a23) noexcept; + static fstb_FORCEINLINE Vf32 + set_mask (bool m0, bool m1, bool m2, bool m3) noexcept; + static fstb_FORCEINLINE Vf32Native + signbit_mask () noexcept; + static fstb_FORCEINLINE Vf32 + interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept; + static fstb_FORCEINLINE Vf32 + interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept; + static fstb_FORCEINLINE std::tuple + interleave (Vf32 p0, Vf32 p1) noexcept; + static fstb_FORCEINLINE std::tuple + deinterleave (Vf32 i0, Vf32 i1) noexcept; + static fstb_FORCEINLINE Vf32 + deinterleave_lo (Vf32 i0, Vf32 i1) noexcept; + static fstb_FORCEINLINE Vf32 + deinterleave_hi (Vf32 i0, Vf32 i1) noexcept; + template + static fstb_FORCEINLINE Vf32 + compose (Vf32 a, Vf32 b) noexcept; + + template + static fstb_FORCEINLINE Vf32 + load (const MEM *ptr) noexcept; + template + static fstb_FORCEINLINE Vf32 + loadu (const MEM *ptr) noexcept; + template + static fstb_FORCEINLINE Vf32 + loadu_part (const MEM *ptr, int n) noexcept; + template + static fstb_FORCEINLINE Vf32 + loadu_pair (const MEM *ptr) noexcept; + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +protected: + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + + static constexpr int32_t _sign32 = INT32_MIN; + + template + fstb_FORCEINLINE void + storeu_part_n13 (MEM *ptr, int n) const noexcept; + +#if ! 
defined (fstb_HAS_SIMD) +public: + union Combo + { + Vf32Native _vf32; + int32_t _s32 [_length]; + uint32_t _u32 [_length]; + }; + static_assert ( + sizeof (Combo) == sizeof (Vf32Native), + "Wrong size for the wrapping combo structure" + ); +#endif + Vf32Native _x; +private: + + + +/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + +}; // class Vf32 + +static_assert ( + sizeof (Vf32) == sizeof (Vf32Native), + "Wrong size for the wrapping structure" +); + + + +/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +fstb_FORCEINLINE Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept; + +fstb_FORCEINLINE Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept; + +fstb_FORCEINLINE Vf32 abs (const Vf32 &v) noexcept; +fstb_FORCEINLINE Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept; +fstb_FORCEINLINE Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept; +fstb_FORCEINLINE Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept; +fstb_FORCEINLINE Vf32 round (const Vf32 &v) noexcept; +fstb_FORCEINLINE Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept; +fstb_FORCEINLINE Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept; +fstb_FORCEINLINE Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept; +fstb_FORCEINLINE std::tuple swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept; +fstb_FORCEINLINE Vf32 sqrt (Vf32 v) noexcept; +fstb_FORCEINLINE Vf32 log2 (Vf32 v) noexcept; +fstb_FORCEINLINE Vf32 exp2 (Vf32 v) noexcept; + + + +} // namespace fstb + + + +#include "fstb/Vf32.hpp" + + + +#endif // fstb_Vf32_HEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/Vf32.hpp b/src/fstb/Vf32.hpp new file mode 100644 index 0000000..385cb0b --- /dev/null +++ b/src/fstb/Vf32.hpp @@ -0,0 +1,2181 @@ +/***************************************************************************** + + Vf32.hpp + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#if ! 
defined (fstb_Vf32_CODEHEADER_INCLUDED) +#define fstb_Vf32_CODEHEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/fnc.h" + +#include + +#include +#include +#include + + + +namespace fstb +{ + + + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +// Initialises with a | a | a | a +Vf32::Vf32 (Scalar a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { a, a, a, a } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set1_ps (a) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { vdupq_n_f32 (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Initialises with a | a | a | a +Vf32::Vf32 (double a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set1_ps (Scalar (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { vdupq_n_f32 (Scalar (a)) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Initialises with a | a | a | a +Vf32::Vf32 (int a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { Scalar (a), Scalar (a), Scalar (a), Scalar (a) } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set1_ps (Scalar (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { vdupq_n_f32 (Scalar (a)) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Initialises with a0 | a1 | a2 | a3 +Vf32::Vf32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { a0, a1, a2, a3 } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_ps (a3, a2, a1, a0) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { a0, a1, a2, a3 } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Initialises with a0 | a1 | a2 | a3 +Vf32::Vf32 (const std::tuple &a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_ps (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +template +void Vf32::store (MEM *ptr) const noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_store_ps (reinterpret_cast (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_f32 (reinterpret_cast (ptr), _x); +#endif // fstb_ARCHI +} + + + +// n = number of scalars to store (from the LSB) +template +void Vf32::store_part (MEM *ptr, int n) const noexcept +{ + assert (n > 0); + + if (n >= 4) + { + store (ptr); + } + else + { + storeu_part_n13 (ptr, n); + } +} + + + +template +void Vf32::storeu (MEM *ptr) const noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_storeu_ps (reinterpret_cast (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_u8 (reinterpret_cast (ptr), vreinterpretq_u8_f32 (_x)); +#endif // fstb_ARCHI +} + + + +// n = number of scalars to store (from the LSB) +template +void Vf32::storeu_part (MEM *ptr, int n) const noexcept +{ + assert (n > 0); + + if (n >= 4) + { + storeu (ptr); + return; + } + + storeu_part_n13 (ptr, n); +} + + + +// ptr [0] = v0 +// ptr [1] = v1 +template +void Vf32::storeu_pair (MEM *ptr) const noexcept +{ + assert (ptr != nullptr); + +#if ! 
defined (fstb_HAS_SIMD) + auto p = reinterpret_cast (ptr); + p [0] = _x [0]; + p [1] = _x [1]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_store_ss (reinterpret_cast (ptr) , _x ); + const auto v1 = _mm_shuffle_ps (_x, _x, 1 << 0); + _mm_store_ss (reinterpret_cast (ptr) + 1, v1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1_u8 ( + reinterpret_cast (ptr), + vreinterpret_u8_f32 (vget_low_f32 (_x)) + ); +#endif // fstb_ARCHI +} + + + +// *ptr = v0 +template +void Vf32::storeu_scalar (MEM *ptr) const noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + reinterpret_cast (ptr) [0] = _x [0]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_store_ss (reinterpret_cast (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_lane_f32 (reinterpret_cast (ptr), _x, 0); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depend on the +// implementation). +// For each scalar, true = all bits set, false = all bits cleared +Vf32::operator bool () const noexcept +{ + return and_h (); +} + + + +Vf32 & Vf32::operator += (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] += other [0]; + _x [1] += other [1]; + _x [2] += other [2]; + _x [3] += other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_add_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vaddq_f32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator -= (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] -= other [0]; + _x [1] -= other [1]; + _x [2] -= other [2]; + _x [3] -= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_sub_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vsubq_f32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator *= (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] *= other [0]; + _x [1] *= other [1]; + _x [2] *= other [2]; + _x [3] *= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_mul_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vmulq_f32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator /= (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] /= other [0]; + _x [1] /= other [1]; + _x [2] /= other [2]; + _x [3] /= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_div_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = _x * (Vf32 { other }.rcp_approx2 ())._x; +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator &= (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo al { _x }; + Combo ar { other }; + al._s32 [0] &= ar._s32 [0]; + al._s32 [1] &= ar._s32 [1]; + al._s32 [2] &= ar._s32 [2]; + al._s32 [3] &= ar._s32 [3]; + _x = al._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_and_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vreinterpretq_f32_u32 (vandq_u32 ( + vreinterpretq_u32_f32 (_x), + vreinterpretq_u32_f32 (other) + )); +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator |= (const Vf32Native &other) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + Combo al { _x }; + Combo ar { other }; + al._s32 [0] |= ar._s32 [0]; + al._s32 [1] |= ar._s32 [1]; + al._s32 [2] |= ar._s32 [2]; + al._s32 [3] |= ar._s32 [3]; + _x = al._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_or_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vreinterpretq_f32_u32 (vorrq_u32 ( + vreinterpretq_u32_f32 (_x), + vreinterpretq_u32_f32 (other) + )); +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 & Vf32::operator ^= (const Vf32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo al { _x }; + Combo ar { other }; + al._s32 [0] ^= ar._s32 [0]; + al._s32 [1] ^= ar._s32 [1]; + al._s32 [2] ^= ar._s32 [2]; + al._s32 [3] ^= ar._s32 [3]; + _x = al._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_xor_ps (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vreinterpretq_f32_u32 (veorq_u32 ( + vreinterpretq_u32_f32 (_x), + vreinterpretq_u32_f32 (other) + )); +#endif // fstb_ARCHI + return *this; +} + + + +// *this += a * b +Vf32 & Vf32::mac (Vf32 a, Vf32 b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] += a._x [0] * b._x [0]; + _x [1] += a._x [1] * b._x [1]; + _x [2] += a._x [2] * b._x [2]; + _x [3] += a._x [3] * b._x [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_add_ps (_x, _mm_mul_ps (a, b)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if defined (__ARM_FEATURE_FMA) + _x = vfmaq_f32 (_x, a, b); + #else + _x = vmlaq_f32 (_x, a, b); + #endif +#endif // fstb_ARCHI + return *this; +} + + + +// *this -= a * b +Vf32 & Vf32::msu (Vf32 a, Vf32 b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] -= a._x [0] * b._x [0]; + _x [1] -= a._x [1] * b._x [1]; + _x [2] -= a._x [2] * b._x [2]; + _x [3] -= a._x [3] * b._x [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_sub_ps (_x, _mm_mul_ps (a, b)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if defined (__ARM_FEATURE_FMA) + _x = vfmsq_f32 (_x, a, b); + #else + _x = vmlsq_f32 (_x, a, b); + #endif +#endif // fstb_ARCHI + return *this; +} + + + +Vf32 Vf32::operator - () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + -_x [0], + -_x [1], + -_x [2], + -_x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_xor_ps (_x, signbit_mask ()); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vnegq_f32 (_x); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::reverse () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { _x [3], _x [2], _x [1], _x [0] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_x, _x, (3<<0) + (2<<2) + (1<<4) + (0<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vrev64q_f32 (vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x))); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::swap_pairs () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { _x [2], _x [3], _x [0], _x [1] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t v01 = vget_low_f32 (_x); + const float32x2_t v23 = vget_high_f32 (_x); + return vcombine_f32 (v23, v01); +#endif // fstb_ARCHI +} + + + +// a, b, c, d -> a, a, c, c +Vf32 Vf32::monofy_pairs_lo () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { _x [0], _x [0], _x [2], _x [2] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_x, _x, 0xA0); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vuzpq_f32 (_x, _x).val [0]; +#endif // fstb_ARCHI +} + + + +// a, b, c, d -> b, b, d, d +Vf32 Vf32::monofy_pairs_hi () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vf32 { _x [1], _x [1], _x [3], _x [3] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_x, _x, 0xF5); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vuzpq_f32 (_x, _x).val [1]; +#endif // fstb_ARCHI +} + + + +// a, b, c, d -> a+c, b+d, a-c, b-d +Vf32 Vf32::butterfly_w64 () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + _x [0] + _x [2], + _x [1] + _x [3], + _x [0] - _x [2], + _x [1] - _x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, 0, _sign32, _sign32)); + const auto x0 = _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (0<<4) + (1<<6)); // c, d, a, b + const auto x1 = _mm_xor_ps (_x, sign); // a, b, -c, -d + return x0 + x1; +#elif fstb_ARCHI == fstb_ARCHI_ARM + const auto sign = int32x4_t { 0, 0, _sign32, _sign32 }; + const auto x0 = vcombine_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); // c, d, a, b + const auto x1 = // a, b, -c, -d + vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign)); + return x0 + x1; +#endif +} + + + +// a, b, c, d -> a+b, a-b, c+d, c-d +Vf32 Vf32::butterfly_w32 () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + _x [0] + _x [1], + _x [0] + _x [1], + _x [2] - _x [3], + _x [2] - _x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto sign = _mm_castsi128_ps (_mm_setr_epi32 (0, _sign32, 0, _sign32)); + const auto x0 = _mm_shuffle_ps (_x, _x, (1<<0) + (0<<2) + (3<<4) + (2<<6)); // b, a, d, c + const auto x1 = _mm_xor_ps (_x, sign); // a, -b, c, -d + return x0 + x1; +#elif fstb_ARCHI == fstb_ARCHI_ARM + const auto sign = int32x4_t { 0, _sign32, 0, _sign32 }; + const auto x0 = vrev64q_f32 (_x); // b, a, d, c + const auto x1 = // a, -b, c, -d + vreinterpretq_f32_s32 (veorq_s32 (vreinterpretq_s32_f32 (_x), sign)); + return x0 + x1; +#endif +} + + + +// Positive = to the left, rotates towards the higher indexes +template +Vf32 Vf32::rotate () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + _x [(0 - SHIFT) & 3], + _x [(1 - SHIFT) & 3], + _x [(2 - SHIFT) & 3], + _x [(3 - SHIFT) & 3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (SHIFT & 3) + { + case 1: return _mm_shuffle_ps (_x, _x, (2<<6) | (1<<4) | (0<<2) | (3<<0)); + case 2: return _mm_shuffle_ps (_x, _x, (1<<6) | (0<<4) | (3<<2) | (2<<0)); + case 3: return _mm_shuffle_ps (_x, _x, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + default: return *this; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + int32x4_t aa = vreinterpretq_s32_f32 (_x); + switch (SHIFT & 3) + { + case 1: aa = vextq_s32 (aa, aa, 3); break; + case 2: aa = vextq_s32 (aa, aa, 2); break; + case 3: aa = vextq_s32 (aa, aa, 1); break; + default: /* Nothing */ break; + } + return vreinterpretq_f32_s32 (aa); +#endif // fstb_ARCHI +} + + + +template +float Vf32::extract () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return _x [POS & 3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = _x; + switch (POS & 3) + { + case 1: a = _mm_shuffle_ps (a, a, 1); break; + case 2: a = _mm_shuffle_ps (a, a, 2); break; + case 3: a = _mm_shuffle_ps (a, a, 3); break; + default: /* Nothing */ break; + } + return _mm_cvtss_f32 (a); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vgetq_lane_f32 (_x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vf32 Vf32::insert (float val) const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + auto a = *this; + a._x [POS & 3] = val; + return a; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = rotate <(-POS) & 3> (); + a._x = _mm_move_ss (a._x, _mm_set_ss (val)); + return a.template rotate (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vsetq_lane_f32 (val, _x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vf32 Vf32::spread () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 (extract ()); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_x, _x, 0x55 * (POS & 3)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_f32 (vgetq_lane_f32 (_x, POS & 3)); +#endif // fstb_ARCHI +} + + + +// Assumes "to nearest" rounding mode on x86 +Vf32 Vf32::round () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + roundf (_x [0]), + roundf (_x [1]), + roundf (_x [2]), + roundf (_x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cvtepi32_ps (_mm_cvtps_epi32 (_x)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const auto zero = vdupq_n_f32 ( 0.0f); + const auto m = vdupq_n_f32 (-0.5f); + const auto p = vdupq_n_f32 (+0.5f); + const auto gt0 = vcgtq_f32 (_x, zero); + const auto u = vbslq_f32 (gt0, p, m); + return vcvtq_f32_s32 (vcvtq_s32_f32 (vaddq_f32 (_x, u))); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::rcp_approx () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + 1.f / _x [0], + 1.f / _x [1], + 1.f / _x [2], + 1.f / _x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_rcp_ps (_x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + auto r = vrecpeq_f32 (_x); + r = vmulq_f32 (vrecpsq_f32 (_x, r), r); + return r; +#endif // fstb_ARCHI +} + + + +// With more accuracy +Vf32 Vf32::rcp_approx2 () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return rcp_approx (); +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto r = _mm_rcp_ps (_x); + r = r * (_mm_set1_ps (2.f) - r * _x); + return r; +#elif fstb_ARCHI == fstb_ARCHI_ARM + auto r = vrecpeq_f32 (_x); + r = vmulq_f32 (vrecpsq_f32 (_x, r), r); + r = vmulq_f32 (vrecpsq_f32 (_x, r), r); + return r; +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::div_approx (const Vf32 &d) const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + _x [0] / d._x [0], + _x [1] / d._x [1], + _x [2] / d._x [2], + _x [3] / d._x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_div_ps (_x, d._x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return _x * d.rcp_approx ()._x; +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::sqrt_approx () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + sqrtf (_x [0]), + sqrtf (_x [1]), + sqrtf (_x [2]), + sqrtf (_x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + // Zero and denormal values will produce INF with _mm_rsqrt_ps(), so + // we need a mask. + const __m128 z_flag = _mm_cmplt_ps (_x, _mm_set1_ps (FLT_MIN)); + const __m128 rsqrt_a = _mm_rsqrt_ps (_x); + const __m128 sqrt_a = _mm_mul_ps (_x, rsqrt_a); + const __m128 sqrt_m = _mm_andnot_ps (z_flag, sqrt_a); + return sqrt_m; +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x4_t nz_flag = vtstq_u32 ( + vreinterpretq_u32_f32 (_x), + vreinterpretq_u32_f32 (_x) + ); + auto rs = vrsqrteq_f32 (_x); + rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs); + const auto sqrt_a = rs * float32x4_t (_x); + return vreinterpretq_f32_u32 (vandq_u32 ( + vreinterpretq_u32_f32 (sqrt_a), + nz_flag + )); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::rsqrt () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vf32 { + 1.f / sqrtf (_x [0]), + 1.f / sqrtf (_x [1]), + 1.f / sqrtf (_x [2]), + 1.f / sqrtf (_x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + __m128 rs = _mm_rsqrt_ps (_x); + rs = _mm_set1_ps (0.5f) * rs * (_mm_set1_ps (3) - __m128 (_x) * rs * rs); + return rs; +#elif fstb_ARCHI == fstb_ARCHI_ARM + auto rs = vrsqrteq_f32 (_x); + rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs); + rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs); + return rs; +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::rsqrt_approx () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + // Ref: + // Jan Kadlec, http://rrrola.wz.cz/inv_sqrt.html, 2010 + const auto xh = (*this) * Vf32 (0.703952253f); + Combo c { _x }; + c._s32 [0] = 0x5F1FFFF9 - (c._s32 [0] >> 1); + c._s32 [1] = 0x5F1FFFF9 - (c._s32 [1] >> 1); + c._s32 [2] = 0x5F1FFFF9 - (c._s32 [2] >> 1); + c._s32 [3] = 0x5F1FFFF9 - (c._s32 [3] >> 1); + auto rs = Vf32 { c._vf32 }; + rs *= Vf32 (1.681914091f) - xh * rs * rs; + return rs; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_rsqrt_ps (_x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + auto rs = vrsqrteq_f32 (_x); + rs *= vrsqrtsq_f32 (rs * float32x4_t (_x), rs); + return rs; +#endif // fstb_ARCHI +} + + + +// poly is a user-provided Vf32 log2 approximation from [1 ; 2[ to [0 ; 1[ +template +Vf32 Vf32::log2_base (P poly) const noexcept +{ + const int32_t log2_sub = 127; + +#if ! defined (fstb_HAS_SIMD) + + assert ( + _x [0] > 0 + && _x [1] > 0 + && _x [2] > 0 + && _x [3] > 0 + ); + Combo c { _x }; + const int x0 = c._s32 [0]; + const int x1 = c._s32 [1]; + const int x2 = c._s32 [2]; + const int x3 = c._s32 [3]; + const Vf32 log2_int { + float (((x0 >> 23) & 255) - log2_sub), + float (((x1 >> 23) & 255) - log2_sub), + float (((x2 >> 23) & 255) - log2_sub), + float (((x3 >> 23) & 255) - log2_sub) + }; + c._s32 [0] = (x0 & ~(255 << 23)) + (127 << 23); + c._s32 [1] = (x1 & ~(255 << 23)) + (127 << 23); + c._s32 [2] = (x2 & ~(255 << 23)) + (127 << 23); + c._s32 [3] = (x3 & ~(255 << 23)) + (127 << 23); + Vf32 part { c._vf32 }; + +#else // fstb_HAS_SIMD + +#if fstb_ARCHI == fstb_ARCHI_X86 + + // Extracts the exponent + __m128i xi = _mm_castps_si128 (_x); + xi = _mm_srli_epi32 (xi, 23); + const __m128i l2_sub = _mm_set1_epi32 (log2_sub); + xi = _mm_sub_epi32 (xi, l2_sub); + const auto log2_int = Vf32 { _mm_cvtepi32_ps (xi) }; + +#elif fstb_ARCHI == fstb_ARCHI_ARM + + int32x4_t xi = vreinterpretq_s32_f32 (_x); + xi = vshrq_n_s32 (xi, 23); + const int32x4_t l2_sub = vdupq_n_s32 (log2_sub); + xi -= l2_sub; + const auto log2_int = Vf32 { vcvtq_f32_s32 (xi) }; + +#endif // fstb_ARCHI + + // Extracts the multiplicative part in [1 ; 2[ + const auto mask_mantissa = Vf32 (1.17549421e-38f); // Binary: (1 << 23) - 1 + auto part = _x & mask_mantissa; + const auto bias = Vf32 (1.0f); // Binary: 127 << 23 + part |= bias; + +#endif // fstb_HAS_SIMD + + // Computes the log2 approximation [1 ; 2[ -> [0 ; 1[ + part = poly (part); + + // Sums the components + const auto total = log2_int + part; + + return total; +} + + + +// poly is a user-provided Vf32 exp2 approximation from [0 ; 1[ to [1 ; 2[ +template +Vf32 Vf32::exp2_base (P poly) const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + + const int32_t tx0 = floor_int (_x [0]); + const int32_t tx1 = floor_int (_x [1]); + const int32_t tx2 = floor_int (_x [2]); + const int32_t tx3 = floor_int (_x [3]); + const Vf32 frac { + _x [0] - static_cast (tx0), + _x [1] - static_cast (tx1), + _x [2] - static_cast (tx2), + _x [3] - static_cast (tx3) + }; + + Combo combo { poly (frac) }; + + combo._s32 [0] += tx0 << 23; + combo._s32 [1] += tx1 << 23; + combo._s32 [2] += tx2 << 23; + combo._s32 [3] += tx3 << 23; + assert ( + combo._vf32 [0] >= 0 + && combo._vf32 [1] >= 0 + && combo._vf32 [2] >= 0 + && combo._vf32 [3] >= 0 + ); + return combo._vf32; + +#else // fstb_HAS_SIMD + + // Separates the integer and fractional parts +# if fstb_ARCHI == fstb_ARCHI_X86 + const auto round_toward_m_i = _mm_set1_ps (-0.5f); + auto xi = _mm_cvtps_epi32 (_mm_add_ps (_x, round_toward_m_i)); + const auto val_floor = Vf32 { _mm_cvtepi32_ps (xi) }; +# elif fstb_ARCHI == fstb_ARCHI_ARM + const int round_ofs = 256; + int32x4_t xi = vcvtq_s32_f32 (_x + vdupq_n_f32 (float (round_ofs))); + xi -= vdupq_n_s32 (round_ofs); + const auto val_floor = Vf32 { vcvtq_f32_s32 (xi) }; +# endif // fstb_ARCHI + + auto frac = *this - val_floor; + + // Computes the exp2 approximation [0 ; 1] -> [1 ; 2] + frac = poly (frac); + + // Integer part +# if fstb_ARCHI == fstb_ARCHI_X86 + xi = _mm_slli_epi32 (xi, 23); + xi = _mm_add_epi32 (xi, _mm_castps_si128 (frac)); + return _mm_castsi128_ps (xi); +# elif fstb_ARCHI == fstb_ARCHI_ARM + xi = vshlq_n_s32 (xi, 23); + xi = xi + vreinterpretq_s32_f32 (frac); + return vreinterpretq_f32_s32 (xi); +# endif // fstb_ARCHI + +#endif // fstb_HAS_SIMD +} + + + +Vf32 Vf32::signbit () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + copysignf (0.f, _x [0]), + copysignf (0.f, _x [1]), + copysignf (0.f, _x [2]), + copysignf (0.f, _x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_and_ps (signbit_mask (), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vandq_u32 ( + vreinterpretq_u32_f32 (_x), + vdupq_n_u32 (0x80000000U) + )); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::is_lt_0 () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo r; + r._s32 [0] = (_x [0] < 0) ? -1 : 0; + r._s32 [1] = (_x [1] < 0) ? -1 : 0; + r._s32 [2] = (_x [2] < 0) ? -1 : 0; + r._s32 [3] = (_x [3] < 0) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_castsi128_ps (_mm_srai_epi32 (_mm_castps_si128 (_x), 31)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_s32 (vshrq_n_s32 (vreinterpretq_s32_f32 (_x), 31)); +#endif // fstb_ARCHI +} + + + +std::tuple Vf32::explode () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::make_tuple (_x [0], _x [1], _x [2], _x [3]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto tmp = _mm_movehl_ps (_x, _x); + return std::make_tuple ( + _mm_cvtss_f32 (_x), + _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, (1<<0))), + _mm_cvtss_f32 (tmp), + _mm_cvtss_f32 (_mm_shuffle_ps (tmp, tmp, (1<<0))) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return std::make_tuple ( + vgetq_lane_f32 (_x, 0), + vgetq_lane_f32 (_x, 1), + vgetq_lane_f32 (_x, 2), + vgetq_lane_f32 (_x, 3) + ); +#endif // fstb_ARCHI +} + + + +std::tuple Vf32::extract_pair () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return std::make_tuple (_x [0], _x [1]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return std::make_tuple ( + _mm_cvtss_f32 (_x), + _mm_cvtss_f32 (_mm_shuffle_ps (_x, _x, 1)) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return std::make_tuple (vgetq_lane_f32 (_x, 0), vgetq_lane_f32 (_x, 1)); +#endif // fstb_ARCHI +} + + + +// <0> = v0 | v1 | v0 | v1 +// <1> = v2 | v3 | v2 | v3 +std::tuple Vf32::spread_pairs () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::make_tuple ( + Vf32 { _x [0], _x [1], _x [0], _x [1] }, + Vf32 { _x [2], _x [3], _x [2], _x [3] } + ); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return std::make_tuple ( + Vf32 { _mm_shuffle_ps (_x, _x, (0<<0) + (1<<2) + (0<<4) + (1<<6)) }, + Vf32 { _mm_shuffle_ps (_x, _x, (2<<0) + (3<<2) + (2<<4) + (3<<6)) } + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t v01 = vget_low_f32 (_x); + const float32x2_t v23 = vget_high_f32 (_x); + return std::make_tuple ( + Vf32 { vcombine_f32 (v01, v01) }, + Vf32 { vcombine_f32 (v23, v23) } + ); +#endif // fstb_ARCHI +} + + + +float Vf32::sum_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return (_x [0] + _x [2]) + (_x [1] + _x [3]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + // s = v3,v2,v1,v0 + const auto s = _mm_shuffle_ps (_x, _x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)); + const auto v = _mm_add_ps (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0 + return _mm_cvtss_f32 (_mm_add_ss (v, _mm_movehl_ps (s, v))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if fstb_WORD_SIZE == 64 + return vaddvq_f32 (_x); + #else + float32x2_t v2 = vadd_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); + return vget_lane_f32 (vpadd_f32 (v2, v2), 0); + #endif +#endif // fstb_ARCHI +} + + + +float Vf32::min_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v = _mm_min_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2)); + return _mm_cvtss_f32 (_mm_min_ss (v, _mm_shuffle_ps (v, v, 1))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v2 = vmin_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); + return vget_lane_f32 (vpmin_f32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +float Vf32::max_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v = _mm_max_ps (_x, _mm_shuffle_ps (_x, _x, (3 << 2) | 2)); + return _mm_cvtss_f32 (_mm_max_ss (v, _mm_shuffle_ps (v, v, 1))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v2 = vmax_f32 (vget_high_f32 (_x), vget_low_f32 (_x)); + return vget_lane_f32 (vpmax_f32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). +// For each scalar, true = all bits set, false = all bits cleared +bool Vf32::and_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const Combo c { _x }; + const int32_t t = (c._s32 [0] & c._s32 [1]) & (c._s32 [2] & c._s32 [3]); + return (t == -1); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_ps (_x) == 15); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 ( + vqmovn_u32 (vreinterpretq_u32_f32 (_x)) + ); + return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU + && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). 
+// For each scalar, true = all bits set, false = all bits cleared +bool Vf32::or_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo c; + c._vf32 = _x; + const int32_t t = (c._s32 [0] | c._s32 [1]) | (c._s32 [2] | c._s32 [3]); + return (t != 0); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_ps (_x) != 0); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 ( + vqmovn_u32 (vreinterpretq_u32_f32 (_x)) + ); + return ( vget_lane_u32 (tmp, 0) != 0 + || vget_lane_u32 (tmp, 1) != 0); +#endif // fstb_ARCHI +} + + + +// Moves the boolean content of each 4 scalar into the lower 4 bits of the +// return value. +// Assumes the object is a result of a comparison, with all bits the same +// in each 32-bit element. +unsigned int Vf32::movemask () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo c; + c._vf32 = _x; + return + (c._u32 [0] >> 31) + | ((c._u32 [1] >> 30) & 2) + | ((c._u32 [2] >> 29) & 4) + | ((c._u32 [3] >> 28) & 8); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return static_cast (_mm_movemask_ps (_x)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + uint64x2_t tmp1 = + vreinterpretq_u64_f32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa + tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba + uint64x1_t tmp2 = vsli_n_u64 ( + vget_high_u64 (tmp1), + vget_low_u64 (tmp1), + 2 + ); + return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF; +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::zero () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { 0, 0, 0, 0 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_setzero_ps (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_f32 (0); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::all1 () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo c; + c._s32 [0] = -1; + c._s32 [1] = -1; + c._s32 [2] = -1; + c._s32 [3] = -1; + return Vf32 { c._vf32 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_castsi128_ps (_mm_set1_epi32 (-1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_s32 (vdupq_n_s32 (-1)); +#endif // fstb_ARCHI +} + + + +// Returns a0 | a1 | ? | ? +Vf32 Vf32::set_pair (float a0, float a1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { a0, a1, 0, 0 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_unpacklo_ps (_mm_set_ss (a0), _mm_set_ss (a1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vsetq_lane_f32 (a1, vdupq_n_f32 (a0), 1); +#endif // fstb_ARCHI +} + + + +// Returns a02 | a13 | a02 | a13 +Vf32 Vf32::set_pair_fill (float a02, float a13) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { a02, a13, a02, a13 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_unpacklo_ps (_mm_set1_ps (a02), _mm_set1_ps (a13)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t v01 = vset_lane_f32 (a13, vdup_n_f32 (a02), 1); + return vcombine_f32 (v01, v01); +#endif // fstb_ARCHI +} + + + +// Returns a01 | a01 | a23 | a23 +Vf32 Vf32::set_pair_dbl (float a01, float a23) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { a01, a01, a23, a23 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (_mm_set_ss (a01), _mm_set_ss (a23), 0x00); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vcombine_f32 (vdup_n_f32 (a01), vdup_n_f32 (a23)); +#endif // fstb_ARCHI +} + + + +// "true" must be 1 and nothing else. +Vf32 Vf32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + Combo c; + c._s32 [0] = -int32_t (m0); + c._s32 [1] = -int32_t (m1); + c._s32 [2] = -int32_t (m2); + c._s32 [3] = -int32_t (m3); + return c._vf32; +#elif 1 // Fast version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_castsi128_ps (_mm_sub_epi32 ( + _mm_setzero_si128 (), + _mm_set_epi32 (m3, m2, m1, m0) + )); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (m0); + float32x2_t v23 = vdup_n_f32 (m2); + v01 = vset_lane_f32 (m1, v01, 1); + v23 = vset_lane_f32 (m3, v23, 1); + return vreinterpretq_f32_s32 (vnegq_s32 (vreinterpretq_s32_f32 ( + vcombine_f32 (v01, v23) + ))); +# endif // fstb_ARCHI +#else // Safer but slower version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_castsi128_ps (_mm_sub_epi32 ( + _mm_set_epi32 (!m3, !m2, !m1, !m0), + _mm_set1_epi32 (1) + )); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (!m0); + float32x2_t v23 = vdup_n_f32 (!m2); + v01 = vset_lane_f32 (!m1, v01, 1); + v23 = vset_lane_f32 (!m3, v23, 1); + const auto one = vdupq_n_s32 (1); + return vreinterpretq_f32_s32 (vsubq_s32 ( + vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)), + one + )); +# endif // fstb_ARCHI +#endif // Versions +} + + + +Vf32Native Vf32::signbit_mask () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Combo c; + c._u32 [0] = 0x80000000U; + c._u32 [1] = 0x80000000U; + c._u32 [2] = 0x80000000U; + c._u32 [3] = 0x80000000U; + return c._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 +// return _mm_set1_ps (-0.f); + return _mm_castsi128_ps (_mm_set1_epi32 (0x80000000)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vdupq_n_u32 (0x80000000U)); +#endif // fstb_ARCHI +} + + + +// returns { p0 [0 1], p1 [0 1] } +Vf32 Vf32::interleave_pair_lo (Vf32 p0, Vf32 p1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { p0._x [0], p0._x [1], p1._x [0], p1._x [1] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (p0._x, p1._x, (0<<0) + (1<<2) + (0<<4) + (1<<6)); + // return _mm_movelh_ps (p0, p1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t p0x = vget_low_f32 (p0._x); + const float32x2_t p1x = vget_low_f32 (p1._x); + return vcombine_f32 (p0x, p1x); +#endif // fstb_ARCHI +} + + + +// returns { p0 [2 3], p1 [2 3] } +Vf32 Vf32::interleave_pair_hi (Vf32 p0, Vf32 p1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { p0._x [2], p0._x [3], p1._x [2], p1._x [3] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (p0._x, p1._x, (2<<0) + (3<<2) + (2<<4) + (3<<6)); + // return _mm_movehl_ps (p1, p0); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t p0x = vget_high_f32 (p0._x); + const float32x2_t p1x = vget_high_f32 (p1._x); + return vcombine_f32 (p0x, p1x); +#endif // fstb_ARCHI +} + + + +std::tuple Vf32::interleave (Vf32 p0, Vf32 p1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::make_tuple ( + Vf32 { + p0._x [0], + p1._x [0], + p0._x [1], + p1._x [1] + }, Vf32 { + p0._x [2], + p1._x [2], + p0._x [3], + p1._x [3] + } + ); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return std::make_tuple ( + Vf32 { _mm_unpacklo_ps (p0._x, p1._x) }, + Vf32 { _mm_unpackhi_ps (p0._x, p1._x) } + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x4x2_t tmp = vzipq_f32 (p0._x, p1._x); + return std::make_tuple ( + Vf32 { tmp.val [0] }, + Vf32 { tmp.val [1] } + ); +#endif // fstb_ARCHI +} + + + +std::tuple Vf32::deinterleave (Vf32 i0, Vf32 i1) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return std::make_tuple ( + Vf32 { + i0._x [0], + i0._x [2], + i1._x [0], + i1._x [2] + }, Vf32 { + i0._x [1], + i0._x [3], + i1._x [1], + i1._x [3] + } + ); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return std::make_tuple ( + Vf32 { _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6)) }, + Vf32 { _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6)) } + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x4x2_t tmp = vuzpq_f32 (i0._x, i1._x); + return std::make_tuple ( + Vf32 { tmp.val [0] }, + Vf32 { tmp.val [1] } + ); +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::deinterleave_lo (Vf32 i0, Vf32 i1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { i0._x [0], i0._x [2], i1._x [0], i1._x [2] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (i0._x, i1._x, (0<<0) | (2<<2) | (0<<4) | (2<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vuzpq_f32 (i0._x, i1._x).val [0]; +#endif // fstb_ARCHI +} + + + +Vf32 Vf32::deinterleave_hi (Vf32 i0, Vf32 i1) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { i0._x [1], i0._x [3], i1._x [1], i1._x [3] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_ps (i0._x, i1._x, (1<<0) | (3<<2) | (1<<4) | (3<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vuzpq_f32 (i0._x, i1._x).val [1]; +#endif // fstb_ARCHI +} + + + +// Extracts the vector at the position POS from the double-width vector {a b} +// Concatenates a [POS...3] with b [0...3-POS] +template +Vf32 Vf32::compose (Vf32 a, Vf32 b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + switch (POS & 3) + { + case 1: return Vf32 { a._x [1], a._x [2], a._x [3], b._x [0] }; + case 2: return Vf32 { a._x [2], a._x [3], b._x [0], b._x [1] }; + case 3: return Vf32 { a._x [3], b._x [0], b._x [1], b._x [2] }; + default: return a; + } +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (POS & 3) + { + case 1: + { + const auto tmp = _mm_move_ss (a._x, b._x); + return _mm_shuffle_ps (tmp, tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + } + case 2: + return _mm_shuffle_ps (a._x, b._x, (1<<6) | (0<<4) | (3<<2) | (2<<0)); + case 3: + return _mm_move_ss ( + _mm_shuffle_ps (b._x, b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)), + _mm_shuffle_ps (a._x, a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)) + ); + default: + return a; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + if (POS == 0) + { + return a; + } + else + { + const auto aa = vreinterpretq_s32_f32 (a._x); + const auto bb = vreinterpretq_s32_f32 (b._x); + return vreinterpretq_f32_s32 (vextq_s32 (aa, bb, POS)); + } +#endif // fstb_ARCHI +} + + + +template +Vf32 Vf32::load (const MEM *ptr) noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_load_ps (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vld1q_f32 (reinterpret_cast (ptr)); +#endif // fstb_ARCHI +} + + + +template +Vf32 Vf32::loadu (const MEM *ptr) noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_loadu_ps (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u8 ( + vld1q_u8 (reinterpret_cast (ptr)) + ); +#endif // fstb_ARCHI +} + + + +template +Vf32 Vf32::loadu_part (const MEM *ptr, int n) noexcept +{ + assert (n > 0); + + if (n >= 4) + { + return loadu (ptr); + } + + const float * f_ptr = reinterpret_cast (ptr); +#if ! 
defined (fstb_HAS_SIMD) + Vf32 v; + v._x [0] = f_ptr [0]; + for (int i = 1; i < n; ++i) + { + v._x [i] = f_ptr [i]; + } + return v; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (n) + { + case 1: + return _mm_load_ss (f_ptr); + case 2: +# if 1 + return _mm_castsi128_ps (_mm_loadl_epi64 ( + reinterpret_cast (ptr) + )); +# else // Higher latency from Skylake + return _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1)); +# endif + case 3: + return _mm_shuffle_ps ( +# if 1 + _mm_castsi128_ps (_mm_loadl_epi64 ( + reinterpret_cast (ptr) + )), +# else // Higher latency from Skylake + _mm_unpacklo_ps (_mm_load_ss (f_ptr), _mm_load_ss (f_ptr + 1)), +# endif + _mm_load_ss (f_ptr + 2), + (0<<0) + (1<<2) + (2<<4) + ); + default: + // Keeps the compiler happy with (un)initialisation + return loadu (ptr); + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + auto v = vmovq_n_f32 (f_ptr [0]); + if (n >= 2) + { + v = vld1q_lane_f32 (f_ptr + 1, v, 1); + if (n >= 3) + { + v = vld1q_lane_f32 (f_ptr + 2, v, 2); + } + } + return v; +#endif // fstb_ARCHI +} + + + +// Returns: ptr [0] | ptr [1] | ? | ? +template +Vf32 Vf32::loadu_pair (const MEM *ptr) noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + auto p = reinterpret_cast (ptr); + return Vf32 { p [0], p [1], 0, 0 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 +# if 1 + return _mm_castsi128_ps (_mm_loadl_epi64 ( + reinterpret_cast (ptr) + )); +# else // Higher latency from Skylake + const auto x0 = _mm_load_ss (reinterpret_cast (ptr) ); + const auto x1 = _mm_load_ss (reinterpret_cast (ptr) + 1); + return _mm_unpacklo_ps (x0, x1); +# endif +#elif fstb_ARCHI == fstb_ARCHI_ARM + const float32x2_t x = vreinterpret_f32_u8 ( + vld1_u8 (reinterpret_cast (ptr)) + ); + return vcombine_f32 (x, x); +#endif // fstb_ARCHI +} + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +// n = number of scalars to store (from the LSB) +template +void Vf32::storeu_part_n13 (MEM *ptr, int n) const noexcept +{ + assert (n > 0); + assert (n < 4); + + float * f_ptr = reinterpret_cast (ptr); + +#if ! defined (fstb_HAS_SIMD) + + for (int i = 0; i < n; ++i) + { + f_ptr [i] = _x [i]; + } + +#elif fstb_ARCHI == fstb_ARCHI_X86 + + _mm_store_ss (f_ptr, _x); + if (n >= 2) + { + _mm_store_ss (f_ptr + 1, _mm_shuffle_ps (_x, _x, 1 << 0)); + if (n >= 3) + { + _mm_store_ss (f_ptr + 2, _mm_movehl_ps (_x, _x)); + } + } + +#elif fstb_ARCHI == fstb_ARCHI_ARM + + vst1q_lane_f32 (f_ptr + 0, _x, 0); + if (n >= 2) + { + vst1q_lane_f32 (f_ptr + 1, _x, 1); + if (n >= 3) + { + vst1q_lane_f32 (f_ptr + 2, _x, 2); + } + } + +#endif +} + + + +/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +Vf32 operator + (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs += rhs; + return lhs; +} + +Vf32 operator - (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs -= rhs; + return lhs; +} + +Vf32 operator * (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs *= rhs; + return lhs; +} + +Vf32 operator / (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs /= rhs; + return lhs; +} + +Vf32 operator & (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs &= rhs; + return lhs; +} + +Vf32 operator | (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs |= rhs; + return lhs; +} + +Vf32 operator ^ (Vf32 lhs, const Vf32 &rhs) noexcept +{ + lhs ^= rhs; + return lhs; +} + + + +Vf32 operator == (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] == rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] == rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] == rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] == rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpeq_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vceqq_f32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vf32 operator != (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] != rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] != rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] != rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] != rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpneq_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vmvnq_u32 (vceqq_f32 (lhs, rhs))); +#endif // fstb_ARCHI +} + + + +Vf32 operator < (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] < rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] < rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] < rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] < rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmplt_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vcltq_f32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vf32 operator <= (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] <= rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] <= rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] <= rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] <= rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmple_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vcleq_f32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vf32 operator > (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] > rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] > rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] > rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] > rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpgt_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vcgtq_f32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vf32 operator >= (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + Vf32::Combo r; + r._s32 [0] = (lhs._x [0] >= rhs._x [0]) ? -1 : 0; + r._s32 [1] = (lhs._x [1] >= rhs._x [1]) ? -1 : 0; + r._s32 [2] = (lhs._x [2] >= rhs._x [2]) ? -1 : 0; + r._s32 [3] = (lhs._x [3] >= rhs._x [3]) ? -1 : 0; + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpge_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_f32_u32 (vcgeq_f32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vf32 abs (const Vf32 &v) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vf32 { + fabsf (v._x [0]), + fabsf (v._x [1]), + fabsf (v._x [2]), + fabsf (v._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_andnot_ps (Vf32::signbit_mask (), v); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vabsq_f32 (v); +#endif // fstb_ARCHI +} + + + +// Returns x * a + b +Vf32 fma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + x._x [0] * a._x [0] + b._x [0], + x._x [1] * a._x [1] + b._x [1], + x._x [2] * a._x [2] + b._x [2], + x._x [3] * a._x [3] + b._x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_add_ps (_mm_mul_ps (x, a), b); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if defined (__ARM_FEATURE_FMA) + return vfmaq_f32 (b, x, a); + #else + return vmlaq_f32 (b, x, a); + #endif +#endif // fstb_ARCHI +} + + + +// Returns x * a - b +Vf32 fms (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + x._x [0] * a._x [0] - b._x [0], + x._x [1] * a._x [1] - b._x [1], + x._x [2] * a._x [2] - b._x [2], + x._x [3] * a._x [3] - b._x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_ps (_mm_mul_ps (x, a), b); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if defined (__ARM_FEATURE_FMA) + return -vfmsq_f32 (b, x, a); + #else + return -vmlsq_f32 (b, x, a); + #endif +#endif // fstb_ARCHI +} + + + +// Returns - x * a + b +Vf32 fnma (const Vf32 &x, const Vf32 &a, const Vf32 &b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + b._x [0] - x._x [0] * a._x [0], + b._x [1] - x._x [1] * a._x [1], + b._x [2] - x._x [2] * a._x [2], + b._x [3] - x._x [3] * a._x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_ps (b, _mm_mul_ps (x, a)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if defined (__ARM_FEATURE_FMA) + return vfmsq_f32 (b, x, a); + #else + return vmlsq_f32 (b, x, a); + #endif +#endif // fstb_ARCHI +} + + + +Vf32 round (const Vf32 &v) noexcept +{ + return v.round (); +} + + + +Vf32 min (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + std::min (lhs._x [0], rhs._x [0]), + std::min (lhs._x [1], rhs._x [1]), + std::min (lhs._x [2], rhs._x [2]), + std::min (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_min_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vminq_f32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vf32 max (const Vf32 &lhs, const Vf32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + std::max (lhs._x [0], rhs._x [0]), + std::max (lhs._x [1], rhs._x [1]), + std::max (lhs._x [2], rhs._x [2]), + std::max (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_max_ps (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmaxq_f32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vf32 limit (const Vf32 &v, const Vf32 &mi, const Vf32 &ma) noexcept +{ + return min (max (v, mi), ma); +} + + + +Vf32 select (Vf32 cond, Vf32 v_t, Vf32 v_f) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/ + const Vf32::Combo cc { cond }; + Vf32::Combo ct { v_t }; + Vf32::Combo cf { v_f }; + Vf32::Combo r; + r._s32 [0] = (ct._s32 [0] & cc._s32 [0]) | (cf._s32 [0] & ~cc._s32 [0]); + r._s32 [1] = (ct._s32 [1] & cc._s32 [1]) | (cf._s32 [1] & ~cc._s32 [1]); + r._s32 [2] = (ct._s32 [2] & cc._s32 [2]) | (cf._s32 [2] & ~cc._s32 [2]); + r._s32 [3] = (ct._s32 [3] & cc._s32 [3]) | (cf._s32 [3] & ~cc._s32 [3]); + return r._vf32; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto cond_1 = _mm_and_ps ( cond, v_t); + const auto cond_0 = _mm_andnot_ps (cond, v_f); + return _mm_or_ps (cond_0, cond_1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vbslq_f32 (vreinterpretq_u32_f32 (cond), v_t, v_f); +#endif // fstb_ARCHI +} + + + +std::tuple swap_if (Vf32 cond, Vf32 lhs, Vf32 rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const Vf32::Combo cc { cond }; + if (cc._s32 [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); } + if (cc._s32 [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); } + if (cc._s32 [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); } + if (cc._s32 [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); } + return std::make_tuple (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto inv = _mm_and_ps (_mm_xor_ps (lhs, rhs), cond); + return std::make_tuple ( + _mm_xor_ps (lhs, inv), + _mm_xor_ps (rhs, inv) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const auto cu32 = vreinterpretq_u32_f32 (cond); + return std::make_tuple ( + vbslq_f32 (cu32, rhs, lhs), + vbslq_f32 (cu32, lhs, rhs) + ); +#endif // fstb_ARCHI +} + + + +Vf32 sqrt (Vf32 v) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vf32 { + sqrtf (v._x [0]), + sqrtf (v._x [1]), + sqrtf (v._x [2]), + sqrtf (v._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sqrt_ps (v); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x4_t nz_flag = vtstq_u32 ( + vreinterpretq_u32_f32 (v), + vreinterpretq_u32_f32 (v) + ); + float32x4_t rs = vrsqrteq_f32 (v); + rs *= vrsqrtsq_f32 (v, rs * rs); + rs *= vrsqrtsq_f32 (v, rs * rs); + rs *= vrsqrtsq_f32 (v, rs * rs); + const auto sqrt_a = rs * float32x4_t (v); + return vreinterpretq_f32_u32 (vandq_u32 ( + vreinterpretq_u32_f32 (sqrt_a), + nz_flag + )); +#endif // fstb_ARCHI +} + + + +// Formula by 2DaT +// 12-13 ulp +// https://www.kvraudio.com/forum/viewtopic.php?f=33&t=532048 +Vf32 log2 (Vf32 v) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + + assert (v > Vf32 (0)); + /*** To do: actual approximation matching the SIMD formula ***/ + return Vf32 { + logf (v._x [0]) * float (LOG2_E), + logf (v._x [1]) * float (LOG2_E), + logf (v._x [2]) * float (LOG2_E), + logf (v._x [3]) * float (LOG2_E), + }; + +#else // fstb_HAS_SIMD + + // Rational fraction approximating log2 (x) + // [sqrt (0.5) ; sqrt (2)] -> [-0.5 ; 0.5] + // f: x -> (x - 1) * (x^2 + c1*x + c0) / (d2*x^2 + d1*x + d0) + // No analytic continuity on the full range, although this is "almost" C0 + // (good enough for single precision). 
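+ // Illustrative scalar reading of the evaluation below (a minimal sketch,
+ // hypothetical helper name, std::frexp from <cmath>); it makes the
+ // mantissa/exponent split explicit and reuses the same coefficients:
+ //    float log2_scalar (float v)
+ //    {
+ //       int   e = 0;
+ //       float m = std::frexp (v, &e);          // v = m * 2^e, m in [0.5 ; 1)
+ //       if (m < 0.70710678f) { m *= 2; --e; }  // m in [sqrt(0.5) ; sqrt(2))
+ //       const float num =
+ //          (m - 1) * (m * m + 1.929443550e+01f * m + 1.011593342e+01f);
+ //       const float den =
+ //          6.316540241e+00f * m * m + 1.266638851e+01f * m + 2.095932245e+00f;
+ //       return num / den + float (e);          // log2 (m) + e
+ //    }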
+ const auto c0 = Vf32 (1.011593342e+01f); + const auto c1 = Vf32 (1.929443550e+01f); + const auto d0 = Vf32 (2.095932245e+00f); + const auto d1 = Vf32 (1.266638851e+01f); + const auto d2 = Vf32 (6.316540241e+00f); + const auto one = Vf32 (1.0f); + const auto multi = Vf32 (1.41421356237f); + const auto mmask = ~((1 << 23) - 1); + +#if fstb_ARCHI == fstb_ARCHI_X86 + + __m128i x_i = _mm_castps_si128 (v); + __m128i spl_exp = _mm_castps_si128 (v * multi); + spl_exp = _mm_sub_epi32 (spl_exp, _mm_castps_si128 (one)); + spl_exp = _mm_and_si128 (spl_exp, _mm_set1_epi32 (mmask)); + const auto spl_mantissa = + Vf32 { _mm_castsi128_ps (_mm_sub_epi32 (x_i, spl_exp)) }; + spl_exp = _mm_srai_epi32 (spl_exp, 23); + const auto log2_exponent = Vf32 { _mm_cvtepi32_ps (spl_exp) }; + +#elif fstb_ARCHI == fstb_ARCHI_ARM + + const int32x4_t x_i = vreinterpretq_s32_f32 (v); + int32x4_t spl_exp = vreinterpretq_s32_f32 (v * multi); + spl_exp = spl_exp - vreinterpretq_s32_f32 (one); + spl_exp = vandq_s32 (spl_exp, vdupq_n_s32 (mmask)); + const auto spl_mantissa = Vf32 { vreinterpretq_f32_s32 (x_i - spl_exp) }; + spl_exp = vshrq_n_s32 (spl_exp, 23); + const auto log2_exponent = Vf32 { vcvtq_f32_s32 (spl_exp) }; + +#endif // fstb_ARCHI + + auto num = spl_mantissa + c1; + num = fma (num, spl_mantissa, c0); + num = fms (num, spl_mantissa, num); + + auto den = d2; + den = fma (den, spl_mantissa, d1); + den = fma (den, spl_mantissa, d0); + + auto res = num / den; + res += log2_exponent; + + return res; + +#endif // fstb_HAS_SIMD +} + + + +// Formula by 2DaT +// Coefficients fixed by Andrew Simper to achieve true C0 continuity +// 3-4 ulp +// https://www.kvraudio.com/forum/viewtopic.php?p=7161124#p7161124 +// https://www.kvraudio.com/forum/viewtopic.php?p=7677266#p7677266 +Vf32 exp2 (Vf32 v) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + + /*** To do: actual approximation matching the SIMD formula ***/ + return Vf32 { + exp2f (v._x [0]), + exp2f (v._x [1]), + exp2f (v._x [2]), + exp2f (v._x [3]), + }; + +#else // fstb_HAS_SIMD + + // [-0.5, 0.5] 2^x approx polynomial ~ 2.4 ulp + const auto c0 = Vf32 (1.000000088673463); + const auto c1 = Vf32 (0.69314693211407); + const auto c2 = Vf32 (0.24022037362574); + const auto c3 = Vf32 (0.0555072548370); + const auto c4 = Vf32 (0.0096798351988); + const auto c5 = Vf32 (0.0013285658116); + + // Note: the following set of coefficients has a larger error (0.00043 + // cents, maybe 7 ulp?) 
but ensures C2 continuity: + // c0 = 1.000000237 + // c1 = 0.69314655 + // c2 = 0.24021519 + // c3 = 0.05550965 + // c4 = 0.00969821 + // c5 = 0.00132508 + + // i = round (v) + // v = v - i +#if fstb_ARCHI == fstb_ARCHI_X86 + auto i = _mm_cvtps_epi32 (v); + v -= _mm_cvtepi32_ps (i); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const int round_ofs = 256; + const auto r = Vf32 (round_ofs + 0.5f); + auto i = vcvtq_s32_f32 (v + r); + i -= vdupq_n_s32 (round_ofs); + v -= vcvtq_f32_s32 (i); +#endif // fstb_ARCHI + + // Estrin-Horner evaluation scheme + const auto v2 = v * v; + const auto p23 = fma (c3, v, c2); + const auto p01 = fma (c1, v, c0); + auto p = fma (c5, v, c4); + p = fma (p, v2, p23); + p = fma (p, v2, p01); + + // i << 23 + // r = (2^i) * (2^v) + // directly in floating point exponent +#if fstb_ARCHI == fstb_ARCHI_X86 + i = _mm_slli_epi32 (i, 23); + return _mm_castsi128_ps (_mm_add_epi32 (i, _mm_castps_si128 (p))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + i = vshlq_n_s32 (i, 23); + return vreinterpretq_f32_s32 (i + vreinterpretq_s32_f32 (p)); +#endif // fstb_ARCHI + +#endif // fstb_HAS_SIMD +} + + + +} // namespace fstb + + + +#endif // fstb_Vf32_CODEHEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/Vs32.h b/src/fstb/Vs32.h new file mode 100644 index 0000000..234d4c0 --- /dev/null +++ b/src/fstb/Vs32.h @@ -0,0 +1,257 @@ +/***************************************************************************** + + Vs32.h + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#pragma once +#if ! defined (fstb_Vs32_HEADER_INCLUDED) +#define fstb_Vs32_HEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/def.h" + +#if ! defined (fstb_HAS_SIMD) + #include +#elif (fstb_ARCHI == fstb_ARCHI_X86) + #include +#elif (fstb_ARCHI == fstb_ARCHI_ARM) + #include +#else + #error +#endif + +#include + +#include + + + +namespace fstb +{ + + + +#if ! 
defined (fstb_HAS_SIMD) + +typedef std::array Vs32Native; + +#elif fstb_ARCHI == fstb_ARCHI_X86 + +typedef __m128i Vs32Native; + +#elif fstb_ARCHI == fstb_ARCHI_ARM + +typedef int32x4_t Vs32Native; + +#else // fstb_ARCHI +#error +#endif // fstb_ARCHI + + + +class Vs32 +{ + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +public: + + static constexpr int _len_l2 = 2; + static constexpr int _length = 1 << _len_l2; + typedef int32_t Scalar; + + Vs32 () = default; + fstb_FORCEINLINE + Vs32 (Vs32Native a) noexcept : _x { a } {} + explicit fstb_FORCEINLINE + Vs32 (Scalar a) noexcept; + explicit fstb_FORCEINLINE + Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept; + explicit fstb_FORCEINLINE + Vs32 (const std::tuple &a) noexcept; + Vs32 (const Vs32 &other) = default; + Vs32 (Vs32 &&other) = default; + ~Vs32 () = default; + Vs32 & operator = (const Vs32 &other) = default; + Vs32 & operator = (Vs32 &&other) = default; + + template + fstb_FORCEINLINE void + store (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu_part (MEM *ptr, int n) const noexcept; + + fstb_FORCEINLINE + operator Vs32Native () const noexcept { return _x; } + fstb_FORCEINLINE explicit + operator bool () const noexcept; + + fstb_FORCEINLINE Vs32 & + operator += (const Vs32Native &other) noexcept; + fstb_FORCEINLINE Vs32 & + operator -= (const Vs32Native &other) noexcept; + fstb_FORCEINLINE Vs32 & + operator *= (const Vs32Native &other) noexcept; + + fstb_FORCEINLINE Vs32 & + operator &= (const Vs32Native &other) noexcept; + fstb_FORCEINLINE Vs32 & + operator |= (const Vs32Native &other) noexcept; + fstb_FORCEINLINE Vs32 & + operator ^= (const Vs32Native &other) noexcept; + + fstb_FORCEINLINE Vs32 & + operator <<= (int imm) noexcept; + fstb_FORCEINLINE Vs32 & + operator >>= (int imm) noexcept; + + fstb_FORCEINLINE Vs32 + operator - () const noexcept; + fstb_FORCEINLINE Vs32 + operator ~ () const noexcept; + fstb_FORCEINLINE Vs32 + is_lt_0 () const noexcept; + fstb_FORCEINLINE Vs32 + reverse () const noexcept; + + template + fstb_FORCEINLINE Vs32 + rotate () const noexcept; + template + fstb_FORCEINLINE int32_t + extract () const noexcept; + template + fstb_FORCEINLINE Vs32 + insert (int32_t val) const noexcept; + template + fstb_FORCEINLINE Vs32 + spread () const noexcept; + + fstb_FORCEINLINE std::tuple + explode () const noexcept; + + fstb_FORCEINLINE int32_t + sum_h () const noexcept; + fstb_FORCEINLINE int32_t + min_h () const noexcept; + fstb_FORCEINLINE int32_t + max_h () const noexcept; + + fstb_FORCEINLINE bool + and_h () const noexcept; + fstb_FORCEINLINE bool + or_h () const noexcept; + fstb_FORCEINLINE unsigned int + movemask () const noexcept; + fstb_FORCEINLINE int + count_bits () const noexcept; + + static fstb_FORCEINLINE Vs32 + zero () noexcept; + static fstb_FORCEINLINE Vs32 + all1 () noexcept; + static fstb_FORCEINLINE Vs32 + set_mask (bool m0, bool m1, bool m2, bool m3) noexcept; + template + static fstb_FORCEINLINE Vs32 + compose (Vs32 a, Vs32 b) noexcept; + + template + static fstb_FORCEINLINE Vs32 + load (const MEM *ptr) noexcept; + template + static fstb_FORCEINLINE Vs32 + loadu (const MEM *ptr) noexcept; + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +protected: + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + +#if ! 
defined (fstb_HAS_SIMD) +public: +#endif + Vs32Native _x; +private: + + + +/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + +}; // class Vs32 + + + +/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +fstb_FORCEINLINE Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept; + +template +fstb_FORCEINLINE Vs32 operator << (Vs32 lhs, T rhs) noexcept; +template +fstb_FORCEINLINE Vs32 operator >> (Vs32 lhs, T rhs) noexcept; + +fstb_FORCEINLINE Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept; + +fstb_FORCEINLINE Vs32 abs (const Vs32 &v) noexcept; +fstb_FORCEINLINE Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept; +fstb_FORCEINLINE Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept; +fstb_FORCEINLINE Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept; +fstb_FORCEINLINE std::tuple swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept; + + + +} // namespace fstb + + + +#include "fstb/Vs32.hpp" + + + +#endif // fstb_Vs32_HEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/Vs32.hpp b/src/fstb/Vs32.hpp new file mode 100644 index 0000000..e6b2510 --- /dev/null +++ b/src/fstb/Vs32.hpp @@ -0,0 +1,1142 @@ +/***************************************************************************** + + Vs32.hpp + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#if ! defined (fstb_Vs32_CODEHEADER_INCLUDED) +#define fstb_Vs32_CODEHEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/fnc.h" + +#include + +#include + + + +namespace fstb +{ + + + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +// Returns a0 | a0 | a0 | a0 +Vs32::Vs32 (Scalar a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { a, a, a, a } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set1_epi32 (a) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { vdupq_n_s32 (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Returns a0 | a1 | a2 | a3 +Vs32::Vs32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept +#if ! 
defined (fstb_HAS_SIMD) +: _x { a0, a1, a2, a3 } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_epi32 (a3, a2, a1, a0) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { a0, a1, a2, a3 } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Returns a0 | a1 | a2 | a3 +Vs32::Vs32 (const std::tuple &a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +template +void Vs32::store (MEM *ptr) const noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_s32 (reinterpret_cast (ptr), _x); +#endif // fstb_ARCHI +} + + + +template +void Vs32::storeu (MEM *ptr) const noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_u8 (reinterpret_cast (ptr), vreinterpretq_u8_s32 (_x)); +#endif // fstb_ARCHI +} + + + +// n = number of scalars to store (from the LSB) +template +void Vs32::storeu_part (MEM *ptr, int n) const noexcept +{ + assert (n > 0); + + if (n >= _length) + { + storeu (ptr); + return; + } + + int32_t * f_ptr = reinterpret_cast (ptr); + +#if ! defined (fstb_HAS_SIMD) + + for (int i = 0; i < n; ++i) + { + f_ptr [i] = _x [i]; + } + +#elif fstb_ARCHI == fstb_ARCHI_X86 + + f_ptr [0] = _mm_cvtsi128_si32 (_x); + if (n >= 2) + { + f_ptr [1] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0)); + if (n >= 3) + { + f_ptr [1] = _mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0)); + } + } + +#elif fstb_ARCHI == fstb_ARCHI_ARM + + vst1q_lane_s32 (f_ptr + 0, _x, 0); + if (n >= 2) + { + vst1q_lane_s32 (f_ptr + 1, _x, 1); + if (n >= 3) + { + vst1q_lane_s32 (f_ptr + 2, _x, 2); + } + } + +#endif +} + + + +// Works only with well-formed condition results (tested bits depend on the +// implementation). +// For each scalar, true = all bits set, false = all bits cleared +Vs32::operator bool () const noexcept +{ + return and_h (); +} + + + +Vs32 & Vs32::operator += (const Vs32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] += other [0]; + _x [1] += other [1]; + _x [2] += other [2]; + _x [3] += other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_add_epi32 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vaddq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator -= (const Vs32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] -= other [0]; + _x [1] -= other [1]; + _x [2] -= other [2]; + _x [3] -= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_sub_epi32 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vsubq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator *= (const Vs32Native &other) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + _x [0] *= other [0]; + _x [1] *= other [1]; + _x [2] *= other [2]; + _x [3] *= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + // Emulation of _mm_mullo_epi32 (SSE4.1) +# if fstb_COMPILER == fstb_COMPILER_MSVC + // For some reason this code is slightly faster on MSVC + auto p02_64 = _mm_mul_epu32 (_x, other); + auto p13_64 = _mm_mul_epu32 ( + _mm_srli_si128 (_x , 4), + _mm_srli_si128 (other, 4) + ); + p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2)); + p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2)); + _x = _mm_unpacklo_epi32 (p02_64, p13_64); +# else + // Code of this function shamelessly borrowed from tp7 + // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h + // This code is faster on GCC/Clang + const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1) + const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1) + const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0) + const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1) + const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0) + const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2) + _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0) +# endif // fstb_COMPILER +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vmulq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator &= (const Vs32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] &= other [0]; + _x [1] &= other [1]; + _x [2] &= other [2]; + _x [3] &= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_and_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vandq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator |= (const Vs32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] |= other [0]; + _x [1] |= other [1]; + _x [2] |= other [2]; + _x [3] |= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_or_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vorrq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator ^= (const Vs32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] ^= other [0]; + _x [1] ^= other [1]; + _x [2] ^= other [2]; + _x [3] ^= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_xor_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = veorq_s32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator <<= (int imm) noexcept +{ + assert (imm >= 0); + assert (imm <= 32); +#if ! defined (fstb_HAS_SIMD) + _x [0] <<= imm; + _x [1] <<= imm; + _x [2] <<= imm; + _x [3] <<= imm; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_slli_epi32 (_x, imm); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x <<= imm; +#endif // fstb_ARCHI + return *this; +} + + + +Vs32 & Vs32::operator >>= (int imm) noexcept +{ + assert (imm >= 0); + assert (imm <= 32); +#if ! defined (fstb_HAS_SIMD) + _x [0] >>= imm; + _x [1] >>= imm; + _x [2] >>= imm; + _x [3] >>= imm; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_srai_epi32 (_x, imm); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x >>= imm; +#endif // fstb_ARCHI + return *this; +} + + + +// -(1<<31) stays constant +Vs32 Vs32::operator - () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vs32 { + -_x [0], + -_x [1], + -_x [2], + -_x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 (_mm_setzero_si128 (), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vnegq_s32 (_x); +#endif // fstb_ARCHI +} + + + +Vs32 Vs32::operator ~ () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + ~(_x [0]), + ~(_x [1]), + ~(_x [2]), + ~(_x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_xor_si128 (_x, _mm_set1_epi32 (-1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmvnq_s32 (_x); +#endif // fstb_ARCHI +} + + + +Vs32 Vs32::is_lt_0 () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (_x [0] < 0) ? -1 : 0, + (_x [1] < 0) ? -1 : 0, + (_x [2] < 0) ? -1 : 0, + (_x [3] < 0) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmplt_epi32 (_x, _mm_setzero_si128 ()); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vshrq_n_s32 (_x, 31); +#endif // fstb_ARCHI +} + + + +Vs32 Vs32::reverse () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { _x [3], _x [2], _x [1], _x [0] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vrev64q_s32 (vcombine_s32 (vget_high_s32 (_x), vget_low_s32 (_x))); +#endif // fstb_ARCHI +} + + + +// Positive = left +template +Vs32 Vs32::rotate () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + _x [(0 - SHIFT) & 3], + _x [(1 - SHIFT) & 3], + _x [(2 - SHIFT) & 3], + _x [(3 - SHIFT) & 3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (SHIFT & 3) + { + case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0)); + case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0)); + case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + default: return *this; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + switch (SHIFT & 3) + { + case 1: return vextq_s32 (_x, _x, 3); + case 2: return vextq_s32 (_x, _x, 2); + case 3: return vextq_s32 (_x, _x, 1); + default: return *this; + } +#endif // fstb_ARCHI +} + + + +template +int32_t Vs32::extract () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return _x [POS & 3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = _x; + switch (POS & 3) + { + case 1: a = _mm_shuffle_epi32 (a, 1); break; + case 2: a = _mm_shuffle_epi32 (a, 2); break; + case 3: a = _mm_shuffle_epi32 (a, 3); break; + default: /* Nothing */ break; + } + return _mm_cvtsi128_si32 (a); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vgetq_lane_s32 (_x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vs32 Vs32::insert (int32_t val) const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + auto a = *this; + a._x [POS & 3] = val; + return a; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = rotate <(-POS) & 3> (); + a._x = _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps (a._x), + _mm_castsi128_ps (_mm_set1_epi32 (val)) + )); + return a.template rotate (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vsetq_lane_s32 (val, _x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vs32 Vs32::spread () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 (extract ()); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_s32 (vgetq_lane_s32 (_x, POS & 3)); +#endif // fstb_ARCHI +} + + + +int32_t Vs32::sum_h () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return (_x [0] + _x [2]) + (_x [1] + _x [3]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + // s = v3,v2,v1,v0 + const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)); + const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0 + return _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if fstb_WORD_SIZE == 64 + return vaddvq_s32 (_x); + #else + int32x2_t v2 = vadd_s32 (vget_high_s32 (_x), vget_low_s32 (_x)); + return vget_lane_s32 (vpadd_s32 (v2, v2), 0); + #endif +#endif // fstb_ARCHI +} + + + +int32_t Vs32::min_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2)); + const auto v1 = _mm_shuffle_epi32 (v0, 1); + return std::min (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + int32x2_t v2 = vmin_s32 (vget_high_s32 (_x), vget_low_s32 (_x)); + return vget_lane_s32 (vpmin_s32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +int32_t Vs32::max_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2)); + const auto v1 = _mm_shuffle_epi32 (v0, 1); + return std::max (_mm_cvtsi128_si32 (v0), _mm_cvtsi128_si32 (v1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + int32x2_t v2 = vmax_s32 (vget_high_s32 (_x), vget_low_s32 (_x)); + return vget_lane_s32 (vpmax_s32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). +// For each scalar, true = all bits set, false = all bits cleared +bool Vs32::and_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const int32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]); + return (t == -1); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_epi8 (_x) == 0xFFFF); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 ( + vqmovn_u32 (vreinterpretq_u32_s32 (_x)) + ); + return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU + && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). +// For each scalar, true = all bits set, false = all bits cleared +bool Vs32::or_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const int32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]); + return (t != 0); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_epi8 (_x) != 0); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 ( + vqmovn_u32 (vreinterpretq_u32_s32 (_x)) + ); + return ( vget_lane_u32 (tmp, 0) != 0 + || vget_lane_u32 (tmp, 1) != 0); +#endif // fstb_ARCHI +} + + + +// Moves the boolean content of each 4 scalar into the lower 4 bits of the +// return value. +// Assumes the object is a result of a comparison, with all bits the same +// in each 32-bit element. +unsigned int Vs32::movemask () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return + (_x [0] >> 31) + | ((_x [1] >> 30) & 2) + | ((_x [2] >> 29) & 4) + | ((_x [3] >> 28) & 8); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return static_cast (_mm_movemask_ps (_mm_castsi128_ps (_x))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + uint64x2_t tmp1 = + vreinterpretq_u64_s32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa + tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba + uint64x1_t tmp2 = vsli_n_u64 ( + vget_high_u64 (tmp1), + vget_low_u64 (tmp1), + 2 + ); + return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF; +#endif // fstb_ARCHI +} + + + +int Vs32::count_bits () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555); + uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555); + uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555); + uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555); + v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333); + v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333); + v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333); + v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333); + const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + return (c0 + c2) + (c1 + c3); +#elif fstb_ARCHI == fstb_ARCHI_X86 + // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register + static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77); + static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F); + // Count bits in each 4-bit field. + auto x = _x; + auto n = _mm_srli_epi64 (x, 1); + n = _mm_and_si128 (popcount_mask1, n); + x = _mm_sub_epi8 (x, n); + n = _mm_srli_epi64 (n, 1); + n = _mm_and_si128 (popcount_mask1, n); + x = _mm_sub_epi8 (x, n); + n = _mm_srli_epi64 (n, 1); + n = _mm_and_si128 (popcount_mask1, n); + n = _mm_sub_epi8 (x, n); + n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4)); + n = _mm_and_si128 (popcount_mask2, n); + // Counts the number of bits in the low and high 64-bit parts + n = _mm_sad_epu8 (n, _mm_setzero_si128 ()); + // Counts the number of bits in the whole 128-bit register + n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n)); + return _mm_cvtsi128_si32 (n); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_s32 (_x)); + const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8); + const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16); + const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32); + const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64); + return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2); +#endif // fstb_ARCHI +} + + + +Vs32 Vs32::zero () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { 0, 0, 0, 0 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_setzero_si128 (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_s32 (0); +#endif // fstb_ARCHI +} + + + +Vs32 Vs32::all1 () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { -1, -1, -1, -1 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_set1_epi32 (-1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_s32 (-1); +#endif // fstb_ARCHI +} + + + +// "true" must be 1 and nothing else. +Vs32 Vs32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vs32 { + -int32_t (m0), + -int32_t (m1), + -int32_t (m2), + -int32_t (m3), + }; +#elif 1 // Fast version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 ( + _mm_setzero_si128 (), + _mm_set_epi32 (m3, m2, m1, m0) + ); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (m0); + float32x2_t v23 = vdup_n_f32 (m2); + v01 = vset_lane_f32 (m1, v01, 1); + v23 = vset_lane_f32 (m3, v23, 1); + return vnegq_s32 (vreinterpretq_s32_f32 ( + vcombine_f32 (v01, v23) + )); +# endif // fstb_ARCHI +#else // Safer but slower version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 ( + _mm_set_epi32 (!m3, !m2, !m1, !m0), + _mm_set1_epi32 (1) + ); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (!m0); + float32x2_t v23 = vdup_n_f32 (!m2); + v01 = vset_lane_f32 (!m1, v01, 1); + v23 = vset_lane_f32 (!m3, v23, 1); + const auto one = vdupq_n_s32 (1); + return vsubq_s32 ( + vreinterpretq_s32_f32 (vcombine_f32 (v01, v23)), + one + ); +# endif // fstb_ARCHI +#endif // Versions +} + + + +// Extracts the vector at the position SHIFT from the double-width vector {a b} +// Concatenates a [SHIFT...3] with b [0...3-SHIFT] +template +Vs32 Vs32::compose (Vs32 a, Vs32 b) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + switch (POS & 3) + { + case 1: return Vs32 { a._x [1], a._x [2], a._x [3], b._x [0] }; + case 2: return Vs32 { a._x [2], a._x [3], b._x [0], b._x [1] }; + case 3: return Vs32 { a._x [3], b._x [0], b._x [1], b._x [2] }; + default: return a; + } + return a; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (POS & 3) + { + case 1: + { + const auto tmp = _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x) + )); + return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + } + case 2: + return _mm_castps_si128 (_mm_shuffle_ps ( + _mm_castsi128_ps (a._x), + _mm_castsi128_ps (b._x), + (1<<6) | (0<<4) | (3<<2) | (2<<0) + )); + case 3: + return _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps ( + _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)) + ), + _mm_castsi128_ps ( + _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)) + ) + )); + default: + return a; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + if (POS == 0) + { + return a; + } + else + { + return vextq_s32 (a._x, b._x, POS); + } +#endif // fstb_ARCHI +} + + + +template +Vs32 Vs32::load (const MEM *ptr) noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_load_si128 (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vld1q_s32 (reinterpret_cast (ptr)); +#endif // fstb_ARCHI +} + + + +template +Vs32 Vs32::loadu (const MEM *ptr) noexcept +{ + assert (ptr != nullptr); + +#if ! 
defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_loadu_si128 (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u8 ( + vld1q_u8 (reinterpret_cast (ptr)) + ); +#endif // fstb_ARCHI +} + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ GLOBAL OPERATORS AND FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +Vs32 operator + (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs += rhs; + return lhs; +} + +Vs32 operator - (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs -= rhs; + return lhs; +} + +Vs32 operator * (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs *= rhs; + return lhs; +} + +Vs32 operator & (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs &= rhs; + return lhs; +} + +Vs32 operator | (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs |= rhs; + return lhs; +} + +Vs32 operator ^ (Vs32 lhs, const Vs32 &rhs) noexcept +{ + lhs ^= rhs; + return lhs; +} + + + +template +Vs32 operator << (Vs32 lhs, T rhs) noexcept +{ + lhs <<= rhs; + return lhs; +} + +template +Vs32 operator >> (Vs32 lhs, T rhs) noexcept +{ + lhs >>= rhs; + return lhs; +} + + + +Vs32 operator == (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] == rhs._x [0]) ? -1 : 0, + (lhs._x [1] == rhs._x [1]) ? -1 : 0, + (lhs._x [2] == rhs._x [2]) ? -1 : 0, + (lhs._x [3] == rhs._x [3]) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpeq_epi32 (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vceqq_s32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vs32 operator != (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] != rhs._x [0]) ? -1 : 0, + (lhs._x [1] != rhs._x [1]) ? -1 : 0, + (lhs._x [2] != rhs._x [2]) ? -1 : 0, + (lhs._x [3] != rhs._x [3]) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto eq = _mm_cmpeq_epi32 (lhs, rhs); + return _mm_xor_si128 (eq, _mm_set1_epi32 (-1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vmvnq_u32 (vceqq_s32 (lhs, rhs))); +#endif // fstb_ARCHI +} + + + +Vs32 operator < (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] < rhs._x [0]) ? -1 : 0, + (lhs._x [1] < rhs._x [1]) ? -1 : 0, + (lhs._x [2] < rhs._x [2]) ? -1 : 0, + (lhs._x [3] < rhs._x [3]) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmplt_epi32 (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vcltq_s32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vs32 operator <= (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] <= rhs._x [0]) ? -1 : 0, + (lhs._x [1] <= rhs._x [1]) ? -1 : 0, + (lhs._x [2] <= rhs._x [2]) ? -1 : 0, + (lhs._x [3] <= rhs._x [3]) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 +# if 1 + return (lhs < rhs) | (lhs == rhs); +# else + return ~(lhs > rhs); +# endif +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vcleq_s32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vs32 operator > (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] > rhs._x [0]) ? -1 : 0, + (lhs._x [1] > rhs._x [1]) ? -1 : 0, + (lhs._x [2] > rhs._x [2]) ? -1 : 0, + (lhs._x [3] > rhs._x [3]) ? 
-1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpgt_epi32 (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vcgtq_s32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vs32 operator >= (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + (lhs._x [0] >= rhs._x [0]) ? -1 : 0, + (lhs._x [1] >= rhs._x [1]) ? -1 : 0, + (lhs._x [2] >= rhs._x [2]) ? -1 : 0, + (lhs._x [3] >= rhs._x [3]) ? -1 : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 +# if 1 + return (lhs > rhs) | (lhs == rhs); +# else + return ~(lhs < rhs); +# endif +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_s32_u32 (vcgeq_s32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +// Result is undefined for -(1<<31). +Vs32 abs (const Vs32 &v) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + std::abs (v._x [0]), + std::abs (v._x [1]), + std::abs (v._x [2]), + std::abs (v._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v_neg = _mm_sub_epi32 (_mm_setzero_si128 (), v); + return max (v, v_neg); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vqabsq_s32 (v); +#endif // fstb_ARCHI +} + + + +Vs32 min (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + std::min (lhs._x [0], rhs._x [0]), + std::min (lhs._x [1], rhs._x [1]), + std::min (lhs._x [2], rhs._x [2]), + std::min (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto gt = (lhs > rhs); + return _mm_or_si128 ( + _mm_and_si128 ( gt, rhs), + _mm_andnot_si128 (gt, lhs) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vminq_s32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vs32 max (const Vs32 &lhs, const Vs32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vs32 { + std::max (lhs._x [0], rhs._x [0]), + std::max (lhs._x [1], rhs._x [1]), + std::max (lhs._x [2], rhs._x [2]), + std::max (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto lt = (lhs < rhs); + return _mm_or_si128 ( + _mm_and_si128 ( lt, rhs), + _mm_andnot_si128 (lt, lhs) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmaxq_s32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vs32 limit (const Vs32 &v, const Vs32 &mi, const Vs32 &ma) noexcept +{ + return min (max (v, mi), ma); +} + + + +Vs32 select (const Vs32 &cond, const Vs32 &v_t, const Vs32 &v_f) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/ + return Vs32 { + (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]), + (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]), + (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]), + (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto cond_1 = _mm_and_si128 (cond, v_t); + const auto cond_0 = _mm_andnot_si128 (cond, v_f); + return _mm_or_si128 (cond_0, cond_1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vbslq_s32 (vreinterpretq_u32_s32 (cond), v_t, v_f); +#endif // fstb_ARCHI +} + + + +std::tuple swap_if (const Vs32 &cond, Vs32 lhs, Vs32 rhs) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); } + if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); } + if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); } + if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); } + return std::make_tuple (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond); + return std::make_tuple ( + Vs32 (_mm_xor_si128 (lhs, inv)), + Vs32 (_mm_xor_si128 (rhs, inv)) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const auto cond_u = vreinterpretq_u32_s32 (cond); + return std::make_tuple ( + Vs32 (vbslq_s32 (cond_u, rhs, lhs)), + Vs32 (vbslq_s32 (cond_u, lhs, rhs)) + ); +#endif // fstb_ARCHI +} + + + +} // namespace fstb + + + +#endif // fstb_Vs32_CODEHEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/Vu32.h b/src/fstb/Vu32.h new file mode 100644 index 0000000..2341621 --- /dev/null +++ b/src/fstb/Vu32.h @@ -0,0 +1,259 @@ +/***************************************************************************** + + Vu32.h + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#pragma once +#if ! defined (fstb_Vu32_HEADER_INCLUDED) +#define fstb_Vu32_HEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/def.h" + +#if ! defined (fstb_HAS_SIMD) + #include +#elif (fstb_ARCHI == fstb_ARCHI_X86) + #include +#elif (fstb_ARCHI == fstb_ARCHI_ARM) + #include +#else + #error +#endif + +#include + +#include + + + +namespace fstb +{ + + + +#if ! 
defined (fstb_HAS_SIMD) + +typedef std::array Vu32Native; + +#elif fstb_ARCHI == fstb_ARCHI_X86 + +typedef __m128i Vu32Native; + +#elif fstb_ARCHI == fstb_ARCHI_ARM + +typedef uint32x4_t Vu32Native; + +#else // fstb_ARCHI +#error +#endif // fstb_ARCHI + + + +class Vu32 +{ + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +public: + + static constexpr int _len_l2 = 2; + static constexpr int _length = 1 << _len_l2; + typedef uint32_t Scalar; + + Vu32 () = default; + fstb_FORCEINLINE + Vu32 (Vu32Native a) noexcept : _x { a } {} + explicit fstb_FORCEINLINE + Vu32 (Scalar a) noexcept; + explicit fstb_FORCEINLINE + Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept; + explicit fstb_FORCEINLINE + Vu32 (const std::tuple &a) noexcept; + Vu32 (const Vu32 &other) = default; + Vu32 (Vu32 &&other) = default; + ~Vu32 () = default; + Vu32 & operator = (const Vu32 &other) = default; + Vu32 & operator = (Vu32 &&other) = default; + + template + fstb_FORCEINLINE void + store (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu (MEM *ptr) const noexcept; + template + fstb_FORCEINLINE void + storeu_part (MEM *ptr, int n) const noexcept; + + fstb_FORCEINLINE + operator Vu32Native () const noexcept { return _x; } + fstb_FORCEINLINE explicit + operator bool () const noexcept; + + fstb_FORCEINLINE Vu32 & + operator += (const Vu32Native &other) noexcept; + fstb_FORCEINLINE Vu32 & + operator -= (const Vu32Native &other) noexcept; + fstb_FORCEINLINE Vu32 & + operator *= (const Vu32Native &other) noexcept; + fstb_FORCEINLINE Vu32 & + operator *= (const Scalar &other) noexcept; + + fstb_FORCEINLINE Vu32 & + operator &= (const Vu32Native &other) noexcept; + fstb_FORCEINLINE Vu32 & + operator |= (const Vu32Native &other) noexcept; + fstb_FORCEINLINE Vu32 & + operator ^= (const Vu32Native &other) noexcept; + + fstb_FORCEINLINE Vu32 & + operator <<= (int imm) noexcept; + fstb_FORCEINLINE Vu32 & + operator >>= (int imm) noexcept; + + fstb_FORCEINLINE Vu32 + operator - () const noexcept; + fstb_FORCEINLINE Vu32 + operator ~ () const noexcept; + fstb_FORCEINLINE Vu32 + reverse () const noexcept; + + template + fstb_FORCEINLINE Vu32 + rotate () const noexcept; + template + fstb_FORCEINLINE uint32_t + extract () const noexcept; + template + fstb_FORCEINLINE Vu32 + insert (uint32_t val) const noexcept; + template + fstb_FORCEINLINE Vu32 + spread () const noexcept; + + fstb_FORCEINLINE std::tuple + explode () const noexcept; + + fstb_FORCEINLINE uint32_t + sum_h () const noexcept; + fstb_FORCEINLINE uint32_t + min_h () const noexcept; + fstb_FORCEINLINE uint32_t + max_h () const noexcept; + + fstb_FORCEINLINE bool + and_h () const noexcept; + fstb_FORCEINLINE bool + or_h () const noexcept; + fstb_FORCEINLINE unsigned int + movemask () const noexcept; + fstb_FORCEINLINE int + count_bits () const noexcept; + + static fstb_FORCEINLINE Vu32 + zero () noexcept; + static fstb_FORCEINLINE Vu32 + all1 () noexcept; + static fstb_FORCEINLINE Vu32 + set_mask (bool m0, bool m1, bool m2, bool m3) noexcept; + template + static fstb_FORCEINLINE Vu32 + compose (Vu32 a, Vu32 b) noexcept; + static fstb_FORCEINLINE Vu32 + flip_msb (Vu32 x) noexcept; + + template + static fstb_FORCEINLINE Vu32 + load (const MEM *ptr) noexcept; + template + static fstb_FORCEINLINE Vu32 + loadu (const MEM *ptr) noexcept; + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +protected: + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + 
+private: + +#if ! defined (fstb_HAS_SIMD) +public: +#endif + Vu32Native _x; +private: + + + +/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + +}; // class Vu32 + + + +/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +fstb_FORCEINLINE Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept; +fstb_FORCEINLINE Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept; + +template +fstb_FORCEINLINE Vu32 operator << (Vu32 lhs, T rhs) noexcept; +template +fstb_FORCEINLINE Vu32 operator >> (Vu32 lhs, T rhs) noexcept; + +fstb_FORCEINLINE Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept; + +fstb_FORCEINLINE Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept; +fstb_FORCEINLINE Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept; +fstb_FORCEINLINE Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept; +fstb_FORCEINLINE std::tuple swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept; + + + +} // namespace fstb + + + +#include "fstb/Vu32.hpp" + + + +#endif // fstb_Vu32_HEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/Vu32.hpp b/src/fstb/Vu32.hpp new file mode 100644 index 0000000..a1f851d --- /dev/null +++ b/src/fstb/Vu32.hpp @@ -0,0 +1,1139 @@ +/***************************************************************************** + + Vu32.hpp + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#if ! defined (fstb_Vu32_CODEHEADER_INCLUDED) +#define fstb_Vu32_CODEHEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include + +#include + + + +namespace fstb +{ + + + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +Vu32::Vu32 (Scalar a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { a, a, a, a } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set1_epi32 (int32_t (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { vdupq_n_u32 (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Returns a0 | a1 | a2 | a3 +Vu32::Vu32 (Scalar a0, Scalar a1, Scalar a2, Scalar a3) noexcept +#if ! 
defined (fstb_HAS_SIMD) +: _x { a0, a1, a2, a3 } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_epi32 (a3, a2, a1, a0) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { a0, a1, a2, a3 } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +// Returns a0 | a1 | a2 | a3 +Vu32::Vu32 (const std::tuple &a) noexcept +#if ! defined (fstb_HAS_SIMD) +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#elif fstb_ARCHI == fstb_ARCHI_X86 +: _x { _mm_set_epi32 (std::get <3> (a), std::get <2> (a), std::get <1> (a), std::get <0> (a)) } +#elif fstb_ARCHI == fstb_ARCHI_ARM +: _x { std::get <0> (a), std::get <1> (a), std::get <2> (a), std::get <3> (a) } +#endif // fstb_ARCHI +{ + // Nothing +} + + + +template +void Vu32::store (MEM *ptr) const noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_store_si128 (reinterpret_cast <__m128i *> (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_u32 (reinterpret_cast (ptr), _x); +#endif // fstb_ARCHI +} + + + +template +void Vu32::storeu (MEM *ptr) const noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + *reinterpret_cast (ptr) = _x; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _mm_storeu_si128 (reinterpret_cast <__m128i *> (ptr), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + vst1q_u8 (reinterpret_cast (ptr), vreinterpretq_u8_u32 (_x)); +#endif // fstb_ARCHI +} + + + +// n = number of scalars to store (from the LSB) +template +void Vu32::storeu_part (MEM *ptr, int n) const noexcept +{ + assert (n > 0); + + if (n >= _length) + { + storeu (ptr); + return; + } + + uint32_t * f_ptr = reinterpret_cast (ptr); + +#if ! defined (fstb_HAS_SIMD) + + for (int i = 0; i < n; ++i) + { + f_ptr [i] = _x [i]; + } + +#elif fstb_ARCHI == fstb_ARCHI_X86 + + f_ptr [0] = uint32_t (_mm_cvtsi128_si32 (_x)); + if (n >= 2) + { + f_ptr [1] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 1 << 0))); + if (n >= 3) + { + f_ptr [1] = uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, 2 << 0))); + } + } + +#elif fstb_ARCHI == fstb_ARCHI_ARM + + vst1q_lane_u32 (f_ptr + 0, _x, 0); + if (n >= 2) + { + vst1q_lane_u32 (f_ptr + 1, _x, 1); + if (n >= 3) + { + vst1q_lane_u32 (f_ptr + 2, _x, 2); + } + } + +#endif +} + + + +// Works only with well-formed condition results (tested bits depend on the +// implementation). +// For each scalar, true = all bits set, false = all bits cleared +Vu32::operator bool () const noexcept +{ + return and_h (); +} + + + +Vu32 & Vu32::operator += (const Vu32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] += other [0]; + _x [1] += other [1]; + _x [2] += other [2]; + _x [3] += other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_add_epi32 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vaddq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator -= (const Vu32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] -= other [0]; + _x [1] -= other [1]; + _x [2] -= other [2]; + _x [3] -= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_sub_epi32 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vsubq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator *= (const Vu32Native &other) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + _x [0] *= other [0]; + _x [1] *= other [1]; + _x [2] *= other [2]; + _x [3] *= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + // Emulation of _mm_mullo_epi32 (SSE4.1) +# if fstb_COMPILER == fstb_COMPILER_MSVC + // For some reason this code is slightly faster on MSVC + auto p02_64 = _mm_mul_epu32 (_x, other); + auto p13_64 = _mm_mul_epu32 ( + _mm_srli_si128 (_x , 4), + _mm_srli_si128 (other, 4) + ); + p02_64 = _mm_shuffle_epi32 (p02_64, (0 << 0) | (2 << 2)); + p13_64 = _mm_shuffle_epi32 (p13_64, (0 << 0) | (2 << 2)); + _x = _mm_unpacklo_epi32 (p02_64, p13_64); +# else + // Code of this function shamelessly borrowed from tp7 + // https://github.com/tp7/masktools/blob/16bit/masktools/common/simd.h + // This code is faster on GCC/Clang + const __m128i lhs13 = _mm_shuffle_epi32 (_x, 0xF5); // (-,a3,-,a1) + const __m128i rhs13 = _mm_shuffle_epi32 (other, 0xF5); // (-,b3,-,b1) + const __m128i prod02 = _mm_mul_epu32 (_x, other); // (-,a2*b2,-,a0*b0) + const __m128i prod13 = _mm_mul_epu32 (lhs13, rhs13); // (-,a3*b3,-,a1*b1) + const __m128i prod01 = _mm_unpacklo_epi32 (prod02, prod13); // (-,-,a1*b1,a0*b0) + const __m128i prod23 = _mm_unpackhi_epi32 (prod02, prod13); // (-,-,a3*b3,a2*b2) + _x = _mm_unpacklo_epi64 (prod01 ,prod23); // (ab3,ab2,ab1,ab0) +# endif // fstb_COMPILER +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vmulq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator *= (const Scalar &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] *= other; + _x [1] *= other; + _x [2] *= other; + _x [3] *= other; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto vb = _mm_set1_epi32 (int32_t (other)); + auto v0 = _mm_shuffle_epi32 (_x, (0<<0) | (1<<4)); + auto v1 = _mm_shuffle_epi32 (_x, (2<<0) | (3<<4)); + v0 = _mm_mul_epu32 (v0, vb); + v1 = _mm_mul_epu32 (v1, vb); + _x = _mm_castps_si128 (_mm_shuffle_ps ( + _mm_castsi128_ps (v0), + _mm_castsi128_ps (v1), + (0<<0) | (2<<2) | (0<<4) | (2<<6) + )); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vmulq_u32 (_x, vdupq_n_u32 (other)); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator &= (const Vu32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] &= other [0]; + _x [1] &= other [1]; + _x [2] &= other [2]; + _x [3] &= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_and_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vandq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator |= (const Vu32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] |= other [0]; + _x [1] |= other [1]; + _x [2] |= other [2]; + _x [3] |= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_or_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = vorrq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator ^= (const Vu32Native &other) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + _x [0] ^= other [0]; + _x [1] ^= other [1]; + _x [2] ^= other [2]; + _x [3] ^= other [3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_xor_si128 (_x, other); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x = veorq_u32 (_x, other); +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator <<= (int imm) noexcept +{ + assert (imm >= 0); + assert (imm <= 32); +#if ! 
defined (fstb_HAS_SIMD) + _x [0] <<= imm; + _x [1] <<= imm; + _x [2] <<= imm; + _x [3] <<= imm; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_slli_epi32 (_x, imm); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x <<= imm; +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 & Vu32::operator >>= (int imm) noexcept +{ + assert (imm >= 0); + assert (imm <= 32); +#if ! defined (fstb_HAS_SIMD) + _x [0] >>= imm; + _x [1] >>= imm; + _x [2] >>= imm; + _x [3] >>= imm; +#elif fstb_ARCHI == fstb_ARCHI_X86 + _x = _mm_srli_epi32 (_x, imm); +#elif fstb_ARCHI == fstb_ARCHI_ARM + _x >>= imm; +#endif // fstb_ARCHI + return *this; +} + + + +Vu32 Vu32::operator - () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + -_x [0], + -_x [1], + -_x [2], + -_x [3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 (_mm_setzero_si128 (), _x); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_u32 (_x))); +#endif // fstb_ARCHI +} + + + +Vu32 Vu32::operator ~ () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + ~(_x [0]), + ~(_x [1]), + ~(_x [2]), + ~(_x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_xor_si128 (_x, _mm_set1_epi32 (-1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmvnq_u32 (_x); +#endif // fstb_ARCHI +} + + + +Vu32 Vu32::reverse () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { _x [3], _x [2], _x [1], _x [0] }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_epi32 (_x, (3<<0) + (2<<2) + (1<<4) + (0<<6)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vrev64q_u32 (vcombine_u32 (vget_high_u32 (_x), vget_low_u32 (_x))); +#endif // fstb_ARCHI +} + + + +// Positive = left +template +Vu32 Vu32::rotate () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + _x [(0 - SHIFT) & 3], + _x [(1 - SHIFT) & 3], + _x [(2 - SHIFT) & 3], + _x [(3 - SHIFT) & 3] + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (SHIFT & 3) + { + case 1: return _mm_shuffle_epi32 (_x, (2<<6) | (1<<4) | (0<<2) | (3<<0)); + case 2: return _mm_shuffle_epi32 (_x, (1<<6) | (0<<4) | (3<<2) | (2<<0)); + case 3: return _mm_shuffle_epi32 (_x, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + default: return *this; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + switch (SHIFT & 3) + { + case 1: return vextq_u32 (_x, _x, 3); + case 2: return vextq_u32 (_x, _x, 2); + case 3: return vextq_u32 (_x, _x, 1); + default: return *this; + } +#endif // fstb_ARCHI +} + + + +template +uint32_t Vu32::extract () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return _x [POS & 3]; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = _x; + switch (POS & 3) + { + case 1: a = _mm_shuffle_epi32 (a, 1); break; + case 2: a = _mm_shuffle_epi32 (a, 2); break; + case 3: a = _mm_shuffle_epi32 (a, 3); break; + default: /* Nothing */ break; + } + return Scalar (_mm_cvtsi128_si32 (a)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vgetq_lane_u32 (_x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vu32 Vu32::insert (uint32_t val) const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + auto a = *this; + a._x [POS & 3] = val; + return a; +#elif fstb_ARCHI == fstb_ARCHI_X86 + auto a = rotate <(-POS) & 3> (); + a._x = _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps (a._x), + _mm_castsi128_ps (_mm_set1_epi32 (int32_t (val))) + )); + return a.template rotate (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vsetq_lane_u32 (val, _x, POS & 3); +#endif // fstb_ARCHI +} + + + +template +Vu32 Vu32::spread () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return Vu32 (extract ()); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_shuffle_epi32 (_x, 0x55 * (POS & 3)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_u32 (vgetq_lane_u32 (_x, POS & 3)); +#endif // fstb_ARCHI +} + + + +uint32_t Vu32::sum_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return (_x [0] + _x [2]) + (_x [1] + _x [3]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + // s = v3,v2,v1,v0 + const auto s = _mm_shuffle_epi32 (_x, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)); + const auto v = _mm_add_epi32 (_x, s); // v0+v3,v1+v2,v2+v1,v3+v0 + return uint32_t ( + _mm_cvtsi128_si32 (_mm_add_epi32 (v, _mm_shuffle_epi32 (v, 1 << 0))) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + #if fstb_WORD_SIZE == 64 + return vaddvq_u32 (_x); + #else + uint32x2_t v2 = vadd_u32 (vget_high_u32 (_x), vget_low_u32 (_x)); + return vget_lane_u32 (vpadd_u32 (v2, v2), 0); + #endif +#endif // fstb_ARCHI +} + + + +uint32_t Vu32::min_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::min (std::min (_x [0], _x [2]), std::min (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v0 = min (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2)); + const auto v1 = _mm_shuffle_epi32 (v0, 1); + return std::min ( + uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1)) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + uint32x2_t v2 = vmin_u32 (vget_high_u32 (_x), vget_low_u32 (_x)); + return vget_lane_u32 (vpmin_u32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +uint32_t Vu32::max_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return std::max (std::max (_x [0], _x [2]), std::max (_x [1], _x [3])); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto v0 = max (*this, _mm_shuffle_epi32 (_x, (3 << 2) | 2)); + const auto v1 = _mm_shuffle_epi32 (v0, 1); + return std::max ( + uint32_t (_mm_cvtsi128_si32 (v0)), uint32_t (_mm_cvtsi128_si32 (v1)) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + uint32x2_t v2 = vmax_u32 (vget_high_u32 (_x), vget_low_u32 (_x)); + return vget_lane_u32 (vpmax_u32 (v2, v2), 0); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). +// For each scalar, true = all bits set, false = all bits cleared +bool Vu32::and_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const uint32_t t = (_x [0] & _x [1]) & (_x [2] & _x [3]); + return (t == uint32_t (-1)); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_epi8 (_x) == 0xFFFF); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x)); + return ( vget_lane_u32 (tmp, 0) == 0xFFFFFFFFU + && vget_lane_u32 (tmp, 1) == 0xFFFFFFFFU); +#endif // fstb_ARCHI +} + + + +// Works only with well-formed condition results (tested bits depends on the implementation). +// For each scalar, true = all bits set, false = all bits cleared +bool Vu32::or_h () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + const uint32_t t = (_x [0] | _x [1]) | (_x [2] | _x [3]); + return (t != 0); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return (_mm_movemask_epi8 (_x) != 0); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint32x2_t tmp = vreinterpret_u32_u16 (vqmovn_u32 (_x)); + return ( vget_lane_u32 (tmp, 0) != 0 + || vget_lane_u32 (tmp, 1) != 0); +#endif // fstb_ARCHI +} + + + +// Moves the boolean content of each 4 scalar into the lower 4 bits of the +// return value. +// Assumes the object is a result of a comparison, with all bits the same +// in each 32-bit element. 
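+//
+// Illustrative example (not taken from the original file; it only assumes the
+// Vu32 interface declared above): for a lane mask produced by a comparison,
+//
+//   const auto m = (Vu32 (1, 5, 3, 7) > Vu32 (2, 2, 2, 2));  // lanes: F, T, T, T
+//   m.movemask ();    // == 0b1110 == 14, one bit per 32-bit lane
+//   m.count_bits ();  // == 96, three lanes with all 32 bits set
+//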
+unsigned int Vu32::movemask () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return + (_x [0] >> 31) + | ((_x [1] >> 30) & 2) + | ((_x [2] >> 29) & 4) + | ((_x [3] >> 28) & 8); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return static_cast (_mm_movemask_ps (_mm_castsi128_ps (_x))); +#elif fstb_ARCHI == fstb_ARCHI_ARM + uint64x2_t tmp1 = + vreinterpretq_u64_u32 (_x); // ddd...ddd ccc...ccc bbb...bbb aaa...aaa + tmp1 = vshrq_n_u64 (tmp1, 31); // 000...00d ddd...ddc 000...00b bbb...bba + uint64x1_t tmp2 = vsli_n_u64 ( + vget_high_u64 (tmp1), + vget_low_u64 (tmp1), + 2 + ); + return vget_lane_u32 (vreinterpret_u32_u64 (tmp2), 0) & 0xF; +#endif // fstb_ARCHI +} + + + +int Vu32::count_bits () const noexcept +{ +#if ! defined (fstb_HAS_SIMD) + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + uint32_t v0 = _x [0] - ((_x [0] >> 1) & 0x55555555); + uint32_t v1 = _x [1] - ((_x [1] >> 1) & 0x55555555); + uint32_t v2 = _x [2] - ((_x [2] >> 1) & 0x55555555); + uint32_t v3 = _x [3] - ((_x [3] >> 1) & 0x55555555); + v0 = (v0 & 0x33333333) + ((v0 >> 2) & 0x33333333); + v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333); + v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333); + v3 = (v3 & 0x33333333) + ((v3 >> 2) & 0x33333333); + const int c0 = (((v0 + (v0 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c1 = (((v1 + (v1 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c2 = (((v2 + (v2 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + const int c3 = (((v3 + (v3 >> 4)) & 0xF0F0F0FU) * 0x1010101) >> 24; + return (c0 + c2) + (c1 + c3); +#elif fstb_ARCHI == fstb_ARCHI_X86 + // https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register + static const __m128i popcount_mask1 = _mm_set1_epi8 (0x77); + static const __m128i popcount_mask2 = _mm_set1_epi8 (0x0F); + // Count bits in each 4-bit field. + auto x = _x; + auto n = _mm_srli_epi64 (x, 1); + n = _mm_and_si128 (popcount_mask1, n); + x = _mm_sub_epi8 (x, n); + n = _mm_srli_epi64 (n, 1); + n = _mm_and_si128 (popcount_mask1, n); + x = _mm_sub_epi8 (x, n); + n = _mm_srli_epi64 (n, 1); + n = _mm_and_si128 (popcount_mask1, n); + n = _mm_sub_epi8 (x, n); + n = _mm_add_epi8 (n, _mm_srli_epi16 (n, 4)); + n = _mm_and_si128 (popcount_mask2, n); + // Counts the number of bits in the low and high 64-bit parts + n = _mm_sad_epu8 (n, _mm_setzero_si128 ()); + // Counts the number of bits in the whole 128-bit register + n = _mm_add_epi32 (n, _mm_unpackhi_epi64 (n, n)); + return _mm_cvtsi128_si32 (n); +#elif fstb_ARCHI == fstb_ARCHI_ARM + const uint8x16_t cnt_8 = vcntq_u8 (vreinterpretq_u8_u32 (_x)); + const uint16x8_t cnt_16 = vpaddlq_u8 (cnt_8); + const uint32x4_t cnt_32 = vpaddlq_u16 (cnt_16); + const uint64x2_t cnt_64 = vpaddlq_u32 (cnt_32); + const int32x4_t cnt_s = vreinterpretq_s32_u64 (cnt_64); + return vgetq_lane_s32 (cnt_s, 0) + vgetq_lane_s32 (cnt_s, 2); +#endif // fstb_ARCHI +} + + + +std::tuple Vu32::explode () const noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + return std::make_tuple (_x [0], _x [1], _x [2], _x [3]); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return std::make_tuple ( + uint32_t (_mm_cvtsi128_si32 (_x )), + uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (1<<0)))), + uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (2<<0)))), + uint32_t (_mm_cvtsi128_si32 (_mm_shuffle_epi32 (_x, (3<<0)))) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return std::make_tuple ( + vgetq_lane_u32 (_x, 0), + vgetq_lane_u32 (_x, 1), + vgetq_lane_u32 (_x, 2), + vgetq_lane_u32 (_x, 3) + ); +#endif // fstb_ARCHI +} + + + +Vu32 Vu32::zero () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { 0, 0, 0, 0 }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_setzero_si128 (); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_u32 (0); +#endif // fstb_ARCHI +} + + + +Vu32 Vu32::all1 () noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { ~Scalar (0), ~Scalar (0), ~Scalar (0), ~Scalar (0) }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_set1_epi32 (-1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vdupq_n_u32 (~Scalar (0)); +#endif // fstb_ARCHI +} + + + +// "true" must be 1 and nothing else. +Vu32 Vu32::set_mask (bool m0, bool m1, bool m2, bool m3) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + -uint32_t (m0), + -uint32_t (m1), + -uint32_t (m2), + -uint32_t (m3), + }; +#elif 1 // Fast version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 ( + _mm_setzero_si128 (), + _mm_set_epi32 (m3, m2, m1, m0) + ); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (m0); + float32x2_t v23 = vdup_n_f32 (m2); + v01 = vset_lane_f32 (m1, v01, 1); + v23 = vset_lane_f32 (m3, v23, 1); + return vreinterpretq_u32_s32 (vnegq_s32 (vreinterpretq_s32_f32 ( + vcombine_f32 (v01, v23) + ))); +# endif // fstb_ARCHI +#else // Safer but slower version +# if fstb_ARCHI == fstb_ARCHI_X86 + return _mm_sub_epi32 ( + _mm_set_epi32 (!m3, !m2, !m1, !m0), + _mm_set1_epi32 (1) + ); +# elif fstb_ARCHI == fstb_ARCHI_ARM + float32x2_t v01 = vdup_n_f32 (!m0); + float32x2_t v23 = vdup_n_f32 (!m2); + v01 = vset_lane_f32 (!m1, v01, 1); + v23 = vset_lane_f32 (!m3, v23, 1); + const auto one = vdupq_n_u32 (1); + return vsubq_u32 ( + vreinterpretq_u32_f32 (vcombine_f32 (v01, v23)), + one + ); +# endif // fstb_ARCHI +#endif // Versions +} + + + +// Extracts the vector at the position SHIFT from the double-width vector {a b} +// Concatenates a [SHIFT...3] with b [0...3-SHIFT] +template +Vu32 Vu32::compose (Vu32 a, Vu32 b) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + switch (POS & 3) + { + case 1: return Vu32 { a._x [1], a._x [2], a._x [3], b._x [0] }; + case 2: return Vu32 { a._x [2], a._x [3], b._x [0], b._x [1] }; + case 3: return Vu32 { a._x [3], b._x [0], b._x [1], b._x [2] }; + default: return a; + } + return a; +#elif fstb_ARCHI == fstb_ARCHI_X86 + switch (POS & 3) + { + case 1: + { + const auto tmp = _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps (a._x), _mm_castsi128_ps (b._x) + )); + return _mm_shuffle_epi32 (tmp, (0<<6) | (3<<4) | (2<<2) | (1<<0)); + } + case 2: + return _mm_castps_si128 (_mm_shuffle_ps ( + _mm_castsi128_ps (a._x), + _mm_castsi128_ps (b._x), + (1<<6) | (0<<4) | (3<<2) | (2<<0) + )); + case 3: + return _mm_castps_si128 (_mm_move_ss ( + _mm_castsi128_ps ( + _mm_shuffle_epi32 (b._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)) + ), + _mm_castsi128_ps ( + _mm_shuffle_epi32 (a._x, (2<<6) | (1<<4) | (0<<2) | (3<<0)) + ) + )); + default: + return a; + } +#elif fstb_ARCHI == fstb_ARCHI_ARM + if (POS == 0) + { + return a; + } + else + { + return vextq_u32 (a._x, b._x, POS); + } +#endif // fstb_ARCHI +} + + + +Vu32 Vu32::flip_msb (Vu32 x) noexcept +{ + return x ^ Vu32 (0x80000000U); +} + + + +template +Vu32 Vu32::load (const MEM *ptr) noexcept +{ + assert (is_ptr_align_nz (ptr, fstb_SIMD128_ALIGN)); + +#if ! defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_load_si128 (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vld1q_u32 (reinterpret_cast (ptr)); +#endif // fstb_ARCHI +} + + + +template +Vu32 Vu32::loadu (const MEM *ptr) noexcept +{ + assert (ptr != nullptr); + +#if ! defined (fstb_HAS_SIMD) + return *reinterpret_cast (ptr); +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_loadu_si128 (reinterpret_cast (ptr)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vreinterpretq_u32_u8 ( + vld1q_u8 (reinterpret_cast (ptr)) + ); +#endif // fstb_ARCHI +} + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ GLOBAL OPERATORS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +Vu32 operator + (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs += rhs; + return lhs; +} + +Vu32 operator - (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs -= rhs; + return lhs; +} + +Vu32 operator * (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs *= rhs; + return lhs; +} + +Vu32 operator * (Vu32 lhs, const Vu32::Scalar rhs) noexcept +{ + lhs *= rhs; + return lhs; +} + +Vu32 operator & (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs &= rhs; + return lhs; +} + +Vu32 operator | (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs |= rhs; + return lhs; +} + +Vu32 operator ^ (Vu32 lhs, const Vu32 &rhs) noexcept +{ + lhs ^= rhs; + return lhs; +} + + + +template +Vu32 operator << (Vu32 lhs, T rhs) noexcept +{ + lhs <<= rhs; + return lhs; +} + +template +Vu32 operator >> (Vu32 lhs, T rhs) noexcept +{ + lhs >>= rhs; + return lhs; +} + + + +Vu32 operator == (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + (lhs._x [0] == rhs._x [0]) ? uint32_t (-1) : 0, + (lhs._x [1] == rhs._x [1]) ? uint32_t (-1) : 0, + (lhs._x [2] == rhs._x [2]) ? uint32_t (-1) : 0, + (lhs._x [3] == rhs._x [3]) ? 
uint32_t (-1) : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return _mm_cmpeq_epi32 (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vceqq_u32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vu32 operator != (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + (lhs._x [0] != rhs._x [0]) ? uint32_t (-1) : 0, + (lhs._x [1] != rhs._x [1]) ? uint32_t (-1) : 0, + (lhs._x [2] != rhs._x [2]) ? uint32_t (-1) : 0, + (lhs._x [3] != rhs._x [3]) ? uint32_t (-1) : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto eq = _mm_cmpeq_epi32 (lhs, rhs); + return _mm_xor_si128 (eq, _mm_set1_epi32 (-1)); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmvnq_u32 (vceqq_u32 (lhs, rhs)); +#endif // fstb_ARCHI +} + + + +Vu32 operator < (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + (lhs._x [0] < rhs._x [0]) ? uint32_t (-1) : 0, + (lhs._x [1] < rhs._x [1]) ? uint32_t (-1) : 0, + (lhs._x [2] < rhs._x [2]) ? uint32_t (-1) : 0, + (lhs._x [3] < rhs._x [3]) ? uint32_t (-1) : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + return Vu32::flip_msb (_mm_cmplt_epi32 ( + Vu32::flip_msb (lhs), Vu32::flip_msb (rhs) + )); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vcltq_u32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vu32 operator <= (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + (lhs._x [0] <= rhs._x [0]) ? uint32_t (-1) : 0, + (lhs._x [1] <= rhs._x [1]) ? uint32_t (-1) : 0, + (lhs._x [2] <= rhs._x [2]) ? uint32_t (-1) : 0, + (lhs._x [3] <= rhs._x [3]) ? uint32_t (-1) : 0 + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 +# if 1 + return (lhs < rhs) | (lhs == rhs); +# else + return ~(lhs > rhs); +# endif +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vcleq_u32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vu32 operator > (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ + return (rhs < lhs); +} + + + +Vu32 operator >= (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ + return (rhs <= lhs); +} + + + +Vu32 min (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + std::min (lhs._x [0], rhs._x [0]), + std::min (lhs._x [1], rhs._x [1]), + std::min (lhs._x [2], rhs._x [2]), + std::min (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto gt = (lhs > rhs); + return _mm_or_si128 ( + _mm_and_si128 ( gt, rhs), + _mm_andnot_si128 (gt, lhs) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vminq_u32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vu32 max (const Vu32 &lhs, const Vu32 &rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + return Vu32 { + std::max (lhs._x [0], rhs._x [0]), + std::max (lhs._x [1], rhs._x [1]), + std::max (lhs._x [2], rhs._x [2]), + std::max (lhs._x [3], rhs._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto lt = (lhs < rhs); + return _mm_or_si128 ( + _mm_and_si128 ( lt, rhs), + _mm_andnot_si128 (lt, lhs) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vmaxq_u32 (lhs, rhs); +#endif // fstb_ARCHI +} + + + +Vu32 limit (const Vu32 &v, const Vu32 &mi, const Vu32 &ma) noexcept +{ + return min (max (v, mi), ma); +} + + + +Vu32 select (const Vu32 &cond, const Vu32 &v_t, const Vu32 &v_f) noexcept +{ +#if ! 
defined (fstb_HAS_SIMD) + /*** To do: implement as r = v_f ^ ((v_f ^ v_t) & cond) ***/ + return Vu32 { + (cond._x [0] & v_t._x [0]) | (~cond._x [0] & v_f._x [0]), + (cond._x [1] & v_t._x [1]) | (~cond._x [1] & v_f._x [1]), + (cond._x [2] & v_t._x [2]) | (~cond._x [2] & v_f._x [2]), + (cond._x [3] & v_t._x [3]) | (~cond._x [3] & v_f._x [3]) + }; +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto cond_1 = _mm_and_si128 (cond, v_t); + const auto cond_0 = _mm_andnot_si128 (cond, v_f); + return _mm_or_si128 (cond_0, cond_1); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return vbslq_u32 (cond, v_t, v_f); +#endif // fstb_ARCHI +} + + + +std::tuple swap_if (const Vu32 &cond, Vu32 lhs, Vu32 rhs) noexcept +{ +#if ! defined (fstb_HAS_SIMD) + if (cond._x [0] != 0) { std::swap (lhs._x [0], rhs._x [0]); } + if (cond._x [1] != 0) { std::swap (lhs._x [1], rhs._x [1]); } + if (cond._x [2] != 0) { std::swap (lhs._x [2], rhs._x [2]); } + if (cond._x [3] != 0) { std::swap (lhs._x [3], rhs._x [3]); } + return std::make_tuple (lhs, rhs); +#elif fstb_ARCHI == fstb_ARCHI_X86 + const auto inv = _mm_and_si128 (_mm_xor_si128 (lhs, rhs), cond); + return std::make_tuple ( + Vu32 (_mm_xor_si128 (lhs, inv)), + Vu32 (_mm_xor_si128 (rhs, inv)) + ); +#elif fstb_ARCHI == fstb_ARCHI_ARM + return std::make_tuple ( + Vu32 (vbslq_u32 (cond, rhs, lhs)), + Vu32 (vbslq_u32 (cond, lhs, rhs)) + ); +#endif // fstb_ARCHI +} + + + +} // namespace fstb + + + +#endif // fstb_Vu32_CODEHEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fstb/fnc.h b/src/fstb/fnc.h index 1eee4ae..438ac12 100644 --- a/src/fstb/fnc.h +++ b/src/fstb/fnc.h @@ -72,6 +72,7 @@ template inline constexpr bool is_eq (T v1, T v2, T eps = T (1e-9)) noexcept; template inline constexpr bool is_eq_rel (T v1, T v2, T tol = T (1e-6)) noexcept; +inline constexpr bool is_eq_ulp (float v1, float v2, int32_t tol = 1) noexcept; inline int get_prev_pow_2 (uint32_t x) noexcept; inline int get_next_pow_2 (uint32_t x) noexcept; inline constexpr double sinc (double x) noexcept; diff --git a/src/fstb/fnc.hpp b/src/fstb/fnc.hpp index 9ecb26b..c8b1b4c 100644 --- a/src/fstb/fnc.hpp +++ b/src/fstb/fnc.hpp @@ -565,6 +565,32 @@ constexpr bool is_eq_rel (T v1, T v2, T tol) noexcept +// Equality test with a tolerance in ULP. +// Numbers of opposite sign (excepted 0) are always evaluated as different. +// https://en.wikipedia.org/wiki/Unit_in_the_last_place +constexpr bool is_eq_ulp (float v1, float v2, int32_t tol) noexcept +{ + assert (tol >= 0); + + if ((v1 < 0) != (v2 < 0)) + { + return (v1 == v2); + } + + union Combo + { + float _f; + int32_t _i; + }; + const Combo c1 { v1 }; + const Combo c2 { v2 }; + const auto dif = std::abs (c2._i - c1._i); + + return (dif <= tol); +} + + + /* ============================================================================== Name: get_prev_pow2 @@ -1000,6 +1026,7 @@ constexpr T lerp (T v0, T v1, T p) noexcept // f(x) = ((r3 + r1) / 2 - r2) * x^2 + ((r3 - r1) / 2) * x + r2 // The points must not be aligned so the extremum exists. // It is not necessariy located between -1 and 1. 
+// The value at this point is y = r2 + 0.25 * x * (r3 - r1)
 template <class T>
 constexpr T	find_extremum_pos_parabolic (T r1, T r2, T r3) noexcept
 {
diff --git a/src/main-avs.cpp b/src/main-avs.cpp
index dd70871..b8350c0 100644
--- a/src/main-avs.cpp
+++ b/src/main-avs.cpp
@@ -1,7 +1,8 @@
-
+#if defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #define NOGDI
+#endif
 
 #include "avsutl/fnc.h"
 #include "fmtcavs/Bitdepth.h"
@@ -13,13 +14,24 @@
 #include "fmtcavs/Transfer.h"
 #include "fstb/def.h"
 
+#if defined (_WIN32)
 #include <windows.h>
+#else
+#include "avs/posix.h"
+#endif
 #include "avisynth.h"
 
 #if defined (_MSC_VER) && ! defined (NDEBUG) && defined (_DEBUG)
 	#include <crtdbg.h>
 #endif
 
+#if defined (_WIN32)
+ #define AVS_EXPORT __declspec(dllexport)
+#elif defined(__GNUC__) && __GNUC__ >= 4
+ #define AVS_EXPORT __attribute__((visibility("default")))
+#else
+ #define AVS_EXPORT
+#endif
 
 
 template <class T>
@@ -34,7 +46,7 @@ ::AVSValue __cdecl main_avs_create (::AVSValue args, void *user_data_ptr, ::IScr
 
 const ::AVS_Linkage * AVS_linkage = nullptr;
 
-extern "C" __declspec (dllexport)
+extern "C" AVS_EXPORT
 const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const ::AVS_Linkage * const vectors_ptr)
 {
 	AVS_linkage = vectors_ptr;
@@ -60,7 +72,8 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
 	env_ptr->AddFunction (fmtcavs_PRIMARIES,
 		"c" "[rs].+" "[gs].+" "[bs].+"              // 0
 		"[ws].+" "[rd].+" "[gd].+" "[bd].+"         // 4
-		"[wd].+" "[prims]s" "[primd]s" "[cpuopt]i"  // 8
+		"[wd].+" "[prims]s" "[primd]s" "[wconv]b"   // 8
+		"[cpuopt]i"                                 // 12
 		, &main_avs_create <fmtcavs::Primaries>, nullptr
 	);
 	env_ptr->AddFunction (fmtcavs_RESAMPLE,
@@ -94,7 +107,7 @@ const char * __stdcall AvisynthPluginInit3 (::IScriptEnvironment *env_ptr, const
 }
 
 
-
+#if defined (_WIN32)
 static void main_avs_dll_load (::HINSTANCE hinst)
 {
 	fstb::unused (hinst);
@@ -156,3 +169,4 @@ BOOL WINAPI DllMain (::HINSTANCE hinst, ::DWORD reason, ::LPVOID reserved_ptr)
 
 	return TRUE;
 }
+#endif
diff --git a/src/main-vs.cpp b/src/main-vs.cpp
index 2de215c..3283c3a 100644
--- a/src/main-vs.cpp
+++ b/src/main-vs.cpp
@@ -386,6 +386,7 @@ VS_EXTERNAL_API (void) VapourSynthPluginInit2 (::VSPlugin *plugin_ptr, const ::V
 		"wd:float[]:opt;"
 		"prims:data:opt;"
 		"primd:data:opt;"
+		"wconv:int:opt;"
 		"cpuopt:int:opt;"
 		, "clip:vnode;"
 		, &vsutl::Redirect <fmtc::Primaries>::create, nullptr, plugin_ptr
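
Note on the Vu32 comparison operators introduced earlier in this patch: SSE2 has no unsigned 32-bit compare, so operator < flips the most significant bit of both operands (Vu32::flip_msb, i.e. x ^ 0x80000000) and then uses the signed _mm_cmplt_epi32. Flipping the MSB maps unsigned order onto two's-complement signed order, so the signed comparison yields the correct unsigned result. Below is a minimal standalone sketch of the same trick, independent of the fstb classes; the helper name cmplt_u32 is invented for this example.

#include <cstdint>
#include <cstdio>
#include <emmintrin.h>

// Unsigned 32-bit "less than" built from the signed SSE2 comparison.
static __m128i cmplt_u32 (__m128i lhs, __m128i rhs)
{
	const __m128i sign = _mm_set1_epi32 (int32_t (0x80000000U));
	// x ^ 0x80000000 is an order-preserving map from unsigned to signed range.
	return _mm_cmplt_epi32 (_mm_xor_si128 (lhs, sign), _mm_xor_si128 (rhs, sign));
}

int main ()
{
	// Lanes (from the LSB): 0, 1, 7, 0xFFFFFFF0; every lane is compared with 8.
	const __m128i a = _mm_set_epi32 (int32_t (0xFFFFFFF0U), 7, 1, 0);
	const __m128i b = _mm_set1_epi32 (8);
	uint32_t        r [4];
	_mm_storeu_si128 (reinterpret_cast <__m128i *> (r), cmplt_u32 (a, b));
	for (int i = 0; i < 4; ++i)
	{
		// Prints FFFFFFFF for the first three lanes and 00000000 for the last:
		// 0xFFFFFFF0 is not below 8 as unsigned, although it is as signed.
		std::printf ("%08X\n", r [i]);
	}
	return 0;
}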