From 1c7fae34e643cf312990ea6b69495068c0f19741 Mon Sep 17 00:00:00 2001 From: EleonoreMizo Date: Thu, 22 Jul 2021 10:34:45 +0200 Subject: [PATCH] bitdepth: refactoring Processing code has moved to fmtcl::Dither. This should make the bitdepth functionnalities more independent of the Vapoursynth API. --- build/unix/Makefile.am | 2 + build/win/fmtconv.vcxproj | 2 + build/win/fmtconv.vcxproj.filters | 6 + src/fmtc/Bitdepth.cpp | 2550 +--------------------------- src/fmtc/Bitdepth.h | 351 +--- src/fmtc/fnc.cpp | 18 + src/fmtc/fnc.h | 3 + src/fmtcl/Dither.cpp | 2604 +++++++++++++++++++++++++++++ src/fmtcl/Dither.h | 448 +++++ src/fmtcl/SplFmt.h | 2 + src/fmtcl/SplFmt.hpp | 20 + 11 files changed, 3156 insertions(+), 2850 deletions(-) create mode 100644 src/fmtcl/Dither.cpp create mode 100644 src/fmtcl/Dither.h diff --git a/build/unix/Makefile.am b/build/unix/Makefile.am index b4b9cda..7858dcf 100644 --- a/build/unix/Makefile.am +++ b/build/unix/Makefile.am @@ -171,6 +171,8 @@ libfmtconv_la_SOURCES = \ ../../src/fmtcl/DiscreteFirCustom.h \ ../../src/fmtcl/DiscreteFirInterface.cpp \ ../../src/fmtcl/DiscreteFirInterface.h \ + ../../src/fmtcl/Dither.cpp \ + ../../src/fmtcl/Dither.h \ ../../src/fmtcl/ErrDifBuf.cpp \ ../../src/fmtcl/ErrDifBuf.h \ ../../src/fmtcl/ErrDifBuf.hpp \ diff --git a/build/win/fmtconv.vcxproj b/build/win/fmtconv.vcxproj index a46e34f..182b083 100644 --- a/build/win/fmtconv.vcxproj +++ b/build/win/fmtconv.vcxproj @@ -148,6 +148,7 @@ + @@ -321,6 +322,7 @@ + diff --git a/build/win/fmtconv.vcxproj.filters b/build/win/fmtconv.vcxproj.filters index 9348f20..793c29c 100644 --- a/build/win/fmtconv.vcxproj.filters +++ b/build/win/fmtconv.vcxproj.filters @@ -526,6 +526,9 @@ fmtcl + + fmtcl + @@ -750,6 +753,9 @@ fmtcl + + fmtcl + diff --git a/src/fmtc/Bitdepth.cpp b/src/fmtc/Bitdepth.cpp index 1c16a70..f1e0e10 100644 --- a/src/fmtc/Bitdepth.cpp +++ b/src/fmtc/Bitdepth.cpp @@ -25,11 +25,8 @@ To Public License, Version 2, as published by Sam Hocevar. 
See /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ #include "fmtc/Bitdepth.h" +#include "fmtc/fnc.h" #include "fmtc/SplFmtUtl.h" -#if (fstb_ARCHI == fstb_ARCHI_X86) - #include "fmtcl/ProxyRwSse2.h" -#endif -#include "fmtcl/VoidAndCluster.h" #include "fstb/def.h" #include "fstb/fnc.h" #include "vsutl/CpuOpt.h" @@ -39,7 +36,6 @@ To Public License, Version 2, as published by Sam Hocevar. See #include #include -#include @@ -65,37 +61,12 @@ Bitdepth::Bitdepth (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VSCo #if defined (_MSC_VER) #pragma warning (pop) #endif -, _splfmt_src (fmtcl::SplFmt_ILLEGAL) -, _splfmt_dst (fmtcl::SplFmt_ILLEGAL) -, _scale_info_arr () -, _upconv_flag (false) -, _sse2_flag (false) -, _avx2_flag (false) -, _full_range_in_flag (false) -, _full_range_out_flag (false) -, _range_def_flag (false) -, _dmode (get_arg_int (in, out, "dmode", DMode_FILTERLITE)) -, _pat_size (get_arg_int (in, out, "patsize", PAT_WIDTH)) -, _ampo (get_arg_flt (in, out, "ampo", 1.0)) -, _ampn (get_arg_flt (in, out, "ampn", 0.0)) -, _dyn_flag (get_arg_int (in, out, "dyn", 0) != 0) -, _static_noise_flag (get_arg_int (in, out, "staticnoise", 0) != 0) -, _correlated_planes_flag (get_arg_int (in, out, "corplane", 0) != 0) -, _tpdfo_flag (get_arg_int (in, out, "tpdfo", 0) != 0) -, _tpdfn_flag (get_arg_int (in, out, "tpdfn", 0) != 0) -, _errdif_flag (false) -, _simple_flag (false) -, _dither_pat_arr () -, _amp () -, _buf_factory_uptr () -, _process_seg_int_int_ptr (nullptr) -, _process_seg_flt_int_ptr (nullptr) { fstb::unused (user_data_ptr); vsutl::CpuOpt cpu_opt (*this, in, out); - _sse2_flag = cpu_opt.has_sse2 (); - _avx2_flag = cpu_opt.has_avx2 (); + const bool sse2_flag = cpu_opt.has_sse2 (); + const bool avx2_flag = cpu_opt.has_avx2 (); // Checks the input clip if (_vi_in.format == nullptr) @@ -125,7 +96,8 @@ Bitdepth::Bitdepth (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VSCo } } - _splfmt_src = 
SplFmtUtl::conv_from_vsformat (fmt_src); + const auto splfmt_src = SplFmtUtl::conv_from_vsformat (fmt_src); + const auto col_fam = conv_colfam_to_fmtcl (fmt_src); // Destination colorspace const ::VSFormat& fmt_dst = get_output_colorspace (in, out, core, fmt_src); @@ -157,7 +129,9 @@ Bitdepth::Bitdepth (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VSCo // Format is validated _vi_out.format = &fmt_dst; - _splfmt_dst = SplFmtUtl::conv_from_vsformat (fmt_dst); + const auto splfmt_dst = SplFmtUtl::conv_from_vsformat (fmt_dst); + + const int w = _vi_in.width; // May be <= 0 // Conversion-related things bool range_def_src_flag = false; @@ -172,126 +146,54 @@ Bitdepth::Bitdepth (const ::VSMap &in, ::VSMap &out, void *user_data_ptr, ::VSCo ) != 0); _range_def_flag = (range_def_src_flag || range_def_dst_flag); - // No dithering required - if ( ( fmt_src.sampleType == ::stInteger - && ( fmt_dst.sampleType == ::stFloat - || ( fmt_src.bitsPerSample <= fmt_dst.bitsPerSample - && ! _full_range_in_flag - && ! 
_full_range_out_flag))) - || ( fmt_src.sampleType == ::stFloat - && fmt_dst.sampleType == ::stFloat)) - { - _upconv_flag = true; - } - - for (int plane_index = 0; plane_index < fmt_dst.numPlanes; ++plane_index) - { - SclInf & scl_inf = _scale_info_arr [plane_index]; - vsutl::compute_fmt_mac_cst ( - scl_inf._info._gain, - scl_inf._info._add_cst, - *_vi_out.format, _full_range_out_flag, - fmt_src, _full_range_in_flag, - plane_index - ); - - if ( _upconv_flag - && fmt_src.sampleType == ::stInteger - && fmt_dst.sampleType == ::stFloat) - { - scl_inf._ptr = &scl_inf._info; - } - else - { - scl_inf._ptr = nullptr; - } - } - // Dithering parameters - if (_dmode == DMode_ROUND_ALIAS) + fmtcl::Dither::DMode dmode = static_cast ( + get_arg_int (in, out, "dmode", fmtcl::Dither::DMode_FILTERLITE) + ); + if (dmode == fmtcl::Dither::DMode_ROUND_ALIAS) { - _dmode = DMode_ROUND; + dmode = fmtcl::Dither::DMode_ROUND; } - if ( _dmode < 0 - || _dmode >= DMode_NBR_ELT) + if ( dmode < 0 + || dmode >= fmtcl::Dither::DMode_NBR_ELT) { throw_inval_arg ("invalid dmode."); } - if (_ampo < 0) + const double ampo = get_arg_flt (in, out, "ampo", 1.0); + if (ampo < 0) { throw_inval_arg ("ampo cannot be negative."); } - if (_ampn < 0) + + const double ampn = get_arg_flt (in, out, "ampn", 0.0); + if (ampn < 0) { throw_inval_arg ("ampn cannot be negative."); } - if (_pat_size < 4 || PAT_WIDTH % _pat_size != 0) + const int pat_size = + get_arg_int (in, out, "patsize", fmtcl::Dither::_max_pat_width); + if (pat_size < 4 || fmtcl::Dither::_max_pat_width % pat_size != 0) { throw_inval_arg ("Wrong value for patsize."); } - int w = _vi_in.width; - if (_vi_in.width <= 0) - { - w = MAX_UNK_WIDTH; - } - _buf_factory_uptr = - std::unique_ptr (new fmtcl::ErrDifBufFactory (w)); - _buf_pool.set_factory (*_buf_factory_uptr); - - build_dither_pat (); - - // Amplitude precalculations - - // In case of TPDF, rescales the amplitude so the power is kept constant. 
- // Sum of two noises (uncorrelated signals) -> +3 dB - if (_tpdfo_flag) - { - _ampo *= fstb::SQRT2 * 0.5; - } - if (_tpdfn_flag) - { - _ampn *= fstb::SQRT2 * 0.5; - } - - const int amp_mul = 1 << AMP_BITS; - const int ampo_i_raw = fstb::round_int (_ampo * amp_mul); - const int ampn_i_raw = fstb::round_int (_ampn * amp_mul); - _amp._o_i = std::min (ampo_i_raw, 127); - _amp._n_i = std::min (ampn_i_raw, 127); - _amp._n_f = float (_ampn * (1.0f / 256.0f)); - - _simple_flag = (ampo_i_raw == amp_mul && ampn_i_raw == 0); - - if (_errdif_flag) - { - _amp._e_i = fstb::limit ( - fstb::round_int ((_ampo - 1) * (128 << AMP_BITS)), - 0, - (2048 << AMP_BITS) - 1 - ); - _amp._e_f = fstb::limit (float (_ampo) - 1, 0.0f, 8.0f); - } - - // Processing function initialisation - if (_errdif_flag) - { - init_fnc_errdiff (); - } - else if (_dmode == DMode_QUASIRND) - { - init_fnc_quasirandom (); - } - else if (_dmode == DMode_FAST) - { - init_fnc_fast (); - } - else - { - init_fnc_ordered (); - } + const bool dyn_flag = (get_arg_int (in, out, "dyn", 0) != 0); + const bool static_noise_flag = (get_arg_int (in, out, "staticnoise", 0) != 0); + const bool correlated_planes_flag = (get_arg_int (in, out, "corplane", 0) != 0); + const bool tpdfo_flag = (get_arg_int (in, out, "tpdfo", 0) != 0); + const bool tpdfn_flag = (get_arg_int (in, out, "tpdfn", 0) != 0); + + _engine_uptr = std::make_unique ( + splfmt_src, fmt_src.bitsPerSample, _full_range_in_flag, + splfmt_dst, fmt_dst.bitsPerSample, _full_range_out_flag, + col_fam, fmt_dst.numPlanes, w, + dmode, pat_size, ampo, ampn, + dyn_flag, static_noise_flag, correlated_planes_flag, + tpdfo_flag, tpdfn_flag, + sse2_flag, avx2_flag + ); } @@ -385,30 +287,11 @@ int Bitdepth::do_process_plane (::VSFrameRef &dst, int n, int plane_index, void try { - if (_upconv_flag) - { - fmtcl::BitBltConv blitter (_sse2_flag, _avx2_flag); - blitter.bitblt ( - _splfmt_dst, _vi_out.format->bitsPerSample, - data_dst_ptr, nullptr, stride_dst, - _splfmt_src, 
_vi_in.format->bitsPerSample, - data_src_ptr, nullptr, stride_src, - w, h, - _scale_info_arr [plane_index]._ptr - ); - } - else - { - dither_plane ( - _splfmt_dst, _vi_out.format->bitsPerSample, - data_dst_ptr, stride_dst, - _splfmt_src, _vi_in.format->bitsPerSample, - data_src_ptr, stride_src, - w, h, - _scale_info_arr [plane_index]._info, - n, plane_index - ); - } + _engine_uptr->process_plane ( + data_dst_ptr, stride_dst, + data_src_ptr, stride_src, + w, h, n, plane_index + ); } catch (std::exception &e) @@ -526,2353 +409,6 @@ const ::VSFormat & Bitdepth::get_output_colorspace (const ::VSMap &in, ::VSMap & -void Bitdepth::build_dither_pat () -{ - _errdif_flag = false; - - switch (_dmode) - { - case DMode_BAYER: - build_dither_pat_bayer (); - break; - - case DMode_FILTERLITE: - case DMode_STUCKI: - case DMode_ATKINSON: - case DMode_FLOYD: - case DMode_OSTRO: - _errdif_flag = true; - _tpdfo_flag = false; - break; - - case DMode_ROUND: - case DMode_FAST: - default: - build_dither_pat_round (); - break; - - case DMode_VOIDCLUST: - build_dither_pat_void_and_cluster (_pat_size); - break; - - case DMode_QUASIRND: - // Nothing - break; - } -} - - - -void Bitdepth::build_dither_pat_round () -{ - PatData & pat_data = _dither_pat_arr [0]; - for (int y = 0; y < PAT_WIDTH; ++y) - { - for (int x = 0; x < PAT_WIDTH; ++x) - { - pat_data [y] [x] = 0; - } - } - - build_next_dither_pat (); -} - - - -void Bitdepth::build_dither_pat_bayer () -{ - assert (fstb::is_pow_2 (int (PAT_WIDTH))); - - PatData & pat_data = _dither_pat_arr [0]; - for (int y = 0; y < PAT_WIDTH; ++y) - { - for (int x = 0; x < PAT_WIDTH; ++x) - { - pat_data [y] [x] = -128; - } - } - - for (int dith_size = 2; dith_size <= PAT_WIDTH; dith_size <<= 1) - { - for (int y = 0; y < PAT_WIDTH; y += 2) - { - for (int x = 0; x < PAT_WIDTH; x += 2) - { - const int xx = (x >> 1) + (PAT_WIDTH >> 1); - const int yy = (y >> 1) + (PAT_WIDTH >> 1); - const int val = (pat_data [yy] [xx] + 128) >> 2; - pat_data [y ] [x ] = int16_t 
(val + 0-128); - pat_data [y ] [x + 1] = int16_t (val + 128-128); - pat_data [y + 1] [x ] = int16_t (val + 192-128); - pat_data [y + 1] [x + 1] = int16_t (val + 64-128); - } - } - } - - build_next_dither_pat (); -} - - - -void Bitdepth::build_dither_pat_void_and_cluster (int w) -{ - assert (PAT_WIDTH % w == 0); - fmtcl::VoidAndCluster vc_gen; - fmtcl::MatrixWrap pat_raw (w, w); - vc_gen.create_matrix (pat_raw); - - PatData & pat_data = _dither_pat_arr [0]; - const int area = w * w; - for (int y = 0; y < PAT_WIDTH; ++y) - { - for (int x = 0; x < PAT_WIDTH; ++x) - { - pat_data [y] [x] = int16_t (pat_raw (x, y) * 256 / area - 128); - } - } - - build_next_dither_pat (); -} - - - -void Bitdepth::build_next_dither_pat () -{ - if (_tpdfo_flag) - { - for (int y = 0; y < PAT_WIDTH; ++y) - { - for (int x = 0; x < PAT_WIDTH; ++x) - { - const int r = _dither_pat_arr [0] [y] [x]; - const int t = remap_tpdf_scalar (r); - _dither_pat_arr [0] [y] [x] = int16_t (t); - } - } - } - - for (int seq = 1; seq < PAT_PERIOD; ++seq) - { - const int angle = (_dyn_flag) ? 
seq & 3 : 0; - copy_dither_pat_rotate ( - _dither_pat_arr [seq], - _dither_pat_arr [0], - angle - ); - } -} - - - -void Bitdepth::copy_dither_pat_rotate (PatData &dst, const PatData &src, int angle) noexcept -{ - assert (angle >= 0); - assert (angle < 4); - - static const int sin_arr [4] = { 0, 1, 0, -1 }; - const int s = sin_arr [ angle ]; - const int c = sin_arr [(angle + 1) & 3]; - - assert (fstb::is_pow_2 (int (PAT_WIDTH))); - const int mask = PAT_WIDTH - 1; - - for (int y = 0; y < PAT_WIDTH; ++y) - { - for (int x = 0; x < PAT_WIDTH; ++x) - { - const int xs = (x * c - y * s) & mask; - const int ys = (x * s + y * c) & mask; - - dst [y] [x] = src [ys] [xs]; - } - } -} - - - -// All possible combinations -#define fmtc_Bitdepth_SPAN_INT(SETP, NAMP, NAMF, simple_flag, tpdfo_flag, tpdfn_flag, dst_res, dst_fmt, src_res, src_fmt) \ - switch ( ((simple_flag) << 7) \ - + ((tpdfo_flag) << 23) + ((tpdfn_flag) << 22) \ - + ((dst_res) << 24) + ((dst_fmt) << 16) \ - + ((src_res) << 8) + (src_fmt)) \ - { \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, 
uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 16) \ - } - -// All possible combinations using float as intermediary data -#define fmtc_Bitdepth_SPAN_FLT(SETP, NAMP, NAMF, simple_flag, tpdfo_flag, tpdfn_flag, dst_res, dst_fmt, src_res, src_fmt) \ - switch ( ((simple_flag) << 7) \ - + ((tpdfo_flag) << 23) + ((tpdfn_flag) << 22) \ - + ((dst_res) << 24) + ((dst_fmt) << 16) \ - + ((src_res) << 8) + (src_fmt)) \ - { \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT8 , uint8_t , 8) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT8 , uint8_t , 8, fmtcl::SplFmt_FLOAT, float , 32) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT8 , uint8_t , 8) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, 
fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 9, fmtcl::SplFmt_FLOAT, float , 32) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT8 , uint8_t , 8) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 10, fmtcl::SplFmt_FLOAT, float , 32) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT8 , uint8_t , 8) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 12, fmtcl::SplFmt_FLOAT, float , 32) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT8 , uint8_t , 8) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, 
uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 9) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 10) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 11) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 12) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 14) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_INT16, uint16_t, 16) \ - SETP (NAMP, NAMF, fmtcl::SplFmt_INT16, uint16_t, 16, fmtcl::SplFmt_FLOAT, float , 32) \ - } - - - -#define fmtc_Bitdepth_SET_FNC_MULTI(FCASE, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (false, false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (false, false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (false, true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (false, true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (true , false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (true , false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (true , true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - FCASE (true , true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) - -#define fmtc_Bitdepth_SET_FNC_INT_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ - + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ - _process_seg_int_int_ptr = &ThisType::process_seg_##NAMF##_int_int_cpp < \ - simple_flag, tpdfo_flag, tpdfn_flag, DT, DP, ST, SP \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_INT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_MULTI (fmtc_Bitdepth_SET_FNC_INT_CASE, \ - NAMP, NAMF, DF, DT, DP, SF, ST, SP) - -#define fmtc_Bitdepth_SET_FNC_FLT_CASE(simple_flag, tpdfo_flag, tpdfn_flag,NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ - + (DP << 24) + 
(DF << 16) + (SP << 8) + SF: \ - _process_seg_flt_int_ptr = &ThisType::process_seg_##NAMF##_flt_int_cpp < \ - simple_flag, tpdfo_flag, tpdfn_flag, DT, DP, ST \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_FLT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_MULTI (fmtc_Bitdepth_SET_FNC_FLT_CASE, \ - NAMP, NAMF, DF, DT, DP, SF, ST, SP) - -#define fmtc_Bitdepth_SET_FNC_INT_SSE2_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ - + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ - _process_seg_int_int_ptr = &ThisType::process_seg_##NAMF##_int_int_sse2 < \ - simple_flag, tpdfo_flag, tpdfn_flag, DF, DP, SF, SP \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_INT_SSE2(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_MULTI (fmtc_Bitdepth_SET_FNC_INT_SSE2_CASE, \ - NAMP, NAMF, DF, DT, DP, SF, ST, SP) - -#define fmtc_Bitdepth_SET_FNC_FLT_SSE2_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ - + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ - _process_seg_flt_int_ptr = &ThisType::process_seg_##NAMF##_flt_int_sse2 < \ - simple_flag, tpdfo_flag, tpdfn_flag, DF, DP, SF \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_FLT_SSE2(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_MULTI (fmtc_Bitdepth_SET_FNC_FLT_SSE2_CASE, \ - NAMP, NAMF, DF, DT, DP, SF, ST, SP) - - - -void Bitdepth::init_fnc_fast () noexcept -{ - const fmtcl::SplFmt dst_fmt = _splfmt_dst; - const int dst_res = _vi_out.format->bitsPerSample; - const fmtcl::SplFmt src_fmt = _splfmt_src; - const int src_res = _vi_in.format->bitsPerSample; - - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT, fast, fast, false, false, false, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT, fast, fast, false, false, false, - dst_res, dst_fmt, 
src_res, src_fmt - ) - -#if (fstb_ARCHI == fstb_ARCHI_X86) - if (_sse2_flag) - { - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT_SSE2, fast, fast, false, false, false, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT_SSE2, fast, fast, false, false, false, - dst_res, dst_fmt, src_res, src_fmt - ) - } -#endif -} - - - -void Bitdepth::init_fnc_ordered () noexcept -{ - assert (! _errdif_flag); - - const fmtcl::SplFmt dst_fmt = _splfmt_dst; - const int dst_res = _vi_out.format->bitsPerSample; - const fmtcl::SplFmt src_fmt = _splfmt_src; - const int src_res = _vi_in.format->bitsPerSample; - - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT, - ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT, - ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - -#if (fstb_ARCHI == fstb_ARCHI_X86) - if (_sse2_flag) - { - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT_SSE2, - ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT_SSE2, - ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - } -#endif -} - - - -void Bitdepth::init_fnc_quasirandom () noexcept -{ - assert (! 
_errdif_flag); - - const fmtcl::SplFmt dst_fmt = _splfmt_dst; - const int dst_res = _vi_out.format->bitsPerSample; - const fmtcl::SplFmt src_fmt = _splfmt_src; - const int src_res = _vi_in.format->bitsPerSample; - - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT, - qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT, - qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - -#if (fstb_ARCHI == fstb_ARCHI_X86) - if (_sse2_flag) - { - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_INT_SSE2, - qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_FLT_SSE2, - qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - } -#endif -} - - - -#undef fmtc_Bitdepth_SET_FNC_MULTI -#undef fmtc_Bitdepth_SET_FNC_INT_CASE -#undef fmtc_Bitdepth_SET_FNC_INT -#undef fmtc_Bitdepth_SET_FNC_FLT_CASE -#undef fmtc_Bitdepth_SET_FNC_FLT -#undef fmtc_Bitdepth_SET_FNC_INT_SSE2_CASE -#undef fmtc_Bitdepth_SET_FNC_INT_SSE2 -#undef fmtc_Bitdepth_SET_FNC_FLT_SSE2_CASE -#undef fmtc_Bitdepth_SET_FNC_FLT_SSE2 - - - -#define fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE(simple_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) \ - + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ - _process_seg_int_int_ptr = &ThisType::process_seg_errdif_int_int_cpp < \ - simple_flag, tpdfn_flag, Diffuse##NAMF \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_ERRDIF_INT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE (false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE (false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE (true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE 
(true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) - -#define fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE(simple_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - case (simple_flag << 7) + (tpdfn_flag << 22) \ - + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ - _process_seg_flt_int_ptr = &ThisType::process_seg_errdif_flt_int_cpp < \ - simple_flag, tpdfn_flag, Diffuse##NAMF \ - >; \ - break; - -#define fmtc_Bitdepth_SET_FNC_ERRDIF_FLT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE (false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE (false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE (true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE (true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) - - - -void Bitdepth::init_fnc_errdiff () noexcept -{ - assert (_errdif_flag); - - const fmtcl::SplFmt dst_fmt = _splfmt_dst; - const int dst_res = _vi_out.format->bitsPerSample; - const fmtcl::SplFmt src_fmt = _splfmt_src; - const int src_res = _vi_in.format->bitsPerSample; - - switch (_dmode) - { - case DMode_FILTERLITE: - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_INT, - errdif, FilterLite, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT, - errdif, FilterLite, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - break; - - case DMode_STUCKI: - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_INT, - errdif, Stucki, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT, - errdif, Stucki, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - break; - - case DMode_ATKINSON: - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_INT, - errdif, Atkinson, _simple_flag, false, _tpdfn_flag, - dst_res, 
dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT, - errdif, Atkinson, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - break; - - case DMode_FLOYD: - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_INT, - errdif, FloydSteinberg, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT, - errdif, FloydSteinberg, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - break; - - case DMode_OSTRO: - fmtc_Bitdepth_SPAN_INT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_INT, - errdif, Ostromoukhov, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - fmtc_Bitdepth_SPAN_FLT ( - fmtc_Bitdepth_SET_FNC_ERRDIF_FLT, - errdif, Ostromoukhov, _simple_flag, false, _tpdfn_flag, - dst_res, dst_fmt, src_res, src_fmt - ) - break; - - default: - break; - } -} - - - -#undef fmtc_Bitdepth_SET_FNC_ERRDIF_INT_CASE -#undef fmtc_Bitdepth_SET_FNC_ERRDIF_INT -#undef fmtc_Bitdepth_SET_FNC_ERRDIF_FLT_CASE -#undef fmtc_Bitdepth_SET_FNC_ERRDIF_FLT - - - -#undef fmtc_Bitdepth_SPAN_INT -#undef fmtc_Bitdepth_SPAN_FLT - - - -void Bitdepth::dither_plane (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const fmtcl::BitBltConv::ScaleInfo &scale_info, int frame_index, int plane_index) -{ - fstb::unused (dst_fmt); - assert (dst_fmt >= 0); - assert (dst_fmt < fmtcl::SplFmt_NBR_ELT); - assert (dst_res >= 8); - assert (dst_ptr != nullptr); - assert (src_fmt >= 0); - assert (src_fmt < fmtcl::SplFmt_NBR_ELT); - assert (src_res >= 8); - assert (src_ptr != nullptr); - assert (w > 0); - assert (h > 0); - - SegContext ctx; - ctx._scale_info_ptr = &scale_info; - ctx._amp = _amp; - - uint32_t rnd_state = 0; - if (! 
_correlated_planes_flag) - { - rnd_state += plane_index << 16; - } - if (_static_noise_flag) - { - rnd_state += 55555; - } - else - { - rnd_state += frame_index; - } - ctx._rnd_state = rnd_state; - - const bool sc_flag = - ( src_fmt == fmtcl::SplFmt_FLOAT - || ! fstb::is_eq (scale_info._gain * double ((uint64_t (1)) << (src_res - dst_res)), 1.0, 1e-6) - || ! fstb::is_null (scale_info._add_cst, 1e-6)); - - void (* process_ptr) (uint8_t *dst_ptr, const uint8_t *src_ptr, int w, SegContext &ctx) = - (sc_flag) - ? _process_seg_flt_int_ptr - : _process_seg_int_int_ptr; - assert (process_ptr != nullptr); - - fmtcl::ErrDifBuf * ed_buf_ptr = nullptr; - if (_errdif_flag) - { - ed_buf_ptr = _buf_pool.take_obj (); - if (ed_buf_ptr == nullptr) - { - throw_rt_err ("cannot allocate memory for temporary buffer."); - } - ed_buf_ptr->clear ((sc_flag) ? sizeof (float) : sizeof (int16_t)); - } - - switch (_dmode) - { - case DMode_BAYER: - case DMode_ROUND: - case DMode_VOIDCLUST: - { - int pat_index = 0; - if (! _correlated_planes_flag) - { - pat_index += plane_index; - } - if (_dyn_flag) - { - pat_index += frame_index; - } - pat_index &= PAT_PERIOD - 1; - const PatData& pattern = _dither_pat_arr [pat_index]; - ctx._pattern_ptr = &pattern; - } - break; - - case DMode_FAST: - // Nothing - break; - - case DMode_QUASIRND: - ctx._qrs_seed = 0; - if (_dyn_flag) - { - ctx._qrs_seed += uint32_t (frame_index * 73); - } - if (! 
_correlated_planes_flag) - { - ctx._qrs_seed += uint32_t (plane_index * 263); - } - break; - - case DMode_FILTERLITE: - case DMode_STUCKI: - case DMode_ATKINSON: - case DMode_FLOYD: - case DMode_OSTRO: - ctx._ed_buf_ptr = ed_buf_ptr; - break; - - default: - assert (false); - throw_logic_err ("unexpected dithering algorithm"); - break; - } - - for (int y = 0; y < h; ++y) - { - ctx._y = y; - - (*process_ptr) (dst_ptr, src_ptr, w, ctx); - - src_ptr += src_stride; - dst_ptr += dst_stride; - } - - if (ed_buf_ptr != nullptr) - { - _buf_pool.return_obj (*ed_buf_ptr); - ed_buf_ptr = nullptr; - } -} - - - -template -void Bitdepth::process_seg_fast_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - fstb::unused (ctx); - - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - static_assert (DIF_BITS >= 0, "This function cannot increase bidepth."); - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - for (int pos = 0; pos < w; ++pos) - { - const int s = src_n_ptr [pos]; - const int pix = s >> DIF_BITS; - dst_n_ptr [pos] = static_cast (pix); - } -} - - - -template -void Bitdepth::process_seg_fast_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - assert (ctx._scale_info_ptr != nullptr); - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - const float mul = float (ctx._scale_info_ptr->_gain); - const float add = float (ctx._scale_info_ptr->_add_cst); - const int vmax = (1 << DST_BITS) - 1; - - for (int pos = 0; pos < w; ++pos) - { - float s = float (src_n_ptr [pos]); - s = s 
* mul + add; - const int quant = fstb::conv_int_fast (s); - const int pix = fstb::limit (quant, 0, vmax); - dst_n_ptr [pos] = static_cast (pix); - } -} - - - -#if (fstb_ARCHI == fstb_ARCHI_X86) - - - -template -void Bitdepth::process_seg_fast_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - fstb::unused (ctx); - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - static_assert (DIF_BITS >= 0, "This function cannot increase bidepth."); - - typedef typename fmtcl::ProxyRwSse2 ::PtrConst::Type SrcPtr; - typedef typename fmtcl::ProxyRwSse2 ::Ptr::Type DstPtr; - SrcPtr src_n_ptr = reinterpret_cast (src_ptr); - DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); - const __m128i zero = _mm_setzero_si128 (); - const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); - - for (int pos = 0; pos < w; pos += 8) - { - const __m128i s = - fmtcl::ProxyRwSse2 ::read_i16 (src_n_ptr + pos, zero); - const __m128i pix = _mm_srli_epi16 (s, DIF_BITS); - fmtcl::ProxyRwSse2 ::write_i16 (dst_n_ptr + pos, pix, mask_lsb); - } -} - - - -template -void Bitdepth::process_seg_fast_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - assert (ctx._scale_info_ptr != nullptr); - - typedef typename fmtcl::ProxyRwSse2 ::PtrConst::Type SrcPtr; - typedef typename fmtcl::ProxyRwSse2 ::Ptr::Type DstPtr; - SrcPtr src_n_ptr = reinterpret_cast (src_ptr); - DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); - - const __m128 mul = _mm_set1_ps (float (ctx._scale_info_ptr->_gain)); - const __m128 add = _mm_set1_ps (float (ctx._scale_info_ptr->_add_cst)); - const __m128 vmax = _mm_set1_ps (float ((1 << DST_BITS) - 1)); - const __m128 zero_f = _mm_setzero_ps (); - const __m128i zero_i = _mm_setzero_si128 (); - const __m128i mask_lsb 
= _mm_set1_epi16 (0x00FF); - const __m128i sign_bit = _mm_set1_epi16 (-0x8000); - const __m128 offset = _mm_set1_ps (-32768); - - for (int pos = 0; pos < w; pos += 8) - { - __m128 s0; - __m128 s1; - fmtcl::ProxyRwSse2 ::read_flt ( - src_n_ptr + pos, s0, s1, zero_i - ); - s0 = _mm_add_ps (_mm_mul_ps (s0, mul), add); - s1 = _mm_add_ps (_mm_mul_ps (s1, mul), add); - s0 = _mm_max_ps (_mm_min_ps (s0, vmax), zero_f); - s1 = _mm_max_ps (_mm_min_ps (s1, vmax), zero_f); - fmtcl::ProxyRwSse2 ::write_flt ( - dst_n_ptr + pos, s0, s1, mask_lsb, sign_bit, offset - ); - } -} - - - -#endif - - - -template -void Bitdepth::process_seg_ord_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); - - process_seg_common_int_int_cpp < - S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS - > (dst_ptr, src_ptr, w, ctx, - [&] (int pos) - { - return pattern [pos & (PAT_WIDTH - 1)]; - } - ); -} - - - -template -void Bitdepth::process_seg_ord_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); - - process_seg_common_flt_int_cpp < - S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE - > (dst_ptr, src_ptr, w, ctx, - [&] (int pos) - { - return pattern [pos & (PAT_WIDTH - 1)]; - } - ); -} - - - -#if (fstb_ARCHI == fstb_ARCHI_X86) - - - -template -void Bitdepth::process_seg_ord_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); - - process_seg_common_int_int_sse2 < - S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT, SRC_BITS - > (dst_ptr, src_ptr, w, ctx, - [&] (int pos) - { - return _mm_load_si128 (reinterpret_cast ( - &pattern [pos & (PAT_WIDTH - 1)] - )); // 8 s16 [-128 ; +127] - } - ); -} - - - 
-template -void Bitdepth::process_seg_ord_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); - - process_seg_common_flt_int_sse2 < - S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT - > (dst_ptr, src_ptr, w, ctx, - [&] (int pos) - { - return _mm_load_si128 (reinterpret_cast ( - &pattern [pos & (PAT_WIDTH - 1)] - )); // 8 s16 [-128 ; +127] - } - ); -} - - - -#endif // fstb_ARCHI_X86 - - - -template -void Bitdepth::process_seg_qrs_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 - // Also: - // alpha1 = (curt (2) * sq (curt (3))) - // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) - constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; - constexpr double alpha2 = alpha1 * alpha1; - constexpr int sc_l2 = 16; // 16 bits of fractional values - constexpr float sc_mul = float (1 << sc_l2); - constexpr int qrs_shf = sc_l2 - 9; - constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); - uint32_t qrs_cnt = uint32_t (std::llrint ( - (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul - )); - - process_seg_common_int_int_cpp < - S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS - > (dst_ptr, src_ptr, w, ctx, - [&] (int /*pos*/) - { - const int p = (qrs_cnt >> qrs_shf) & 0x1FF; - int dith_o = (p > 255) ? 
512 - 128 - p : p - 128; // s8 - qrs_cnt += qrs_inc; - - if (TO_FLAG) - { - dith_o = remap_tpdf_scalar (dith_o); - } - - return dith_o; - } - ); -} - - - -template -void Bitdepth::process_seg_qrs_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 - // Also: - // alpha1 = (curt (2) * sq (curt (3))) - // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) - constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; - constexpr double alpha2 = alpha1 * alpha1; - constexpr int sc_l2 = 16; // 16 bits of fractional values - constexpr float sc_mul = float (1 << sc_l2); - constexpr int qrs_shf = sc_l2 - 9; - constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); - uint32_t qrs_cnt = uint32_t (std::llrint ( - (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul - )); - - process_seg_common_flt_int_cpp < - S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE - > (dst_ptr, src_ptr, w, ctx, - [&] (int /*pos*/) - { - const int p = (qrs_cnt >> qrs_shf) & 0x1FF; - int dith_o = (p > 255) ? 
512 - 128 - p : p - 128; // s8 - qrs_cnt += qrs_inc; - - if (TO_FLAG) - { - dith_o = remap_tpdf_scalar (dith_o); - } - - return dith_o; - } - ); -} - - - -#if (fstb_ARCHI == fstb_ARCHI_X86) - - - -template -void Bitdepth::process_seg_qrs_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 - // Also: - // alpha1 = (curt (2) * sq (curt (3))) - // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) - constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; - constexpr double alpha2 = alpha1 * alpha1; - constexpr int sc_l2 = 16; // 16 bits of fractional values - constexpr float sc_mul = float (1 << sc_l2); - constexpr int qrs_shf = sc_l2 - 9; - constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); - uint32_t qrs_cnt = uint32_t (std::llrint ( - (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul - )); - - const __m128i qrs_inc_4 = _mm_set1_epi32 (4 * qrs_inc); - __m128i qrs_cnt_4 = _mm_set1_epi32 (qrs_cnt); - const __m128i qrs_ofs = _mm_set_epi32 (qrs_inc * 3, qrs_inc * 2, qrs_inc, 0); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_ofs); - const __m128i qrs_msk = _mm_set1_epi32 (0x1FF); - const __m128i c128 = _mm_set1_epi16 (128); - const __m128i c256 = _mm_set1_epi16 (256); - const __m128i c384 = _mm_set1_epi16 (384); - - process_seg_common_int_int_sse2 < - S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT, SRC_BITS - > (dst_ptr, src_ptr, w, ctx, - [&] (int /*pos*/) - { - auto p03 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); - p03 = _mm_and_si128 (p03, qrs_msk); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); - auto p47 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); - p47 = _mm_and_si128 (p47, qrs_msk); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); - const auto p = _mm_packs_epi32 (p03, p47); - const auto tri_a = _mm_sub_epi16 (p, c128); - const auto tri_d = _mm_sub_epi16 (c384, p); - const auto cond = _mm_cmplt_epi16 (p, c256); - auto dith_o = 
_mm_or_si128 ( - _mm_and_si128 (cond, tri_a), - _mm_andnot_si128 (cond, tri_d) - ); - - if (TO_FLAG) - { - dith_o = remap_tpdf_vec (dith_o); - } - - return dith_o; // 8 s16 [-128 ; +127] or [-256 ; +255] - } - ); -} - - - -template -void Bitdepth::process_seg_qrs_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 - // Also: - // alpha1 = (curt (2) * sq (curt (3))) - // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) - constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; - constexpr double alpha2 = alpha1 * alpha1; - constexpr int sc_l2 = 16; // 16 bits of fractional values - constexpr float sc_mul = float (1 << sc_l2); - constexpr int qrs_shf = sc_l2 - 9; - constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); - uint32_t qrs_cnt = uint32_t (std::llrint ( - (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul - )); - - const __m128i qrs_inc_4 = _mm_set1_epi32 (4 * qrs_inc); - __m128i qrs_cnt_4 = _mm_set1_epi32 (qrs_cnt); - const __m128i qrs_ofs = _mm_set_epi32 (qrs_inc * 3, qrs_inc * 2, qrs_inc, 0); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_ofs); - const __m128i qrs_msk = _mm_set1_epi32 (0x1FF); - const __m128i c128 = _mm_set1_epi16 (128); - const __m128i c256 = _mm_set1_epi16 (256); - const __m128i c384 = _mm_set1_epi16 (384); - - process_seg_common_flt_int_sse2 < - S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT - > (dst_ptr, src_ptr, w, ctx, - [&] (int /*pos*/) - { - auto p03 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); - p03 = _mm_and_si128 (p03, qrs_msk); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); - auto p47 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); - p47 = _mm_and_si128 (p47, qrs_msk); - qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); - const auto p = _mm_packs_epi32 (p03, p47); - const auto tri_a = _mm_sub_epi16 (p, c128); - const auto tri_d = _mm_sub_epi16 (c384, p); - const auto cond = _mm_cmplt_epi16 (p, c256); - 
auto dith_o = _mm_or_si128 ( - _mm_and_si128 (cond, tri_a), - _mm_andnot_si128 (cond, tri_d) - ); - - if (TO_FLAG) - { - dith_o = remap_tpdf_vec (dith_o); - } - - return dith_o; // 8 s16 [-128 ; +127] - } - ); -} - - - -#endif // fstb_ARCHI_X86 - - - -template -void Bitdepth::process_seg_common_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - static_assert (DIF_BITS >= 1, "This function must reduce bidepth."); - - uint32_t & rnd_state = ctx._rnd_state; - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - const int rcst = 1 << (DIF_BITS - 1); - const int vmax = (1 << DST_BITS) - 1; - - const int ao = ctx._amp._o_i; // s8 - const int an = ctx._amp._n_i; // s8 - - for (int pos = 0; pos < w; ++pos) - { - const int s = src_n_ptr [pos]; - - const int dith_o = dither_fnc (pos); // s8 - int dither; - if (S_FLAG) - { - constexpr int DIT_SHFT = 8 - DIF_BITS; - dither = fstb::sshift_r (dith_o); - } - else - { - const int dith_n = generate_dith_n_scalar (rnd_state); // s8 - - constexpr int DIT_SHFT = AMP_BITS + 8 - DIF_BITS; - dither = fstb::sshift_r (dith_o * ao + dith_n * an); // s16 = s8 * s8 // s16 = s16 >> cst - } - const int sum = s + dither; // s16+ - const int quant = (sum + rcst) >> DIF_BITS; // s16 - - const int pix = fstb::limit (quant, 0, vmax); - dst_n_ptr [pos] = static_cast (pix); - } - - if (! 
S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -// int dither_fnc (int pos) noexcept; -// Must provide the ordered dither value, in [-128 ; +127] nominal range -// (doubled for TPDF) -template -void Bitdepth::process_seg_common_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - uint32_t & rnd_state = ctx._rnd_state; - - const int ao = ctx._amp._o_i; // s8 - const int an = ctx._amp._n_i; // s8 - - const float mul = float (ctx._scale_info_ptr->_gain); - const float add = float (ctx._scale_info_ptr->_add_cst); - const float qt = 1.0f / (1 << ((S_FLAG ? 0 : AMP_BITS) + 8)); - const int vmax = (1 << DST_BITS) - 1; - - for (int pos = 0; pos < w; ++pos) - { - float s = float (src_n_ptr [pos]); - s = s * mul + add; - - const int dith_o = dither_fnc (pos); // s8 - - float dither; - if (S_FLAG) - { - dither = float (dith_o) * qt; - } - else - { - const int dith_n = generate_dith_n_scalar (rnd_state); // s8 - dither = float (dith_o * ao + dith_n * an) * qt; - } - const float sum = s + dither; - const int quant = fstb::round_int (sum); - - const int pix = fstb::limit (quant, 0, vmax); - dst_n_ptr [pos] = static_cast (pix); - } - - if (! 
S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -template -int Bitdepth::generate_dith_n_scalar (uint32_t &rnd_state) noexcept -{ - generate_rnd (rnd_state); - int dith_n = int8_t (rnd_state >> 24); - if (T_FLAG) - { - generate_rnd (rnd_state); - dith_n += int8_t (rnd_state >> 24); - } - - return dith_n; -} - - - -int Bitdepth::remap_tpdf_scalar (int d) noexcept -{ - // [-128 ; 127] to [-32767 ; +32767], representing [-1 ; 1] (15-bit scale) - auto x2 = d * d; - x2 += x2; - x2 = std::min (x2, 0x7FFFF); // Saturated here because of -min * -min overflow - auto x4 = (x2 * x2 ) >> 15; - auto x8 = (x4 * x4 ) >> 15; - auto x16 = (x8 * x8 ) >> 15; - auto x32 = (x16 * x16) >> 15; - - // 15-bit scale - constexpr int c3 = 0x8000 * 5 / 8; - constexpr int c33 = 0x8000 * 3 / 8; - - // 15-bit scale - auto sum_s15 = (x2 * c3 + x32 * c33) >> 15; - const auto x_s15 = d << 8; - const auto sum_s7 = (sum_s15 * x_s15) >> (30 - 7); - - d += sum_s7; - - return d; -} - - - -#if (fstb_ARCHI == fstb_ARCHI_X86) - - - -// __m128i dither_fnc (int pos) noexcept; -// Must provide the ordered dither values as a vector of 8 x int16_t, -// in [-128 ; +127] nominal range (doubled for TPDF) -template -void Bitdepth::process_seg_common_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - static_assert (DIF_BITS >= 0, "This function cannot increase bidepth."); - - uint32_t & rnd_state = ctx._rnd_state; - - typedef typename fmtcl::ProxyRwSse2 ::PtrConst::Type SrcPtr; - typedef typename fmtcl::ProxyRwSse2 ::Ptr::Type DstPtr; - SrcPtr src_n_ptr = reinterpret_cast (src_ptr); - DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); - const __m128i zero = _mm_setzero_si128 (); - const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); - const __m128i sign_bit = _mm_set1_epi16 (-0x8000); - const 
__m128i rcst = _mm_set1_epi16 (1 << (DIF_BITS - 1)); - const __m128i vmax = _mm_set1_epi16 ((1 << DST_BITS) - 1); - - const __m128i ampo_i = _mm_set1_epi16 (int16_t (ctx._amp._o_i)); // 8 ?16 [0 ; 255] - const __m128i ampn_i = _mm_set1_epi16 (int16_t (ctx._amp._n_i)); // 8 ?16 [0 ; 255] - - for (int pos = 0; pos < w; pos += 8) - { - const __m128i s = // 8 u16 - fmtcl::ProxyRwSse2 ::read_i16 (src_n_ptr + pos, zero); - - // 8 s16 [-128 ; +127] or [-256 ; 255] - __m128i dith_o = dither_fnc (pos); - - __m128i dither; - if (S_FLAG) - { - constexpr int DIT_SHFT = 8 - DIF_BITS; - dither = _mm_srai_epi16 (dith_o, DIT_SHFT); - } - else - { - // Random generation. 8 s16 [-128 ; 127] or [-256 ; 255] - __m128i dith_n = generate_dith_n_vec (rnd_state); - - dith_o = _mm_mullo_epi16 (dith_o, ampo_i); // 8 s16 (full range) - dith_n = _mm_mullo_epi16 (dith_n, ampn_i); // 8 s16 (full range) - dither = _mm_adds_epi16 (dith_o, dith_n); // 8 s16 = s8 * s8 - - constexpr int DIT_SHFT = AMP_BITS + 8 - DIF_BITS; - dither = _mm_srai_epi16 (dither, DIT_SHFT); // 8 s16 = s16 >> cst - } - - const __m128i dith_rcst = _mm_adds_epi16 (dither, rcst); - - __m128i quant; - if (S_FLAG && SRC_BITS < 16) - { - __m128i sum = _mm_adds_epi16 (s, dith_rcst); - quant = _mm_srai_epi16 (sum, DIF_BITS); - } - else - { - __m128i sum = _mm_xor_si128 (s, sign_bit); // 8 s16 - sum = _mm_adds_epi16 (sum, dith_rcst); - sum = _mm_xor_si128 (sum, sign_bit); // 8 u16 - quant = _mm_srli_epi16 (sum, DIF_BITS); - } - - __m128i pix = quant; - if (SRC_BITS < 16) - { - pix = _mm_max_epi16 (pix, zero); - pix = _mm_min_epi16 (pix, vmax); - } - - fmtcl::ProxyRwSse2 ::write_i16 (dst_n_ptr + pos, pix, mask_lsb); - } - - if (! 
S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -template -void Bitdepth::process_seg_common_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - assert (((_mm_getcsr () >> 13) & 3) == 0); // 00 = Round to nearest (even) - - uint32_t & rnd_state = ctx._rnd_state; - - const float qt_cst = 1.0f / ( - 65536.0f * float (1 << ((S_FLAG ? 0 : AMP_BITS) + 8)) - ); - - typedef typename fmtcl::ProxyRwSse2 ::PtrConst::Type SrcPtr; - typedef typename fmtcl::ProxyRwSse2 ::Ptr::Type DstPtr; - SrcPtr src_n_ptr = reinterpret_cast (src_ptr); - DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); - const __m128 zero_f = _mm_setzero_ps (); - const __m128i zero_i = _mm_setzero_si128 (); - const __m128 mul = _mm_set1_ps (float (ctx._scale_info_ptr->_gain)); - const __m128 add = _mm_set1_ps (float (ctx._scale_info_ptr->_add_cst)); - const __m128 qt = _mm_set1_ps (qt_cst); - const __m128 vmax = _mm_set1_ps ((1 << DST_BITS) - 1); - const __m128 offset = _mm_set1_ps (-32768); - const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); - const __m128i sign_bit = _mm_set1_epi16 (-0x8000); - - const __m128i ampo_i = _mm_set1_epi16 (int16_t (ctx._amp._o_i)); // 8 ?16 [0 ; 255] - const __m128i ampn_i = _mm_set1_epi16 (int16_t (ctx._amp._n_i)); // 8 ?16 [0 ; 255] - - for (int pos = 0; pos < w; pos += 8) - { - __m128 s0; - __m128 s1; - fmtcl::ProxyRwSse2 ::read_flt ( - src_n_ptr + pos, s0, s1, zero_i - ); - s0 = _mm_add_ps (_mm_mul_ps (s0, mul), add); - s1 = _mm_add_ps (_mm_mul_ps (s1, mul), add); - - // 8 s16 [-128 ; +127] or [-256 ; 255] - __m128i dith_o = dither_fnc (pos); - - __m128i dither; - if (S_FLAG) - { - dither = dith_o; - } - else - { - // Random generation. 
8 s16 [-128 ; 127] or [-256 ; 255] - __m128i dith_n = generate_dith_n_vec (rnd_state); - - dith_o = _mm_mullo_epi16 (dith_o, ampo_i); // 8 s16 (full range) - dith_n = _mm_mullo_epi16 (dith_n, ampn_i); // 8 s16 (full range) - dither = _mm_adds_epi16 (dith_o, dith_n); // 8 s16 = s8 * s8 - } - - __m128i dither_03i = _mm_unpacklo_epi16 (zero_i, dither); // 4 s32 << 16 - __m128i dither_47i = _mm_unpackhi_epi16 (zero_i, dither); // 4 s32 << 16 - __m128 dither_03 = _mm_cvtepi32_ps (dither_03i); - __m128 dither_47 = _mm_cvtepi32_ps (dither_47i); - dither_03 = _mm_mul_ps (dither_03, qt); - dither_47 = _mm_mul_ps (dither_47, qt); - - s0 = _mm_add_ps (s0, dither_03); - s1 = _mm_add_ps (s1, dither_47); - - s0 = _mm_max_ps (_mm_min_ps (s0, vmax), zero_f); - s1 = _mm_max_ps (_mm_min_ps (s1, vmax), zero_f); - - fmtcl::ProxyRwSse2 ::write_flt ( - dst_n_ptr + pos, s0, s1, mask_lsb, sign_bit, offset - ); - } - - if (! S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -template -__m128i Bitdepth::generate_dith_n_vec (uint32_t &rnd_state) noexcept -{ - generate_rnd (rnd_state); - const uint32_t rnd_03 = rnd_state; - generate_rnd (rnd_state); - const uint32_t rnd_47 = rnd_state; - const auto zero = _mm_setzero_si128 (); - - if (T_FLAG) - { - generate_rnd (rnd_state); - const uint32_t rnd_03x = rnd_state; - generate_rnd (rnd_state); - const uint32_t rnd_47x = rnd_state; - const auto rnd_val = _mm_set_epi32 (rnd_47x, rnd_03x, rnd_47, rnd_03); - const auto c256_16 = _mm_set1_epi16 (0x100); - const auto x0 = _mm_unpacklo_epi8 (rnd_val, zero); - const auto x1 = _mm_unpackhi_epi8 (rnd_val, zero); - const auto dith_n = _mm_sub_epi16 (_mm_add_epi16 (x0, x1), c256_16); - return dith_n; // 8 s16 [-256 ; 255] - } - - else - { - const auto rnd_val = _mm_set_epi32 (0, 0, rnd_47, rnd_03); - const auto c128_16 = _mm_set1_epi16 (0x80); - const auto x0 = _mm_unpacklo_epi8 (rnd_val, zero); // 8 ?16 [0 ; 255] - const auto dith_n = _mm_sub_epi16 (x0, c128_16); - - return dith_n; // 8 s16 [-128 ; 
127] - } -} - - - -// d: 8 s16 [-128 ; 127] -// Returns: 8 s16 [-256 ; 255] -// Formula: -// f: [-1 ; +1] -> [-2 ; +2] -// x -> x + 5/8 * x^3 + 3/8 * x^33 -__m128i Bitdepth::remap_tpdf_vec (__m128i d) noexcept -{ - // [-128 ; 127] to [-32767 ; +32767], representing [-1 ; 1] (15-bit scale) - auto x2 = _mm_mullo_epi16 (d , d ); - x2 = _mm_adds_epi16 (x2 , x2 ); // Saturated here because of -min * -min overflow - auto x4 = _mm_mulhi_epi16 (x2 , x2 ); - x4 = _mm_add_epi16 (x4 , x4 ); - auto x8 = _mm_mulhi_epi16 (x4 , x4 ); - x8 = _mm_add_epi16 (x8 , x8 ); - auto x16 = _mm_mulhi_epi16 (x8 , x8 ); - x16 = _mm_add_epi16 (x16, x16); - auto x32 = _mm_mulhi_epi16 (x16, x16); - x32 = _mm_add_epi16 (x32, x32); - - // 15-bit scale - const auto c3 = _mm_set1_epi16 (0x8000 * 5 / 8); - const auto c33 = _mm_set1_epi16 (0x8000 * 3 / 8); - - // 14-bit scale - auto sum_s14 = _mm_mulhi_epi16 (x2, c3); - sum_s14 = _mm_add_epi16 (sum_s14, _mm_mulhi_epi16 (x32, c33)); - - const auto x_s15 = _mm_slli_epi16 (d, 8); - const auto sum_s13 = _mm_mulhi_epi16 (sum_s14, x_s15); - - const auto sum_s7 = _mm_srai_epi16 (sum_s13, 13 - 7); - - d = _mm_add_epi16 (d, sum_s7); - - return d; -} - - - -#endif - - - -template -void Bitdepth::process_seg_errdif_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - assert (ctx._y >= 0); - - typedef typename ERRDIF::SrcType SRC_TYPE; - typedef typename ERRDIF::DstType DST_TYPE; - constexpr int SRC_BITS = ERRDIF::SRC_BITS; - constexpr int DST_BITS = ERRDIF::DST_BITS; - - uint32_t & rnd_state = ctx._rnd_state; - fmtcl::ErrDifBuf & fstb_RESTRICT ed_buf = *ctx._ed_buf_ptr; - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - const int ae = ctx._amp._e_i; - - // Makes e1 point on the default buffer line for 
single-line - // error diffusor because we use it in prepare_next_line() - int e0 = 0; - int e1 = 0; - if (ERRDIF::NBR_ERR_LINES == 2) - { - e0 = ctx._y & 1 ; - e1 = 1 - (ctx._y & 1); - } - int16_t * err0_ptr = ed_buf.get_buf (e0); - int16_t * err1_ptr = ed_buf.get_buf (e1); - - int err_nxt0 = ed_buf.use_mem (0); - int err_nxt1 = ed_buf.use_mem (1); - - // Forward - if ((ctx._y & 1) == 0) - { - for (int x = 0; x < w; ++x) - { - int err = err_nxt0; - SRC_TYPE src_raw; - - quantize_pix_int < - S_FLAG, T_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS - > ( - dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, ctx._amp._n_i - ); - ERRDIF::template diffuse <1> ( - err, err_nxt0, err_nxt1, - err0_ptr + x, err1_ptr + x, src_raw - ); - } - ERRDIF::prepare_next_line (err1_ptr + w); - } - - // Backward - else - { - for (int x = w - 1; x >= 0; --x) - { - int err = err_nxt0; - SRC_TYPE src_raw; - - quantize_pix_int < - S_FLAG, T_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS - > ( - dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, ctx._amp._n_i - ); - ERRDIF::template diffuse <-1> ( - err, err_nxt0, err_nxt1, - err0_ptr + x, err1_ptr + x, src_raw - ); - } - ERRDIF::prepare_next_line (err1_ptr - 1); - } - - ed_buf.use_mem (0) = int16_t (err_nxt0); - ed_buf.use_mem (1) = int16_t (err_nxt1); - - if (! 
S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -template -void Bitdepth::process_seg_errdif_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept -{ - assert (dst_ptr != nullptr); - assert (src_ptr != nullptr); - assert (w > 0); - assert (ctx._y >= 0); - - typedef typename ERRDIF::SrcType SRC_TYPE; - typedef typename ERRDIF::DstType DST_TYPE; - constexpr int DST_BITS = ERRDIF::DST_BITS; - - uint32_t & rnd_state = ctx._rnd_state; - fmtcl::ErrDifBuf & fstb_RESTRICT ed_buf = *ctx._ed_buf_ptr; - - const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); - DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); - - const float mul = float (ctx._scale_info_ptr->_gain); - const float add = float (ctx._scale_info_ptr->_add_cst); - const float ae = float (ctx._amp._e_f); - const float an = float (ctx._amp._n_f); - - // Makes e1 point on the default buffer line for single-line - // error diffusor because we use it in prepare_next_line() - int e0 = 0; - int e1 = 0; - if (ERRDIF::NBR_ERR_LINES == 2) - { - e0 = ctx._y & 1 ; - e1 = 1 - (ctx._y & 1); - } - float * err0_ptr = ed_buf.get_buf (e0); - float * err1_ptr = ed_buf.get_buf (e1); - - float err_nxt0 = ed_buf.use_mem (0); - float err_nxt1 = ed_buf.use_mem (1); - - // Forward - if ((ctx._y & 1) == 0) - { - for (int x = 0; x < w; ++x) - { - float err = err_nxt0; - SRC_TYPE src_raw; - - quantize_pix_flt ( - dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, an, mul, add - ); - ERRDIF::template diffuse <1> ( - err, err_nxt0, err_nxt1, - err0_ptr + x, err1_ptr + x, src_raw - ); - } - ERRDIF::prepare_next_line (err1_ptr + w); - } - - // Backward - else - { - for (int x = w - 1; x >= 0; --x) - { - float err = err_nxt0; - SRC_TYPE src_raw; - - quantize_pix_flt ( - dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, an, mul, add - ); - ERRDIF::template diffuse <-1> ( - err, err_nxt0, err_nxt1, - err0_ptr + x, err1_ptr + 
x, src_raw - ); - } - ERRDIF::prepare_next_line (err1_ptr - 1); - } - - ed_buf.use_mem (0) = err_nxt0; - ed_buf.use_mem (1) = err_nxt1; - - if (! S_FLAG) - { - generate_rnd_eol (rnd_state); - } -} - - - -void Bitdepth::generate_rnd (uint32_t &state) noexcept -{ - state = state * uint32_t (1664525) + 1013904223; -} - - - -void Bitdepth::generate_rnd_eol (uint32_t &state) noexcept -{ - state = state * uint32_t (1103515245) + 12345; - if ((state & 0x2000000) != 0) - { - state = state * uint32_t (134775813) + 1; - } -} - - - -const Bitdepth::PatRow & Bitdepth::SegContext::extract_pattern_row () const noexcept -{ - assert (_pattern_ptr != nullptr); - assert (_y >= 0); - - return ((*_pattern_ptr) [_y & (PAT_WIDTH - 1)]); -} - - - -template -void Bitdepth::quantize_pix_int (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, int & fstb_RESTRICT err, uint32_t &rnd_state, int ampe_i, int ampn_i) noexcept -{ - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - constexpr int TMP_BITS = - (DIF_BITS < 6 && SRC_BITS < ERR_RES && DST_BITS < ERR_RES) - ? ERR_RES - : SRC_BITS; - constexpr int TMP_SHFT = TMP_BITS - SRC_BITS; - constexpr int TMP_INVS = TMP_BITS - DST_BITS; - - const int rcst = 1 << (TMP_INVS - 1); - const int vmax = (1 << DST_BITS) - 1; - - src_raw = src_ptr [x]; - const int src = src_raw << TMP_SHFT; - const int preq = src + err; - - int sum = preq; - if (! S_FLAG) - { - enum { DIT_SHFT = AMP_BITS + 8 - TMP_INVS }; // May be negative - - const int dith_n = generate_dith_n_scalar (rnd_state); // s8 - const int err_add = (err < 0) ? 
-ampe_i : ampe_i; - const int noise = - fstb::sshift_r (dith_n * ampn_i + err_add); // s16 = s8 * s8 // s16 = s16 >> cst - - sum += noise; - } - - const int quant = (sum + rcst) >> TMP_INVS; - - err = preq - (quant << TMP_INVS); - const int pix = fstb::limit (quant, 0, vmax); - - dst_ptr [x] = static_cast (pix); -} - - - -template -static inline SRC_TYPE Bitdepth_extract_src (SRC_TYPE src_read, float src) noexcept -{ - fstb::unused (src); - - return (src_read); -} - -static inline float Bitdepth_extract_src (float src_read, float src) noexcept -{ - fstb::unused (src_read); - - return (src); -} - -template -void Bitdepth::quantize_pix_flt (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, float & fstb_RESTRICT err, uint32_t &rnd_state, float ampe_f, float ampn_f, float mul, float add) noexcept -{ - const int vmax = (1 << DST_BITS) - 1; - - const SRC_TYPE src_read = src_ptr [x]; - const float src = float (src_read) * mul + add; - src_raw = Bitdepth_extract_src (src_read, src); - const float preq = src + err; - - float sum = preq; - if (! S_FLAG) - { - const int dith_n = generate_dith_n_scalar (rnd_state); // s8 - const float err_add = (err < 0) ? -ampe_f : (err > 0) ? ampe_f : 0; - const float noise = float (dith_n) * ampn_f + err_add; - - sum += noise; - } - - const int quant = fstb::round_int (sum); - - err = preq - float (quant); - const int pix = fstb::limit (quant, 0, vmax); - - dst_ptr [x] = static_cast (pix); -} - - - -// Original coefficients : 7, 3, 5, 1 -// Optimised coefficients for serpentine scan: 7, 4, 5, 0 -// Source: -// Sam Hocevar and Gary Niger, -// Reinstating Floyd-Steinberg: Improved Metrics for Quality Assessment -// of Error Diffusion Algorithms, -// Lecture Notes in Computer Science LNCS 5099, pp. 
38–45, 2008 -// (Proceedings of the International Conference on Image and Signal Processing -// ICISP 2008) ISSN 0302-9743 - -#define fmtc_Bitdepth_FS_OPTIMIZED_SERPENTINE_COEF - -template -template -void Bitdepth::DiffuseFloydSteinberg ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr, src_raw); - -#if defined (fmtc_Bitdepth_FS_OPTIMIZED_SERPENTINE_COEF) - const int e1 = 0; - const int e3 = (err * 4 + 8) >> 4; -#else - const int e1 = (err + 8) >> 4; - const int e3 = (err * 3 + 8) >> 4; -#endif - const int e5 = (err * 5 + 8) >> 4; - const int e7 = err - e1 - e3 - e5; - spread_error (e1, e3, e5, e7, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseFloydSteinberg ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr, src_raw); - -#if defined (fmtc_Bitdepth_FS_OPTIMIZED_SERPENTINE_COEF) - const float e1 = 0; - const float e3 = err * (4.0f / 16); -#else - const float e1 = err * (1.0f / 16); - const float e3 = err * (3.0f / 16); -#endif - const float e5 = err * (5.0f / 16); - const float e7 = err * (7.0f / 16); - spread_error (e1, e3, e5, e7, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseFloydSteinberg ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept -{ - // Nothing - fstb::unused (err_ptr); -} - -template -template -void Bitdepth::DiffuseFloydSteinberg ::spread_error (ET e1, ET e3, ET e5, ET e7, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept -{ - err_nxt0 = err0_ptr [DIR]; - err0_ptr [-DIR] += EB (e3); - err0_ptr [ 0] += EB (e5); - err0_ptr [ DIR] = EB (e1); - err_nxt0 += e7; -} - - - -template -template -void Bitdepth::DiffuseFilterLite ::diffuse (int err, int & fstb_RESTRICT 
err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr, src_raw); - - const int e1 = (err + 2) >> 2; - const int e2 = err - 2 * e1; - spread_error (e1, e2, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseFilterLite ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr, src_raw); - - const float e1 = err * (1.0f / 4); - const float e2 = err * (2.0f / 4); - spread_error (e1, e2, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseFilterLite ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept -{ - err_ptr [0] = EB (0); -} - -template -template -void Bitdepth::DiffuseFilterLite ::spread_error (ET e1, ET e2, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept -{ - err_nxt0 = err0_ptr [DIR]; - err0_ptr [-DIR] += EB (e1); - err0_ptr [ 0] = EB (e1); - err_nxt0 += e2; -} - - - -template -template -void Bitdepth::DiffuseStucki ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (src_raw); - - const int m = (err << 4) / 42; - const int e1 = (m + 8) >> 4; - const int e2 = (m + 4) >> 3; - const int e4 = (m + 2) >> 2; -// const int e8 = (m + 1) >> 1; - const int sum = (e1 << 1) + ((e2 + e4) << 2); - const int e8 = (err - sum + 1) >> 1; - spread_error (e1, e2, e4, e8, err_nxt0, err_nxt1, err0_ptr, err1_ptr); -} - -template -template -void Bitdepth::DiffuseStucki ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (src_raw); - - const float e1 = err * (1.0f / 42); - 
const float e2 = err * (2.0f / 42); - const float e4 = err * (4.0f / 42); - const float e8 = err * (8.0f / 42); - spread_error (e1, e2, e4, e8, err_nxt0, err_nxt1, err0_ptr, err1_ptr); -} - -template -template -void Bitdepth::DiffuseStucki ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept -{ - // Nothing - fstb::unused (err_ptr); -} - -template -template -void Bitdepth::DiffuseStucki ::spread_error (ET e1, ET e2, ET e4, ET e8, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept -{ - err_nxt0 = err_nxt1 + e8; - err_nxt1 = err1_ptr [DIR * 2] + e4; - err0_ptr [-DIR * 2] += EB (e2); - err0_ptr [-DIR ] += EB (e4); - err0_ptr [ 0 ] += EB (e8); - err0_ptr [ DIR ] += EB (e4); - err0_ptr [ DIR * 2] += EB (e2); - err1_ptr [-DIR * 2] += EB (e1); - err1_ptr [-DIR ] += EB (e2); - err1_ptr [ 0 ] += EB (e4); - err1_ptr [ DIR ] += EB (e2); - err1_ptr [ DIR * 2] = EB (e1); -} - - - -template -template -void Bitdepth::DiffuseAtkinson ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (src_raw); - - const int e1 = (err + 4) >> 3; - spread_error (e1, err_nxt0, err_nxt1, err0_ptr, err1_ptr); -} - -template -template -void Bitdepth::DiffuseAtkinson ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (src_raw); - - const float e1 = err * (1.0f / 8); - spread_error (e1, err_nxt0, err_nxt1, err0_ptr, err1_ptr); -} - -template -template -void Bitdepth::DiffuseAtkinson ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept -{ - err_ptr [0] = EB (0); -} - -template -template -void Bitdepth::DiffuseAtkinson ::spread_error (ET e1, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * 
fstb_RESTRICT err1_ptr) noexcept -{ - err_nxt0 = err_nxt1 + e1; - err_nxt1 = err1_ptr [2 * DIR] + e1; - err0_ptr [-DIR] += EB (e1); - err0_ptr [ 0] += EB (e1); - err0_ptr [+DIR] += EB (e1); - err1_ptr [ 0] = EB (e1); -} - - - -template -template -int Bitdepth::DiffuseOstromoukhovBase2 ::get_index (SRC_TYPE src_raw) noexcept -{ - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - - return (fstb::sshift_l < - int, - DiffuseOstromoukhovBase::T_BITS - DIF_BITS - > (src_raw) & DiffuseOstromoukhovBase::T_MASK); -} - -template -int Bitdepth::DiffuseOstromoukhovBase2 ::get_index (float src_raw) noexcept -{ - return - fstb::round_int (src_raw * DiffuseOstromoukhovBase::T_LEN) - & DiffuseOstromoukhovBase::T_MASK; -} - -// Victor Ostromoukhov, -// A Simple and Efficient Error-Diffusion Algorithm -// Proceedings of SIGGRAPH 2001, in ACM Computer Graphics, -// Annual Conference Series, pp. 567-572, 2001. -// Not optimised at all -template -template -void Bitdepth::DiffuseOstromoukhov ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr); - - constexpr int DIF_BITS = SRC_BITS - DST_BITS; - - const int index = fstb::sshift_l < - int, - DiffuseOstromoukhov::T_BITS - DIF_BITS - > (src_raw) & DiffuseOstromoukhov::T_MASK; - const typename ThisType::TableEntry & fstb_RESTRICT te = ThisType::_table [index]; - const int d = te._sum; - - const int e1 = err * te._c0 / d; - const int e2 = err * te._c1 / d; - const int e3 = err - e1 - e2; - - spread_error (e1, e2, e3, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseOstromoukhov ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept -{ - fstb::unused (err_nxt1, err1_ptr); - - const int index = DiffuseOstromoukhov::get_index (src_raw); - const 
typename ThisType::TableEntry & fstb_RESTRICT te = ThisType::_table [index]; - const float invd = te._inv_sum; - - const float e1 = err * float (te._c0) * invd; - const float e2 = err * float (te._c1) * invd; - const float e3 = err - e1 - e2; - - spread_error (e1, e2, e3, err_nxt0, err0_ptr); -} - -template -template -void Bitdepth::DiffuseOstromoukhov ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept -{ - err_ptr [0] = EB (0); -} - -template -template -void Bitdepth::DiffuseOstromoukhov ::spread_error (ET e1, ET e2, ET e3, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept -{ - err_nxt0 = err0_ptr [DIR]; - err0_ptr [-DIR] += EB (e2); - err0_ptr [ 0] = EB (e3); - err_nxt0 += e1; -} - - - -const std::array < - Bitdepth::DiffuseOstromoukhovBase::TableEntry, - Bitdepth::DiffuseOstromoukhovBase::T_LEN -> Bitdepth::DiffuseOstromoukhovBase::_table = -{{ - { 13, 0, 5, 18, 1.0f / 18 }, - { 13, 0, 5, 18, 1.0f / 18 }, - { 21, 0, 10, 31, 1.0f / 31 }, - { 7, 0, 4, 11, 1.0f / 11 }, - { 8, 0, 5, 13, 1.0f / 13 }, - { 47, 3, 28, 78, 1.0f / 78 }, - { 23, 3, 13, 39, 1.0f / 39 }, - { 15, 3, 8, 26, 1.0f / 26 }, - { 22, 6, 11, 39, 1.0f / 39 }, - { 43, 15, 20, 78, 1.0f / 78 }, - { 7, 3, 3, 13, 1.0f / 13 }, - { 501, 224, 211, 936, 1.0f / 936 }, - { 249, 116, 103, 468, 1.0f / 468 }, - { 165, 80, 67, 312, 1.0f / 312 }, - { 123, 62, 49, 234, 1.0f / 234 }, - { 489, 256, 191, 936, 1.0f / 936 }, - { 81, 44, 31, 156, 1.0f / 156 }, - { 483, 272, 181, 936, 1.0f / 936 }, - { 60, 35, 22, 117, 1.0f / 117 }, - { 53, 32, 19, 104, 1.0f / 104 }, - { 237, 148, 83, 468, 1.0f / 468 }, - { 471, 304, 161, 936, 1.0f / 936 }, - { 3, 2, 1, 6, 1.0f / 6 }, - { 481, 314, 185, 980, 1.0f / 980 }, - { 354, 226, 155, 735, 1.0f / 735 }, - { 1389, 866, 685, 2940, 1.0f / 2940 }, - { 227, 138, 125, 490, 1.0f / 490 }, - { 267, 158, 163, 588, 1.0f / 588 }, - { 327, 188, 220, 735, 1.0f / 735 }, - { 61, 34, 45, 140, 1.0f / 140 }, - { 627, 338, 505, 1470, 1.0f / 1470 }, - { 1227, 638, 1075, 2940, 1.0f / 
2940 }, - - { 20, 10, 19, 49, 1.0f / 49 }, - { 1937, 1000, 1767, 4704, 1.0f / 4704 }, - { 977, 520, 855, 2352, 1.0f / 2352 }, - { 657, 360, 551, 1568, 1.0f / 1568 }, - { 71, 40, 57, 168, 1.0f / 168 }, - { 2005, 1160, 1539, 4704, 1.0f / 4704 }, - { 337, 200, 247, 784, 1.0f / 784 }, - { 2039, 1240, 1425, 4704, 1.0f / 4704 }, - { 257, 160, 171, 588, 1.0f / 588 }, - { 691, 440, 437, 1568, 1.0f / 1568 }, - { 1045, 680, 627, 2352, 1.0f / 2352 }, - { 301, 200, 171, 672, 1.0f / 672 }, - { 177, 120, 95, 392, 1.0f / 392 }, - { 2141, 1480, 1083, 4704, 1.0f / 4704 }, - { 1079, 760, 513, 2352, 1.0f / 2352 }, - { 725, 520, 323, 1568, 1.0f / 1568 }, - { 137, 100, 57, 294, 1.0f / 294 }, - { 2209, 1640, 855, 4704, 1.0f / 4704 }, - { 53, 40, 19, 112, 1.0f / 112 }, - { 2243, 1720, 741, 4704, 1.0f / 4704 }, - { 565, 440, 171, 1176, 1.0f / 1176 }, - { 759, 600, 209, 1568, 1.0f / 1568 }, - { 1147, 920, 285, 2352, 1.0f / 2352 }, - { 2311, 1880, 513, 4704, 1.0f / 4704 }, - { 97, 80, 19, 196, 1.0f / 196 }, - { 335, 280, 57, 672, 1.0f / 672 }, - { 1181, 1000, 171, 2352, 1.0f / 2352 }, - { 793, 680, 95, 1568, 1.0f / 1568 }, - { 599, 520, 57, 1176, 1.0f / 1176 }, - { 2413, 2120, 171, 4704, 1.0f / 4704 }, - { 405, 360, 19, 784, 1.0f / 784 }, - { 2447, 2200, 57, 4704, 1.0f / 4704 }, - - { 11, 10, 0, 21, 1.0f / 21 }, - { 158, 151, 3, 312, 1.0f / 312 }, - { 178, 179, 7, 364, 1.0f / 364 }, - { 1030, 1091, 63, 2184, 1.0f / 2184 }, - { 248, 277, 21, 546, 1.0f / 546 }, - { 318, 375, 35, 728, 1.0f / 728 }, - { 458, 571, 63, 1092, 1.0f / 1092 }, - { 878, 1159, 147, 2184, 1.0f / 2184 }, - { 5, 7, 1, 13, 1.0f / 13 }, - { 172, 181, 37, 390, 1.0f / 390 }, - { 97, 76, 22, 195, 1.0f / 195 }, - { 72, 41, 17, 130, 1.0f / 130 }, - { 119, 47, 29, 195, 1.0f / 195 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, 
- { 65, 18, 17, 100, 1.0f / 100 }, - { 95, 29, 26, 150, 1.0f / 150 }, - { 185, 62, 53, 300, 1.0f / 300 }, - { 30, 11, 9, 50, 1.0f / 50 }, - { 35, 14, 11, 60, 1.0f / 60 }, - { 85, 37, 28, 150, 1.0f / 150 }, - { 55, 26, 19, 100, 1.0f / 100 }, - { 80, 41, 29, 150, 1.0f / 150 }, - { 155, 86, 59, 300, 1.0f / 300 }, - { 5, 3, 2, 10, 1.0f / 10 }, - - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 305, 176, 119, 600, 1.0f / 600 }, - { 155, 86, 59, 300, 1.0f / 300 }, - { 105, 56, 39, 200, 1.0f / 200 }, - { 80, 41, 29, 150, 1.0f / 150 }, - { 65, 32, 23, 120, 1.0f / 120 }, - { 55, 26, 19, 100, 1.0f / 100 }, - { 335, 152, 113, 600, 1.0f / 600 }, - { 85, 37, 28, 150, 1.0f / 150 }, - { 115, 48, 37, 200, 1.0f / 200 }, - { 35, 14, 11, 60, 1.0f / 60 }, - { 355, 136, 109, 600, 1.0f / 600 }, - { 30, 11, 9, 50, 1.0f / 50 }, - { 365, 128, 107, 600, 1.0f / 600 }, - { 185, 62, 53, 300, 1.0f / 300 }, - { 25, 8, 7, 40, 1.0f / 40 }, - { 95, 29, 26, 150, 1.0f / 150 }, - { 385, 112, 103, 600, 1.0f / 600 }, - { 65, 18, 17, 100, 1.0f / 100 }, - { 395, 104, 101, 600, 1.0f / 600 }, - { 4, 1, 1, 6, 1.0f / 6 }, - - // Symetric - { 4, 1, 1, 6, 1.0f / 6 }, - { 395, 104, 101, 600, 1.0f / 600 }, - { 65, 18, 17, 100, 1.0f / 100 }, - { 385, 112, 103, 600, 1.0f / 600 }, - { 95, 29, 26, 150, 1.0f / 150 }, - { 25, 8, 7, 40, 1.0f / 40 }, - { 185, 62, 53, 300, 1.0f / 300 }, - { 365, 128, 107, 600, 1.0f / 600 }, - { 30, 11, 9, 50, 1.0f / 50 }, - { 355, 136, 109, 600, 1.0f / 600 }, - { 35, 14, 11, 60, 1.0f / 60 }, - { 115, 48, 37, 200, 1.0f / 200 }, - { 85, 37, 28, 150, 1.0f / 150 }, - { 335, 152, 113, 600, 1.0f / 600 }, - { 55, 26, 19, 100, 1.0f / 100 }, - { 65, 32, 23, 120, 1.0f / 120 }, - { 80, 41, 
29, 150, 1.0f / 150 }, - { 105, 56, 39, 200, 1.0f / 200 }, - { 155, 86, 59, 300, 1.0f / 300 }, - { 305, 176, 119, 600, 1.0f / 600 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - { 5, 3, 2, 10, 1.0f / 10 }, - - { 5, 3, 2, 10, 1.0f / 10 }, - { 155, 86, 59, 300, 1.0f / 300 }, - { 80, 41, 29, 150, 1.0f / 150 }, - { 55, 26, 19, 100, 1.0f / 100 }, - { 85, 37, 28, 150, 1.0f / 150 }, - { 35, 14, 11, 60, 1.0f / 60 }, - { 30, 11, 9, 50, 1.0f / 50 }, - { 185, 62, 53, 300, 1.0f / 300 }, - { 95, 29, 26, 150, 1.0f / 150 }, - { 65, 18, 17, 100, 1.0f / 100 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 4, 1, 1, 6, 1.0f / 6 }, - { 119, 47, 29, 195, 1.0f / 195 }, - { 72, 41, 17, 130, 1.0f / 130 }, - { 97, 76, 22, 195, 1.0f / 195 }, - { 172, 181, 37, 390, 1.0f / 390 }, - { 5, 7, 1, 13, 1.0f / 13 }, - { 878, 1159, 147, 2184, 1.0f / 2184 }, - { 458, 571, 63, 1092, 1.0f / 1092 }, - { 318, 375, 35, 728, 1.0f / 728 }, - { 248, 277, 21, 546, 1.0f / 546 }, - { 1030, 1091, 63, 2184, 1.0f / 2184 }, - { 178, 179, 7, 364, 1.0f / 364 }, - { 158, 151, 3, 312, 1.0f / 312 }, - { 11, 10, 0, 21, 1.0f / 21 }, - - { 2447, 2200, 57, 4704, 1.0f / 4704 }, - { 405, 360, 19, 784, 1.0f / 784 }, - { 2413, 2120, 171, 4704, 1.0f / 4704 }, - { 599, 520, 57, 1176, 1.0f / 1176 }, - { 793, 680, 95, 1568, 1.0f / 1568 }, - { 1181, 1000, 171, 2352, 1.0f / 2352 }, - { 335, 280, 57, 672, 1.0f / 672 }, - { 97, 80, 19, 196, 1.0f / 196 }, - { 2311, 1880, 513, 4704, 1.0f / 4704 }, - { 1147, 920, 285, 2352, 1.0f / 2352 }, - { 759, 600, 209, 1568, 1.0f / 1568 }, - 
{ 565, 440, 171, 1176, 1.0f / 1176 }, - { 2243, 1720, 741, 4704, 1.0f / 4704 }, - { 53, 40, 19, 112, 1.0f / 112 }, - { 2209, 1640, 855, 4704, 1.0f / 4704 }, - { 137, 100, 57, 294, 1.0f / 294 }, - { 725, 520, 323, 1568, 1.0f / 1568 }, - { 1079, 760, 513, 2352, 1.0f / 2352 }, - { 2141, 1480, 1083, 4704, 1.0f / 4704 }, - { 177, 120, 95, 392, 1.0f / 392 }, - { 301, 200, 171, 672, 1.0f / 672 }, - { 1045, 680, 627, 2352, 1.0f / 2352 }, - { 691, 440, 437, 1568, 1.0f / 1568 }, - { 257, 160, 171, 588, 1.0f / 588 }, - { 2039, 1240, 1425, 4704, 1.0f / 4704 }, - { 337, 200, 247, 784, 1.0f / 784 }, - { 2005, 1160, 1539, 4704, 1.0f / 4704 }, - { 71, 40, 57, 168, 1.0f / 168 }, - { 657, 360, 551, 1568, 1.0f / 1568 }, - { 977, 520, 855, 2352, 1.0f / 2352 }, - { 1937, 1000, 1767, 4704, 1.0f / 4704 }, - { 20, 10, 19, 49, 1.0f / 49 }, - - { 1227, 638, 1075, 2940, 1.0f / 2940 }, - { 627, 338, 505, 1470, 1.0f / 1470 }, - { 61, 34, 45, 140, 1.0f / 140 }, - { 327, 188, 220, 735, 1.0f / 735 }, - { 267, 158, 163, 588, 1.0f / 588 }, - { 227, 138, 125, 490, 1.0f / 490 }, - { 1389, 866, 685, 2940, 1.0f / 2940 }, - { 354, 226, 155, 735, 1.0f / 735 }, - { 481, 314, 185, 980, 1.0f / 980 }, - { 3, 2, 1, 6, 1.0f / 6 }, - { 471, 304, 161, 936, 1.0f / 936 }, - { 237, 148, 83, 468, 1.0f / 468 }, - { 53, 32, 19, 104, 1.0f / 104 }, - { 60, 35, 22, 117, 1.0f / 117 }, - { 483, 272, 181, 936, 1.0f / 936 }, - { 81, 44, 31, 156, 1.0f / 156 }, - { 489, 256, 191, 936, 1.0f / 936 }, - { 123, 62, 49, 234, 1.0f / 234 }, - { 165, 80, 67, 312, 1.0f / 312 }, - { 249, 116, 103, 468, 1.0f / 468 }, - { 501, 224, 211, 936, 1.0f / 936 }, - { 7, 3, 3, 13, 1.0f / 13 }, - { 43, 15, 20, 78, 1.0f / 78 }, - { 22, 6, 11, 39, 1.0f / 39 }, - { 15, 3, 8, 26, 1.0f / 26 }, - { 23, 3, 13, 39, 1.0f / 39 }, - { 47, 3, 28, 78, 1.0f / 78 }, - { 8, 0, 5, 13, 1.0f / 13 }, - { 7, 0, 4, 11, 1.0f / 11 }, - { 21, 0, 10, 31, 1.0f / 31 }, - { 13, 0, 5, 18, 1.0f / 18 }, - { 13, 0, 5, 18, 1.0f / 18 } -}}; - - - } // namespace fmtc diff --git 
a/src/fmtc/Bitdepth.h b/src/fmtc/Bitdepth.h index 823610c..3bdcfdd 100644 --- a/src/fmtc/Bitdepth.h +++ b/src/fmtc/Bitdepth.h @@ -27,22 +27,14 @@ To Public License, Version 2, as published by Sam Hocevar. See /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ -#include "conc/ObjPool.h" -#include "fmtcl/BitBltConv.h" -#include "fmtcl/ErrDifBuf.h" -#include "fmtcl/ErrDifBufFactory.h" -#include "fmtcl/SplFmt.h" -#include "fstb/def.h" -#include "fstb/ArrayAlign.h" +#include "fmtcl/Dither.h" #include "vsutl/FilterBase.h" #include "vsutl/NodeRefSPtr.h" #include "vsutl/PlaneProcCbInterface.h" #include "vsutl/PlaneProcessor.h" #include "VapourSynth.h" -#include #include -#include @@ -85,308 +77,9 @@ class Bitdepth private: - static const int MAX_NBR_PLANES = 3; - static const int PAT_WIDTH = 32; // Number of pixels for halftone dithering - static const int PAT_PERIOD = 4; // Must be a power of 2 (because cycled with & as modulo) - static const int AMP_BITS = 5; // Bit depth of the amplitude fractionnal part. The whole thing is 7 bits, and we need a few bits for the integer part. - static const int ERR_RES = 24; // Resolution (bits) of the temporary data for error diffusion when source bitdepth is not high enough (relative to the destination bitdepth) to guarantee an accurate error diffusion. - static const int MAX_UNK_WIDTH = 65536; // Maximum width (pixels) for variable formats - - enum DMode - { - DMode_ROUND_ALIAS = -1, - DMode_BAYER = 0, - DMode_ROUND, // 1 - DMode_FAST, // 2 - DMode_FILTERLITE, // 3 - DMode_STUCKI, // 4 - DMode_ATKINSON, // 5 - DMode_FLOYD, // 6 - DMode_OSTRO, // 7 - DMode_VOIDCLUST, // 8 - DMode_QUASIRND, // 9 - - DMode_NBR_ELT - }; - - class SclInf - { - public: - fmtcl::BitBltConv::ScaleInfo - _info; - fmtcl::BitBltConv::ScaleInfo * // 0 if _info is not used. 
- _ptr = 0; - }; - - typedef int16_t PatRow [PAT_WIDTH]; // Contains data in [-128; +127] - typedef PatRow PatData [PAT_WIDTH]; // [y] [x] - typedef fstb::ArrayAlign PatDataArray; - - class AmpInfo - { - public: - int _o_i = 0; // [0 ; 127], 1.0 = 1 << AMP_BITS - int _n_i = 0; // [0 ; 127], 1.0 = 1 << AMP_BITS - int _e_i = 0; // [0 ; 2047], 1.0 = 256 - float _e_f = 0; - float _n_f = 0; - }; - - class SegContext - { - public: - inline const PatRow & - extract_pattern_row () const noexcept; - const PatData* _pattern_ptr = nullptr; // Ordered dithering - uint32_t _rnd_state = 0; // Anything excepted fast mode - const fmtcl::BitBltConv::ScaleInfo * // Float processing - _scale_info_ptr = nullptr; - fmtcl::ErrDifBuf * // Error diffusion - _ed_buf_ptr = nullptr; - int _y = -1; // Ordered dithering and error diffusion - uint32_t _qrs_seed = 0; // For the quasirandom sequences - AmpInfo _amp; - }; - const ::VSFormat & get_output_colorspace (const ::VSMap &in, ::VSMap &out, ::VSCore &core, const ::VSFormat &fmt_src) const; - void build_dither_pat (); - void build_dither_pat_round (); - void build_dither_pat_bayer (); - void build_dither_pat_void_and_cluster (int w); - void build_next_dither_pat (); - void copy_dither_pat_rotate (PatData &dst, const PatData &src, int angle) noexcept; - void init_fnc_fast () noexcept; - void init_fnc_ordered () noexcept; - void init_fnc_quasirandom () noexcept; - void init_fnc_errdiff () noexcept; - - void dither_plane (fmtcl::SplFmt dst_fmt, int dst_res, uint8_t *dst_ptr, int dst_stride, fmtcl::SplFmt src_fmt, int src_res, const uint8_t *src_ptr, int src_stride, int w, int h, const fmtcl::BitBltConv::ScaleInfo &scale_info, int frame_index, int plane_index); - - template - static void process_seg_fast_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &/*ctx*/) noexcept; - template - static void process_seg_fast_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT 
src_ptr, int w, SegContext &ctx) noexcept; - -#if (fstb_ARCHI == fstb_ARCHI_X86) - template - static void process_seg_fast_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &/*ctx*/) noexcept; - template - static void process_seg_fast_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; -#endif - - template - static void process_seg_ord_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - template - static void process_seg_ord_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - -#if (fstb_ARCHI == fstb_ARCHI_X86) - template - static void process_seg_ord_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - template - static void process_seg_ord_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; -#endif - - template - static void process_seg_qrs_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - template - static void process_seg_qrs_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - -#if (fstb_ARCHI == fstb_ARCHI_X86) - template - static void process_seg_qrs_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - template - static void process_seg_qrs_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; -#endif - - template - static fstb_FORCEINLINE void - process_seg_common_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; - 
template - static fstb_FORCEINLINE void - process_seg_common_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; - template - static fstb_FORCEINLINE int - generate_dith_n_scalar (uint32_t &rnd_state) noexcept; - static fstb_FORCEINLINE int - remap_tpdf_scalar (int d) noexcept; - -#if (fstb_ARCHI == fstb_ARCHI_X86) - template - static fstb_FORCEINLINE void - process_seg_common_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; - template - static fstb_FORCEINLINE void - process_seg_common_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; - template - static fstb_FORCEINLINE __m128i - generate_dith_n_vec (uint32_t &rnd_state) noexcept; - static fstb_FORCEINLINE __m128i - remap_tpdf_vec (__m128i d) noexcept; -#endif - - template - static void process_seg_errdif_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - template - static void process_seg_errdif_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; - - static inline void - generate_rnd (uint32_t &state) noexcept; - static inline void - generate_rnd_eol (uint32_t &state) noexcept; - - template - static inline void - quantize_pix_int (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, int & fstb_RESTRICT err, uint32_t &rnd_state, int ampe_i, int ampn_i) noexcept; - template - static inline void - quantize_pix_flt (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, float & fstb_RESTRICT err, uint32_t &rnd_state, float ampe_f, float ampn_f, float mul, float add) noexcept; - - template - class ErrDifAddParam - { - public: - typedef DT 
DstType; - typedef ST SrcType; - static const int DST_BITS = DB; - static const int SRC_BITS = SB; - static const int NBR_ERR_LINES = EL; - }; - - template - class DiffuseFloydSteinberg - : public ErrDifAddParam - { - public: - template - static fstb_FORCEINLINE void - diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; - private: - template - static fstb_FORCEINLINE void - spread_error (ET e1, ET e3, ET e5, ET e7, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; - }; - - template - class DiffuseFilterLite - : public ErrDifAddParam - { - public: - template - static fstb_FORCEINLINE void - diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; - private: - template - static fstb_FORCEINLINE void - spread_error (ET e1, ET e2, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; - }; - - template - class DiffuseStucki - : public ErrDifAddParam - { - public: - template - static fstb_FORCEINLINE void - diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) 
noexcept; - template - static fstb_FORCEINLINE void - diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; - private: - template - static fstb_FORCEINLINE void - spread_error (ET e1, ET e2, ET e4, ET e8, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept; - }; - - template - class DiffuseAtkinson - : public ErrDifAddParam - { - public: - template - static fstb_FORCEINLINE void - diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; - private: - template - static fstb_FORCEINLINE void - spread_error (ET e1, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept; - }; - - class DiffuseOstromoukhovBase - { - public: - struct TableEntry - { - int _c0; - int _c1; - int _c2; // Actually not used - int _sum; - float _inv_sum; // Possible optimization: store 1/_c0 and 1/_c1 instead of this field. 
- }; - static const int T_BITS = 8; - static const int T_LEN = 1 << T_BITS; - static const int T_MASK = T_LEN - 1; - - static const std::array - _table; - }; - - template - class DiffuseOstromoukhovBase2 - : public DiffuseOstromoukhovBase - { - public: - template - static inline int - get_index (SRC_TYPE src_raw) noexcept; - static inline int - get_index (float src_raw) noexcept; - }; - - template - class DiffuseOstromoukhov - : public ErrDifAddParam - , public DiffuseOstromoukhovBase2 - { - public: - typedef DiffuseOstromoukhov ThisType; - template - static fstb_FORCEINLINE void - diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; - template - static fstb_FORCEINLINE void - prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; - private: - template - static fstb_FORCEINLINE void - spread_error (ET e1, ET e2, ET e3, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; - }; - vsutl::NodeRefSPtr _clip_src_sptr; const ::VSVideoInfo @@ -395,41 +88,13 @@ class Bitdepth vsutl::PlaneProcessor _plane_processor; - fmtcl::SplFmt _splfmt_src; - fmtcl::SplFmt _splfmt_dst; - - std::array - _scale_info_arr; - bool _upconv_flag; - bool _sse2_flag; - bool _avx2_flag; - bool _full_range_in_flag; - bool _full_range_out_flag; - bool _range_def_flag; - - int _dmode; - int _pat_size; // Must be a divisor of PAT_WIDTH - double _ampo; - double _ampn; - bool _dyn_flag; - bool _static_noise_flag; - bool _correlated_planes_flag; - bool _tpdfo_flag; - bool _tpdfn_flag; - - bool _errdif_flag; // Indicates a dithering method using error diffusion. 
- bool _simple_flag; // Simplified implementation for ampo == 1 and ampn == 0 - PatDataArray _dither_pat_arr; // Contains levels for ordered dithering - - AmpInfo _amp; - - conc::ObjPool - _buf_pool; - std::unique_ptr - _buf_factory_uptr; - - void (* _process_seg_int_int_ptr) (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx); - void (* _process_seg_flt_int_ptr) (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx); + + bool _full_range_in_flag = false; + bool _full_range_out_flag = false; + bool _range_def_flag = false; + + std::unique_ptr + _engine_uptr; diff --git a/src/fmtc/fnc.cpp b/src/fmtc/fnc.cpp index 96d2343..d7641ee 100644 --- a/src/fmtc/fnc.cpp +++ b/src/fmtc/fnc.cpp @@ -133,6 +133,24 @@ fmtcl::SplFmt conv_vsfmt_to_splfmt (const ::VSFormat &fmt) +fmtcl::ColorFamily conv_colfam_to_fmtcl (const ::VSFormat &fmt) +{ + auto col_fam = fmtcl::ColorFamily_INVALID; + + switch (fmt.colorFamily) + { + case cmGray: col_fam = fmtcl::ColorFamily_GRAY; break; + case cmRGB: col_fam = fmtcl::ColorFamily_RGB; break; + case cmYUV: col_fam = fmtcl::ColorFamily_YUV; break; + case cmYCoCg: col_fam = fmtcl::ColorFamily_YCGCO; break; + default: assert (false); break; + } + + return col_fam; +} + + + void prepare_matrix_coef (const vsutl::FilterBase &filter, fmtcl::MatrixProc &mat_proc, const fmtcl::Mat4 &mat_main, const ::VSFormat &fmt_dst, bool full_range_dst_flag, const ::VSFormat &fmt_src, bool full_range_src_flag, fmtcl::ColorSpaceH265 csp_out, int plane_out) { const bool int_proc_flag = diff --git a/src/fmtc/fnc.h b/src/fmtc/fnc.h index 5403664..aab7d3c 100644 --- a/src/fmtc/fnc.h +++ b/src/fmtc/fnc.h @@ -27,6 +27,7 @@ To Public License, Version 2, as published by Sam Hocevar. 
See /*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ +#include "fmtcl/ColorFamily.h" #include "fmtcl/ColorSpaceH265.h" #include "fmtcl/SplFmt.h" @@ -50,6 +51,8 @@ namespace fmtc fmtcl::SplFmt conv_vsfmt_to_splfmt (const ::VSFormat &fmt); +fmtcl::ColorFamily + conv_colfam_to_fmtcl (const ::VSFormat &fmt); void prepare_matrix_coef (const vsutl::FilterBase &filter, fmtcl::MatrixProc &mat_proc, const fmtcl::Mat4 &mat_main, const ::VSFormat &fmt_dst, bool full_range_dst_flag, const ::VSFormat &fmt_src, bool full_range_src_flag, fmtcl::ColorSpaceH265 csp_out = fmtcl::ColorSpaceH265_UNSPECIFIED, int plane_out = -1); diff --git a/src/fmtcl/Dither.cpp b/src/fmtcl/Dither.cpp new file mode 100644 index 0000000..a4ea526 --- /dev/null +++ b/src/fmtcl/Dither.cpp @@ -0,0 +1,2604 @@ +/***************************************************************************** + + Dither.cpp + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. 
+ +*Tab=3***********************************************************************/ + + + +#if defined (_MSC_VER) + #pragma warning (1 : 4130 4223 4705 4706) + #pragma warning (4 : 4355 4786 4800) +#endif + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "fstb/def.h" + +#include "fmtcl/Dither.h" +#include "fmtcl/fnc.h" +#if (fstb_ARCHI == fstb_ARCHI_X86) + #include "fmtcl/ProxyRwSse2.h" +#endif +#include "fmtcl/VoidAndCluster.h" +#include "fstb/fnc.h" + +#include +#include + +#include +#include + + + +namespace fmtcl +{ + + + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +constexpr int Dither::_max_nbr_planes; +constexpr int Dither::_max_pat_width; + + + +Dither::Dither ( + SplFmt src_fmt, int src_res, bool src_full_flag, + SplFmt dst_fmt, int dst_res, bool dst_full_flag, + ColorFamily color_fam, int nbr_planes, int w, + DMode dmode, int pat_size, double ampo, double ampn, + bool dyn_flag, bool static_noise_flag, bool correlated_planes_flag, + bool tpdfo_flag, bool tpdfn_flag, + bool sse2_flag, bool avx2_flag +) +: _splfmt_src (src_fmt) +, _splfmt_dst (dst_fmt) +, _src_res (src_res) +, _dst_res (dst_res) +, _full_range_in_flag (src_full_flag) +, _full_range_out_flag (dst_full_flag) +, _color_fam (color_fam) +, _nbr_planes (nbr_planes) +, _sse2_flag (sse2_flag) +, _avx2_flag (avx2_flag) +, _dmode (dmode) +, _pat_size (pat_size) +, _ampo (ampo) +, _ampn (ampn) +, _dyn_flag (dyn_flag) +, _static_noise_flag (static_noise_flag) +, _correlated_planes_flag (correlated_planes_flag) +, _tpdfo_flag (tpdfo_flag) +, _tpdfn_flag (tpdfn_flag) +{ + assert (src_fmt >= 0); + assert (src_fmt < SplFmt::SplFmt_NBR_ELT); + assert (dst_fmt >= 0); + assert (dst_fmt < SplFmt::SplFmt_NBR_ELT); + assert ( + (SplFmt_is_int (src_fmt) && ( ( src_res >= 8 + && src_res <= 12) + || src_res == 14 + || src_res == 16)) + || (SplFmt_is_float (src_fmt) && src_res == 32 ) + ); + assert ( + (SplFmt_is_int (dst_fmt) && 
( ( dst_res >= 8 + && dst_res <= 10) + || dst_res == 12 + || dst_res == 16)) + || (SplFmt_is_float (dst_fmt) && dst_res == 32 ) + ); + assert (color_fam >= 0); + assert (color_fam < ColorFamily_NBR_ELT); + assert (nbr_planes > 0); + assert (nbr_planes < _max_nbr_planes); + assert (dmode >= 0); + assert (dmode < DMode_NBR_ELT); + assert (pat_size >= 4); + assert (_max_pat_width % pat_size == 0); + assert (ampo >= 0); + assert (ampn >= 0); + + // No dithering required + if ( ( SplFmt_is_int (src_fmt) + && ( SplFmt_is_float (dst_fmt) + || ( _src_res <= _dst_res + && ! _full_range_in_flag + && ! _full_range_out_flag))) + || ( SplFmt_is_float (src_fmt) + && SplFmt_is_float (dst_fmt))) + { + _upconv_flag = true; + } + + // Data scaling parameters + for (int plane_index = 0; plane_index < nbr_planes; ++plane_index) + { + SclInf & scl_inf = _scale_info_arr [plane_index]; + fmtcl::compute_fmt_mac_cst ( + scl_inf._info._gain, + scl_inf._info._add_cst, + dst_fmt, dst_res, color_fam, _full_range_out_flag, + src_fmt, src_res, color_fam, _full_range_in_flag, + plane_index + ); + + scl_inf._ptr = nullptr; + if ( _upconv_flag + && SplFmt_is_int (src_fmt) + && SplFmt_is_float (dst_fmt)) + { + scl_inf._ptr = &scl_inf._info; + } + } + + if (w <= 0) + { + w = _max_unk_width; + } + _buf_factory_uptr = std::make_unique (w); + _buf_pool.set_factory (*_buf_factory_uptr); + + build_dither_pat (); + + // Amplitude precalculations + + // In case of TPDF, rescales the amplitude so the power is kept constant. 
+ // Sum of two noises (uncorrelated signals) -> +3 dB + if (_tpdfo_flag) + { + ampo *= fstb::SQRT2 * 0.5; + } + if (_tpdfn_flag) + { + ampn *= fstb::SQRT2 * 0.5; + } + + const int amp_mul = 1 << _amp_bits; + const int ampo_i_raw = fstb::round_int (ampo * amp_mul); + const int ampn_i_raw = fstb::round_int (ampn * amp_mul); + _amp._o_i = std::min (ampo_i_raw, 127); + _amp._n_i = std::min (ampn_i_raw, 127); + _amp._n_f = float (ampn * (1.f / 256.f)); + + if (_errdif_flag) + { + _amp._e_i = fstb::limit ( + fstb::round_int ((ampo - 1) * (128 << _amp_bits)), + 0, + (2048 << _amp_bits) - 1 + ); + _amp._e_f = fstb::limit (float (ampo - 1), 0.f, 8.f); + } + + _simple_flag = (ampo_i_raw == amp_mul && ampn_i_raw == 0); + + // Processing function initialisation + if (_errdif_flag) + { + init_fnc_errdiff (); + } + else if (_dmode == DMode_QUASIRND) + { + init_fnc_quasirandom (); + } + else if (_dmode == DMode_FAST) + { + init_fnc_fast (); + } + else + { + init_fnc_ordered (); + } +} + + + +void Dither::process_plane (uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, int frame_index, int plane_index) +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (h > 0); + assert (frame_index >= 0); + assert (plane_index >= 0); + assert (plane_index < _max_nbr_planes); + + if (_upconv_flag) + { + BitBltConv blitter (_sse2_flag, _avx2_flag); + blitter.bitblt ( + _splfmt_dst, _dst_res, dst_ptr, nullptr, dst_stride, + _splfmt_src, _src_res, src_ptr, nullptr, src_stride, + w, h, + _scale_info_arr [plane_index]._ptr + ); + } + else + { + dither_plane ( + dst_ptr, dst_stride, + src_ptr, src_stride, + w, h, + _scale_info_arr [plane_index]._info, + frame_index, plane_index + ); + } +} + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + + + +constexpr int Dither::_pat_period; +constexpr int 
Dither::_amp_bits; +constexpr int Dither::_err_res; +constexpr int Dither::_max_unk_width; + + + +void Dither::build_dither_pat () +{ + _errdif_flag = false; + + switch (_dmode) + { + case DMode_BAYER: + build_dither_pat_bayer (); + break; + + case DMode_FILTERLITE: + case DMode_STUCKI: + case DMode_ATKINSON: + case DMode_FLOYD: + case DMode_OSTRO: + _errdif_flag = true; + _tpdfo_flag = false; + break; + + case DMode_ROUND: + case DMode_FAST: + default: + build_dither_pat_round (); + break; + + case DMode_VOIDCLUST: + build_dither_pat_void_and_cluster (_pat_size); + break; + + case DMode_QUASIRND: + // Nothing + break; + } +} + + + +void Dither::build_dither_pat_round () +{ + PatData & pat_data = _dither_pat_arr [0]; + for (int y = 0; y < _max_pat_width; ++y) + { + for (int x = 0; x < _max_pat_width; ++x) + { + pat_data [y] [x] = 0; + } + } + + build_next_dither_pat (); +} + + + +void Dither::build_dither_pat_bayer () +{ + assert (fstb::is_pow_2 (int (_max_pat_width))); + + PatData & pat_data = _dither_pat_arr [0]; + for (int y = 0; y < _max_pat_width; ++y) + { + for (int x = 0; x < _max_pat_width; ++x) + { + pat_data [y] [x] = -128; + } + } + + for (int dith_size = 2; dith_size <= _max_pat_width; dith_size <<= 1) + { + for (int y = 0; y < _max_pat_width; y += 2) + { + for (int x = 0; x < _max_pat_width; x += 2) + { + const int xx = (x >> 1) + (_max_pat_width >> 1); + const int yy = (y >> 1) + (_max_pat_width >> 1); + const int val = (pat_data [yy] [xx] + 128) >> 2; + pat_data [y ] [x ] = int16_t (val + 0-128); + pat_data [y ] [x + 1] = int16_t (val + 128-128); + pat_data [y + 1] [x ] = int16_t (val + 192-128); + pat_data [y + 1] [x + 1] = int16_t (val + 64-128); + } + } + } + + build_next_dither_pat (); +} + + + +void Dither::build_dither_pat_void_and_cluster (int w) +{ + assert (_max_pat_width % w == 0); + VoidAndCluster vc_gen; + MatrixWrap pat_raw (w, w); + vc_gen.create_matrix (pat_raw); + + PatData & pat_data = _dither_pat_arr [0]; + const int area = w * w; + 
for (int y = 0; y < _max_pat_width; ++y) + { + for (int x = 0; x < _max_pat_width; ++x) + { + pat_data [y] [x] = int16_t (pat_raw (x, y) * 256 / area - 128); + } + } + + build_next_dither_pat (); +} + + + +void Dither::build_next_dither_pat () +{ + if (_tpdfo_flag) + { + for (int y = 0; y < _max_pat_width; ++y) + { + for (int x = 0; x < _max_pat_width; ++x) + { + const int r = _dither_pat_arr [0] [y] [x]; + const int t = remap_tpdf_scalar (r); + _dither_pat_arr [0] [y] [x] = int16_t (t); + } + } + } + + for (int seq = 1; seq < _pat_period; ++seq) + { + const int angle = (_dyn_flag) ? seq & 3 : 0; + copy_dither_pat_rotate ( + _dither_pat_arr [seq], + _dither_pat_arr [0], + angle + ); + } +} + + + +void Dither::copy_dither_pat_rotate (PatData &dst, const PatData &src, int angle) noexcept +{ + assert (angle >= 0); + assert (angle < 4); + + static const int sin_arr [4] = { 0, 1, 0, -1 }; + const int s = sin_arr [ angle ]; + const int c = sin_arr [(angle + 1) & 3]; + + assert (fstb::is_pow_2 (int (_max_pat_width))); + const int mask = _max_pat_width - 1; + + for (int y = 0; y < _max_pat_width; ++y) + { + for (int x = 0; x < _max_pat_width; ++x) + { + const int xs = (x * c - y * s) & mask; + const int ys = (x * s + y * c) & mask; + + dst [y] [x] = src [ys] [xs]; + } + } +} + + + +// All possible combinations +#define fmtcl_Dither_SPAN_INT(SETP, NAMP, NAMF, simple_flag, tpdfo_flag, tpdfn_flag, dst_res, dst_fmt, src_res, src_fmt) \ + switch ( ((simple_flag) << 7) \ + + ((tpdfo_flag) << 23) + ((tpdfn_flag) << 22) \ + + ((dst_res) << 24) + ((dst_fmt) << 16) \ + + ((src_res) << 8) + (src_fmt)) \ + { \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 14) 
\ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 16) \ + } + +// All possible combinations using float as intermediary data +#define fmtcl_Dither_SPAN_FLT(SETP, NAMP, NAMF, simple_flag, tpdfo_flag, tpdfn_flag, dst_res, dst_fmt, src_res, src_fmt) \ + switch ( ((simple_flag) << 7) \ + + ((tpdfo_flag) << 23) + ((tpdfn_flag) << 22) \ + + ((dst_res) << 24) + ((dst_fmt) << 16) \ + + ((src_res) << 8) + (src_fmt)) \ + { \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT8 , uint8_t , 8) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT8 , uint8_t , 8, SplFmt_FLOAT, float , 32) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT8 , uint8_t , 8) \ + SETP (NAMP, 
NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 9, SplFmt_FLOAT, float , 32) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT8 , uint8_t , 8) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 10, SplFmt_FLOAT, float , 32) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT8 , uint8_t , 8) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 10) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 12, SplFmt_FLOAT, float , 32) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT8 , uint8_t , 8) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 9) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 10) \ + 
SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 11) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 12) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 14) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_INT16, uint16_t, 16) \ + SETP (NAMP, NAMF, SplFmt_INT16, uint16_t, 16, SplFmt_FLOAT, float , 32) \ + } + + + +#define fmtcl_Dither_SET_FNC_MULTI(FCASE, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (false, false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (false, false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (false, true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (false, true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (true , false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (true , false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (true , true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + FCASE (true , true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) + +#define fmtcl_Dither_SET_FNC_INT_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_int_int_ptr = &process_seg_##NAMF##_int_int_cpp < \ + simple_flag, tpdfo_flag, tpdfn_flag, DT, DP, ST, SP \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_INT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_MULTI (fmtcl_Dither_SET_FNC_INT_CASE, \ + NAMP, NAMF, DF, DT, DP, SF, ST, SP) + +#define fmtcl_Dither_SET_FNC_FLT_CASE(simple_flag, tpdfo_flag, tpdfn_flag,NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_flt_int_ptr = &process_seg_##NAMF##_flt_int_cpp < \ + simple_flag, tpdfo_flag, tpdfn_flag, DT, DP, ST \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_FLT(NAMP, NAMF, DF, DT, DP, SF, 
ST, SP) \ + fmtcl_Dither_SET_FNC_MULTI (fmtcl_Dither_SET_FNC_FLT_CASE, \ + NAMP, NAMF, DF, DT, DP, SF, ST, SP) + +#define fmtcl_Dither_SET_FNC_INT_SSE2_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_int_int_ptr = &process_seg_##NAMF##_int_int_sse2 < \ + simple_flag, tpdfo_flag, tpdfn_flag, DF, DP, SF, SP \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_INT_SSE2(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_MULTI (fmtcl_Dither_SET_FNC_INT_SSE2_CASE, \ + NAMP, NAMF, DF, DT, DP, SF, ST, SP) + +#define fmtcl_Dither_SET_FNC_FLT_SSE2_CASE(simple_flag, tpdfo_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case (simple_flag << 7) + (tpdfn_flag << 22) + (tpdfo_flag << 23) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_flt_int_ptr = &process_seg_##NAMF##_flt_int_sse2 < \ + simple_flag, tpdfo_flag, tpdfn_flag, DF, DP, SF \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_FLT_SSE2(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_MULTI (fmtcl_Dither_SET_FNC_FLT_SSE2_CASE, \ + NAMP, NAMF, DF, DT, DP, SF, ST, SP) + + + +void Dither::init_fnc_fast () noexcept +{ + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT, fast, fast, false, false, false, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT, fast, fast, false, false, false, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + +#if (fstb_ARCHI == fstb_ARCHI_X86) + if (_sse2_flag) + { + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT_SSE2, fast, fast, false, false, false, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT_SSE2, fast, fast, false, false, false, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + } +#endif +} + + + +void Dither::init_fnc_ordered () noexcept +{ + assert (! 
_errdif_flag); + + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT, + ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT, + ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + +#if (fstb_ARCHI == fstb_ARCHI_X86) + if (_sse2_flag) + { + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT_SSE2, + ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT_SSE2, + ord, ord, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + } +#endif +} + + + +void Dither::init_fnc_quasirandom () noexcept +{ + assert (! _errdif_flag); + + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT, + qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT, + qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + +#if (fstb_ARCHI == fstb_ARCHI_X86) + if (_sse2_flag) + { + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_INT_SSE2, + qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_FLT_SSE2, + qrs, qrs, _simple_flag, _tpdfo_flag, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + } +#endif +} + + + +#undef fmtcl_Dither_SET_FNC_MULTI +#undef fmtcl_Dither_SET_FNC_INT_CASE +#undef fmtcl_Dither_SET_FNC_INT +#undef fmtcl_Dither_SET_FNC_FLT_CASE +#undef fmtcl_Dither_SET_FNC_FLT +#undef fmtcl_Dither_SET_FNC_INT_SSE2_CASE +#undef fmtcl_Dither_SET_FNC_INT_SSE2 +#undef fmtcl_Dither_SET_FNC_FLT_SSE2_CASE +#undef fmtcl_Dither_SET_FNC_FLT_SSE2 + + + +#define fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE(simple_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case 
(simple_flag << 7) + (tpdfn_flag << 22) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_int_int_ptr = &process_seg_errdif_int_int_cpp < \ + simple_flag, tpdfn_flag, Diffuse##NAMF \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_ERRDIF_INT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE (false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE (false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE (true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE (true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) + +#define fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE(simple_flag, tpdfn_flag, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + case (simple_flag << 7) + (tpdfn_flag << 22) \ + + (DP << 24) + (DF << 16) + (SP << 8) + SF: \ + _process_seg_flt_int_ptr = &process_seg_errdif_flt_int_cpp < \ + simple_flag, tpdfn_flag, Diffuse##NAMF \ + >; \ + break; + +#define fmtcl_Dither_SET_FNC_ERRDIF_FLT(NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE (false, false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE (false, true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE (true , false, NAMP, NAMF, DF, DT, DP, SF, ST, SP) \ + fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE (true , true , NAMP, NAMF, DF, DT, DP, SF, ST, SP) + + + +void Dither::init_fnc_errdiff () noexcept +{ + assert (_errdif_flag); + + switch (_dmode) + { + case DMode_FILTERLITE: + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_ERRDIF_INT, + errdif, FilterLite, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_ERRDIF_FLT, + errdif, FilterLite, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + break; + + case DMode_STUCKI: + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_ERRDIF_INT, + 
errdif, Stucki, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_ERRDIF_FLT, + errdif, Stucki, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + break; + + case DMode_ATKINSON: + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_ERRDIF_INT, + errdif, Atkinson, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_ERRDIF_FLT, + errdif, Atkinson, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + break; + + case DMode_FLOYD: + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_ERRDIF_INT, + errdif, FloydSteinberg, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_ERRDIF_FLT, + errdif, FloydSteinberg, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + break; + + case DMode_OSTRO: + fmtcl_Dither_SPAN_INT ( + fmtcl_Dither_SET_FNC_ERRDIF_INT, + errdif, Ostromoukhov, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + fmtcl_Dither_SPAN_FLT ( + fmtcl_Dither_SET_FNC_ERRDIF_FLT, + errdif, Ostromoukhov, _simple_flag, false, _tpdfn_flag, + _dst_res, _splfmt_dst, _src_res, _splfmt_src + ) + break; + + default: + break; + } +} + + + +#undef fmtcl_Dither_SET_FNC_ERRDIF_INT_CASE +#undef fmtcl_Dither_SET_FNC_ERRDIF_INT +#undef fmtcl_Dither_SET_FNC_ERRDIF_FLT_CASE +#undef fmtcl_Dither_SET_FNC_ERRDIF_FLT + + + +#undef fmtcl_Dither_SPAN_INT +#undef fmtcl_Dither_SPAN_FLT + + + +void Dither::dither_plane (uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const BitBltConv::ScaleInfo &scale_info, int frame_index, int plane_index) +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (h > 0); + + SegContext ctx; + ctx._scale_info_ptr = 
&scale_info; + ctx._amp = _amp; + + uint32_t rnd_state = 0; + if (! _correlated_planes_flag) + { + rnd_state += plane_index << 16; + } + if (_static_noise_flag) + { + rnd_state += 55555; + } + else + { + rnd_state += frame_index; + } + ctx._rnd_state = rnd_state; + + const bool sc_flag = + ( _splfmt_src == SplFmt_FLOAT + || ! fstb::is_eq (scale_info._gain * double ((uint64_t (1)) << (_src_res - _dst_res)), 1.0, 1e-6) + || ! fstb::is_null (scale_info._add_cst, 1e-6)); + + void (* process_ptr) (uint8_t *dst_ptr, const uint8_t *src_ptr, int w, SegContext &ctx) = + (sc_flag) + ? _process_seg_flt_int_ptr + : _process_seg_int_int_ptr; + assert (process_ptr != nullptr); + + ErrDifBuf * ed_buf_ptr = nullptr; + if (_errdif_flag) + { + ed_buf_ptr = _buf_pool.take_obj (); + if (ed_buf_ptr == nullptr) + { + throw std::runtime_error ( + "cannot allocate memory for temporary buffer." + ); + } + ed_buf_ptr->clear ((sc_flag) ? sizeof (float) : sizeof (int16_t)); + } + + switch (_dmode) + { + case DMode_BAYER: + case DMode_ROUND: + case DMode_VOIDCLUST: + { + int pat_index = 0; + if (! _correlated_planes_flag) + { + pat_index += plane_index; + } + if (_dyn_flag) + { + pat_index += frame_index; + } + pat_index &= _pat_period - 1; + const PatData& pattern = _dither_pat_arr [pat_index]; + ctx._pattern_ptr = &pattern; + } + break; + + case DMode_FAST: + // Nothing + break; + + case DMode_QUASIRND: + ctx._qrs_seed = 0; + if (_dyn_flag) + { + ctx._qrs_seed += uint32_t (frame_index * 73); + } + if (! 
_correlated_planes_flag) + { + ctx._qrs_seed += uint32_t (plane_index * 263); + } + break; + + case DMode_FILTERLITE: + case DMode_STUCKI: + case DMode_ATKINSON: + case DMode_FLOYD: + case DMode_OSTRO: + ctx._ed_buf_ptr = ed_buf_ptr; + break; + + default: + assert (false); + throw std::logic_error ("unexpected dithering algorithm"); + break; + } + + for (int y = 0; y < h; ++y) + { + ctx._y = y; + + (*process_ptr) (dst_ptr, src_ptr, w, ctx); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } + + if (ed_buf_ptr != nullptr) + { + _buf_pool.return_obj (*ed_buf_ptr); + ed_buf_ptr = nullptr; + } +} + + + +template +void Dither::process_seg_fast_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + fstb::unused (ctx); + + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + + constexpr int dif_bits = SRC_BITS - DST_BITS; + static_assert (dif_bits >= 0, "This function cannot increase bidepth."); + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + for (int pos = 0; pos < w; ++pos) + { + const int s = src_n_ptr [pos]; + const int pix = s >> dif_bits; + dst_n_ptr [pos] = static_cast (pix); + } +} + + + +template +void Dither::process_seg_fast_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (ctx._scale_info_ptr != nullptr); + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + const float mul = float (ctx._scale_info_ptr->_gain); + const float add = float (ctx._scale_info_ptr->_add_cst); + const int vmax = (1 << DST_BITS) - 1; + + for (int pos = 0; pos < w; ++pos) + { + float s = float (src_n_ptr [pos]); + s 
= s * mul + add; + const int quant = fstb::conv_int_fast (s); + const int pix = fstb::limit (quant, 0, vmax); + dst_n_ptr [pos] = static_cast (pix); + } +} + + + +#if (fstb_ARCHI == fstb_ARCHI_X86) + + + +template +void Dither::process_seg_fast_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + fstb::unused (ctx); + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + + constexpr int dif_bits = SRC_BITS - DST_BITS; + static_assert (dif_bits >= 0, "This function cannot increase bidepth."); + + typedef typename ProxyRwSse2 ::PtrConst::Type SrcPtr; + typedef typename ProxyRwSse2 ::Ptr::Type DstPtr; + SrcPtr src_n_ptr = reinterpret_cast (src_ptr); + DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); + const __m128i zero = _mm_setzero_si128 (); + const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); + + for (int pos = 0; pos < w; pos += 8) + { + const __m128i s = + ProxyRwSse2 ::read_i16 (src_n_ptr + pos, zero); + const __m128i pix = _mm_srli_epi16 (s, dif_bits); + ProxyRwSse2 ::write_i16 (dst_n_ptr + pos, pix, mask_lsb); + } +} + + + +template +void Dither::process_seg_fast_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (ctx._scale_info_ptr != nullptr); + + typedef typename ProxyRwSse2 ::PtrConst::Type SrcPtr; + typedef typename ProxyRwSse2 ::Ptr::Type DstPtr; + SrcPtr src_n_ptr = reinterpret_cast (src_ptr); + DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); + + const __m128 mul = _mm_set1_ps (float (ctx._scale_info_ptr->_gain)); + const __m128 add = _mm_set1_ps (float (ctx._scale_info_ptr->_add_cst)); + const __m128 vmax = _mm_set1_ps (float ((1 << DST_BITS) - 1)); + const __m128 zero_f = _mm_setzero_ps (); + const __m128i zero_i = _mm_setzero_si128 (); + const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); + const 
__m128i sign_bit = _mm_set1_epi16 (-0x8000); + const __m128 offset = _mm_set1_ps (-32768); + + for (int pos = 0; pos < w; pos += 8) + { + __m128 s0; + __m128 s1; + ProxyRwSse2 ::read_flt ( + src_n_ptr + pos, s0, s1, zero_i + ); + s0 = _mm_add_ps (_mm_mul_ps (s0, mul), add); + s1 = _mm_add_ps (_mm_mul_ps (s1, mul), add); + s0 = _mm_max_ps (_mm_min_ps (s0, vmax), zero_f); + s1 = _mm_max_ps (_mm_min_ps (s1, vmax), zero_f); + ProxyRwSse2 ::write_flt ( + dst_n_ptr + pos, s0, s1, mask_lsb, sign_bit, offset + ); + } +} + + + +#endif + + + +template +void Dither::process_seg_ord_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); + + process_seg_common_int_int_cpp < + S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS + > (dst_ptr, src_ptr, w, ctx, + [&] (int pos) + { + return pattern [pos & (_max_pat_width - 1)]; + } + ); +} + + + +template +void Dither::process_seg_ord_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); + + process_seg_common_flt_int_cpp < + S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE + > (dst_ptr, src_ptr, w, ctx, + [&] (int pos) + { + return pattern [pos & (_max_pat_width - 1)]; + } + ); +} + + + +#if (fstb_ARCHI == fstb_ARCHI_X86) + + + +template +void Dither::process_seg_ord_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); + + process_seg_common_int_int_sse2 < + S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT, SRC_BITS + > (dst_ptr, src_ptr, w, ctx, + [&] (int pos) + { + return _mm_load_si128 (reinterpret_cast ( + &pattern [pos & (_max_pat_width - 1)] + )); // 8 s16 [-128 ; +127] + } + ); +} + + + +template +void 
Dither::process_seg_ord_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + const PatRow & fstb_RESTRICT pattern = ctx.extract_pattern_row (); + + process_seg_common_flt_int_sse2 < + S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT + > (dst_ptr, src_ptr, w, ctx, + [&] (int pos) + { + return _mm_load_si128 (reinterpret_cast ( + &pattern [pos & (_max_pat_width - 1)] + )); // 8 s16 [-128 ; +127] + } + ); +} + + + +#endif // fstb_ARCHI_X86 + + + +template +void Dither::process_seg_qrs_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 + // Also: + // alpha1 = (curt (2) * sq (curt (3))) + // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) + constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; + constexpr double alpha2 = alpha1 * alpha1; + constexpr int sc_l2 = 16; // 16 bits of fractional values + constexpr float sc_mul = float (1 << sc_l2); + constexpr int qrs_shf = sc_l2 - 9; + constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); + uint32_t qrs_cnt = uint32_t (std::llrint ( + (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul + )); + + process_seg_common_int_int_cpp < + S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE, SRC_BITS + > (dst_ptr, src_ptr, w, ctx, + [&] (int /*pos*/) + { + const int p = (qrs_cnt >> qrs_shf) & 0x1FF; + int dith_o = (p > 255) ? 
512 - 128 - p : p - 128; // s8 + qrs_cnt += qrs_inc; + + if (TO_FLAG) + { + dith_o = remap_tpdf_scalar (dith_o); + } + + return dith_o; + } + ); +} + + + +template +void Dither::process_seg_qrs_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 + // Also: + // alpha1 = (curt (2) * sq (curt (3))) + // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) + constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; + constexpr double alpha2 = alpha1 * alpha1; + constexpr int sc_l2 = 16; // 16 bits of fractional values + constexpr float sc_mul = float (1 << sc_l2); + constexpr int qrs_shf = sc_l2 - 9; + constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); + uint32_t qrs_cnt = uint32_t (std::llrint ( + (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul + )); + + process_seg_common_flt_int_cpp < + S_FLAG, TN_FLAG, DST_TYPE, DST_BITS, SRC_TYPE + > (dst_ptr, src_ptr, w, ctx, + [&] (int /*pos*/) + { + const int p = (qrs_cnt >> qrs_shf) & 0x1FF; + int dith_o = (p > 255) ? 
512 - 128 - p : p - 128; // s8 + qrs_cnt += qrs_inc; + + if (TO_FLAG) + { + dith_o = remap_tpdf_scalar (dith_o); + } + + return dith_o; + } + ); +} + + + +#if (fstb_ARCHI == fstb_ARCHI_X86) + + + +template +void Dither::process_seg_qrs_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 + // Also: + // alpha1 = (curt (2) * sq (curt (3))) + // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) + constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; + constexpr double alpha2 = alpha1 * alpha1; + constexpr int sc_l2 = 16; // 16 bits of fractional values + constexpr float sc_mul = float (1 << sc_l2); + constexpr int qrs_shf = sc_l2 - 9; + constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); + uint32_t qrs_cnt = uint32_t (std::llrint ( + (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul + )); + + const __m128i qrs_inc_4 = _mm_set1_epi32 (4 * qrs_inc); + __m128i qrs_cnt_4 = _mm_set1_epi32 (qrs_cnt); + const __m128i qrs_ofs = _mm_set_epi32 (qrs_inc * 3, qrs_inc * 2, qrs_inc, 0); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_ofs); + const __m128i qrs_msk = _mm_set1_epi32 (0x1FF); + const __m128i c128 = _mm_set1_epi16 (128); + const __m128i c256 = _mm_set1_epi16 (256); + const __m128i c384 = _mm_set1_epi16 (384); + + process_seg_common_int_int_sse2 < + S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT, SRC_BITS + > (dst_ptr, src_ptr, w, ctx, + [&] (int /*pos*/) + { + auto p03 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); + p03 = _mm_and_si128 (p03, qrs_msk); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); + auto p47 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); + p47 = _mm_and_si128 (p47, qrs_msk); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); + const auto p = _mm_packs_epi32 (p03, p47); + const auto tri_a = _mm_sub_epi16 (p, c128); + const auto tri_d = _mm_sub_epi16 (c384, p); + const auto cond = _mm_cmplt_epi16 (p, c256); + auto dith_o = 
_mm_or_si128 ( + _mm_and_si128 (cond, tri_a), + _mm_andnot_si128 (cond, tri_d) + ); + + if (TO_FLAG) + { + dith_o = remap_tpdf_vec (dith_o); + } + + return dith_o; // 8 s16 [-128 ; +127] or [-256 ; +255] + } + ); +} + + + +template +void Dither::process_seg_qrs_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + // alpha1 = 1 / x, with x real solution of: x^3 - x - 1 = 0 + // Also: + // alpha1 = (curt (2) * sq (curt (3))) + // / (curt (9 - sqrt (69)) + curt (9 + sqrt (69))) + constexpr double alpha1 = 1.0 / 1.3247179572447460259609088544781; + constexpr double alpha2 = alpha1 * alpha1; + constexpr int sc_l2 = 16; // 16 bits of fractional values + constexpr float sc_mul = float (1 << sc_l2); + constexpr int qrs_shf = sc_l2 - 9; + constexpr int qrs_inc = int (alpha1 * sc_mul + 0.5f); + uint32_t qrs_cnt = uint32_t (std::llrint ( + (alpha2 * double (ctx._y + ctx._qrs_seed)) * sc_mul + )); + + const __m128i qrs_inc_4 = _mm_set1_epi32 (4 * qrs_inc); + __m128i qrs_cnt_4 = _mm_set1_epi32 (qrs_cnt); + const __m128i qrs_ofs = _mm_set_epi32 (qrs_inc * 3, qrs_inc * 2, qrs_inc, 0); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_ofs); + const __m128i qrs_msk = _mm_set1_epi32 (0x1FF); + const __m128i c128 = _mm_set1_epi16 (128); + const __m128i c256 = _mm_set1_epi16 (256); + const __m128i c384 = _mm_set1_epi16 (384); + + process_seg_common_flt_int_sse2 < + S_FLAG, TN_FLAG, DST_FMT, DST_BITS, SRC_FMT + > (dst_ptr, src_ptr, w, ctx, + [&] (int /*pos*/) + { + auto p03 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); + p03 = _mm_and_si128 (p03, qrs_msk); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); + auto p47 = _mm_srli_epi32 (qrs_cnt_4, qrs_shf); + p47 = _mm_and_si128 (p47, qrs_msk); + qrs_cnt_4 = _mm_add_epi32 (qrs_cnt_4, qrs_inc_4); + const auto p = _mm_packs_epi32 (p03, p47); + const auto tri_a = _mm_sub_epi16 (p, c128); + const auto tri_d = _mm_sub_epi16 (c384, p); + const auto cond = _mm_cmplt_epi16 (p, c256); + 
auto dith_o = _mm_or_si128 ( + _mm_and_si128 (cond, tri_a), + _mm_andnot_si128 (cond, tri_d) + ); + + if (TO_FLAG) + { + dith_o = remap_tpdf_vec (dith_o); + } + + return dith_o; // 8 s16 [-128 ; +127] + } + ); +} + + + +#endif // fstb_ARCHI_X86 + + + +template +void Dither::process_seg_common_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + + constexpr int dif_bits = SRC_BITS - DST_BITS; + static_assert (dif_bits >= 1, "This function must reduce bidepth."); + + uint32_t & rnd_state = ctx._rnd_state; + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + const int rcst = 1 << (dif_bits - 1); + const int vmax = (1 << DST_BITS) - 1; + + const int ao = ctx._amp._o_i; // s8 + const int an = ctx._amp._n_i; // s8 + + for (int pos = 0; pos < w; ++pos) + { + const int s = src_n_ptr [pos]; + + const int dith_o = dither_fnc (pos); // s8 + int dither; + if (S_FLAG) + { + constexpr int dit_shft = 8 - dif_bits; + dither = fstb::sshift_r (dith_o); + } + else + { + const int dith_n = generate_dith_n_scalar (rnd_state); // s8 + + constexpr int dit_shft = _amp_bits + 8 - dif_bits; + dither = fstb::sshift_r (dith_o * ao + dith_n * an); // s16 = s8 * s8 // s16 = s16 >> cst + } + const int sum = s + dither; // s16+ + const int quant = (sum + rcst) >> dif_bits; // s16 + + const int pix = fstb::limit (quant, 0, vmax); + dst_n_ptr [pos] = static_cast (pix); + } + + if (! 
S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + + +// int dither_fnc (int pos) noexcept; +// Must provide the ordered dither value, in [-128 ; +127] nominal range +// (doubled for TPDF) +template +void Dither::process_seg_common_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + uint32_t & rnd_state = ctx._rnd_state; + + const int ao = ctx._amp._o_i; // s8 + const int an = ctx._amp._n_i; // s8 + + const float mul = float (ctx._scale_info_ptr->_gain); + const float add = float (ctx._scale_info_ptr->_add_cst); + const float qt = 1.0f / (1 << ((S_FLAG ? 0 : _amp_bits) + 8)); + const int vmax = (1 << DST_BITS) - 1; + + for (int pos = 0; pos < w; ++pos) + { + float s = float (src_n_ptr [pos]); + s = s * mul + add; + + const int dith_o = dither_fnc (pos); // s8 + + float dither; + if (S_FLAG) + { + dither = float (dith_o) * qt; + } + else + { + const int dith_n = generate_dith_n_scalar (rnd_state); // s8 + dither = float (dith_o * ao + dith_n * an) * qt; + } + const float sum = s + dither; + const int quant = fstb::round_int (sum); + + const int pix = fstb::limit (quant, 0, vmax); + dst_n_ptr [pos] = static_cast (pix); + } + + if (! 
S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + + +template +int Dither::generate_dith_n_scalar (uint32_t &rnd_state) noexcept +{ + generate_rnd (rnd_state); + int dith_n = int8_t (rnd_state >> 24); + if (T_FLAG) + { + generate_rnd (rnd_state); + dith_n += int8_t (rnd_state >> 24); + } + + return dith_n; +} + + +
+// Scalar counterpart of remap_tpdf_vec(), applied to a single dither value.
+// d: s8, nominal range [-128 ; 127], standing for [-1 ; 1]
+// Returns: d + d * (5/8 * d^2 + 3/8 * d^32) evaluated in fixed point, i.e.
+// the polynomial x + 5/8 * x^3 + 3/8 * x^33. The result reaches -256 for
+// d = -128, so the nominal range is doubled.
+int	Dither::remap_tpdf_scalar (int d) noexcept
+{
+	// [-128 ; 127] to [-32767 ; +32767], representing [-1 ; 1] (15-bit scale)
+	// d * d is at most 2^14, doubling it gives x^2 on the 15-bit scale.
+	auto           x2  = d * d;
+	x2 += x2;
+	// d = -128 lands exactly on 0x8000, one above the largest 15-bit value.
+	// Saturated here because of the -min * -min overflow, like the
+	// saturating add of remap_tpdf_vec(). This also keeps every product
+	// below within the range of a 32-bit int.
+	x2 = std::min (x2, 0x7FFF);
+	auto           x4  = (x2  * x2 ) >> 15;
+	auto           x8  = (x4  * x4 ) >> 15;
+	auto           x16 = (x8  * x8 ) >> 15;
+	auto           x32 = (x16 * x16) >> 15;
+
+	// Polynomial coefficients 5/8 and 3/8, 15-bit scale
+	constexpr int  c3  = 0x8000 * 5 / 8;
+	constexpr int  c33 = 0x8000 * 3 / 8;
+
+	// 5/8 * x^2 + 3/8 * x^32, 15-bit scale
+	const auto     sum_s15 = (x2 * c3 + x32 * c33) >> 15;
+	// Multiplication instead of d << 8: d may be negative, and left-shifting
+	// a negative value is undefined before C++20.
+	const auto     x_s15   = d * (1 << 8);
+	// s15 * s15 gives a 30-bit scale, brought back to the 7-bit scale of d.
+	const auto     sum_s7  = (sum_s15 * x_s15) >> (30 - 7);
+
+	d += sum_s7;
+
+	return d;
+}
+ + + +#if (fstb_ARCHI == fstb_ARCHI_X86) + + + +// __m128i dither_fnc (int pos) noexcept; +// Must provide the ordered dither values as a vector of 8 x int16_t, +// in [-128 ; +127] nominal range (doubled for TPDF) +template +void Dither::process_seg_common_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + + constexpr int dif_bits = SRC_BITS - DST_BITS; + static_assert (dif_bits >= 0, "This function cannot increase bidepth."); + + uint32_t & rnd_state = ctx._rnd_state; + + typedef typename ProxyRwSse2 ::PtrConst::Type SrcPtr; + typedef typename ProxyRwSse2 ::Ptr::Type DstPtr; + SrcPtr src_n_ptr = reinterpret_cast (src_ptr); + DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); + const __m128i zero = _mm_setzero_si128 (); + const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); + const __m128i sign_bit = _mm_set1_epi16 (-0x8000); + const __m128i rcst =
_mm_set1_epi16 (1 << (dif_bits - 1)); + const __m128i vmax = _mm_set1_epi16 ((1 << DST_BITS) - 1); + + const __m128i ampo_i = _mm_set1_epi16 (int16_t (ctx._amp._o_i)); // 8 ?16 [0 ; 255] + const __m128i ampn_i = _mm_set1_epi16 (int16_t (ctx._amp._n_i)); // 8 ?16 [0 ; 255] + + for (int pos = 0; pos < w; pos += 8) + { + const __m128i s = // 8 u16 + ProxyRwSse2 ::read_i16 (src_n_ptr + pos, zero); + + // 8 s16 [-128 ; +127] or [-256 ; 255] + __m128i dith_o = dither_fnc (pos); + + __m128i dither; + if (S_FLAG) + { + constexpr int dit_shft = 8 - dif_bits; + dither = _mm_srai_epi16 (dith_o, dit_shft); + } + else + { + // Random generation. 8 s16 [-128 ; 127] or [-256 ; 255] + __m128i dith_n = generate_dith_n_vec (rnd_state); + + dith_o = _mm_mullo_epi16 (dith_o, ampo_i); // 8 s16 (full range) + dith_n = _mm_mullo_epi16 (dith_n, ampn_i); // 8 s16 (full range) + dither = _mm_adds_epi16 (dith_o, dith_n); // 8 s16 = s8 * s8 + + constexpr int dit_shft = _amp_bits + 8 - dif_bits; + dither = _mm_srai_epi16 (dither, dit_shft); // 8 s16 = s16 >> cst + } + + const __m128i dith_rcst = _mm_adds_epi16 (dither, rcst); + + __m128i quant; + if (S_FLAG && SRC_BITS < 16) + { + __m128i sum = _mm_adds_epi16 (s, dith_rcst); + quant = _mm_srai_epi16 (sum, dif_bits); + } + else + { + __m128i sum = _mm_xor_si128 (s, sign_bit); // 8 s16 + sum = _mm_adds_epi16 (sum, dith_rcst); + sum = _mm_xor_si128 (sum, sign_bit); // 8 u16 + quant = _mm_srli_epi16 (sum, dif_bits); + } + + __m128i pix = quant; + if (SRC_BITS < 16) + { + pix = _mm_max_epi16 (pix, zero); + pix = _mm_min_epi16 (pix, vmax); + } + + ProxyRwSse2 ::write_i16 (dst_n_ptr + pos, pix, mask_lsb); + } + + if (! 
S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + + +template +void Dither::process_seg_common_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (((_mm_getcsr () >> 13) & 3) == 0); // 00 = Round to nearest (even) + + uint32_t & rnd_state = ctx._rnd_state; + + const float qt_cst = 1.0f / ( + 65536.0f * float (1 << ((S_FLAG ? 0 : _amp_bits) + 8)) + ); + + typedef typename ProxyRwSse2 ::PtrConst::Type SrcPtr; + typedef typename ProxyRwSse2 ::Ptr::Type DstPtr; + SrcPtr src_n_ptr = reinterpret_cast (src_ptr); + DstPtr dst_n_ptr = reinterpret_cast (dst_ptr); + const __m128 zero_f = _mm_setzero_ps (); + const __m128i zero_i = _mm_setzero_si128 (); + const __m128 mul = _mm_set1_ps (float (ctx._scale_info_ptr->_gain)); + const __m128 add = _mm_set1_ps (float (ctx._scale_info_ptr->_add_cst)); + const __m128 qt = _mm_set1_ps (qt_cst); + const __m128 vmax = _mm_set1_ps ((1 << DST_BITS) - 1); + const __m128 offset = _mm_set1_ps (-32768); + const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); + const __m128i sign_bit = _mm_set1_epi16 (-0x8000); + + const __m128i ampo_i = _mm_set1_epi16 (int16_t (ctx._amp._o_i)); // 8 ?16 [0 ; 255] + const __m128i ampn_i = _mm_set1_epi16 (int16_t (ctx._amp._n_i)); // 8 ?16 [0 ; 255] + + for (int pos = 0; pos < w; pos += 8) + { + __m128 s0; + __m128 s1; + ProxyRwSse2 ::read_flt ( + src_n_ptr + pos, s0, s1, zero_i + ); + s0 = _mm_add_ps (_mm_mul_ps (s0, mul), add); + s1 = _mm_add_ps (_mm_mul_ps (s1, mul), add); + + // 8 s16 [-128 ; +127] or [-256 ; 255] + __m128i dith_o = dither_fnc (pos); + + __m128i dither; + if (S_FLAG) + { + dither = dith_o; + } + else + { + // Random generation. 
8 s16 [-128 ; 127] or [-256 ; 255] + __m128i dith_n = generate_dith_n_vec (rnd_state); + + dith_o = _mm_mullo_epi16 (dith_o, ampo_i); // 8 s16 (full range) + dith_n = _mm_mullo_epi16 (dith_n, ampn_i); // 8 s16 (full range) + dither = _mm_adds_epi16 (dith_o, dith_n); // 8 s16 = s8 * s8 + } + + __m128i dither_03i = _mm_unpacklo_epi16 (zero_i, dither); // 4 s32 << 16 + __m128i dither_47i = _mm_unpackhi_epi16 (zero_i, dither); // 4 s32 << 16 + __m128 dither_03 = _mm_cvtepi32_ps (dither_03i); + __m128 dither_47 = _mm_cvtepi32_ps (dither_47i); + dither_03 = _mm_mul_ps (dither_03, qt); + dither_47 = _mm_mul_ps (dither_47, qt); + + s0 = _mm_add_ps (s0, dither_03); + s1 = _mm_add_ps (s1, dither_47); + + s0 = _mm_max_ps (_mm_min_ps (s0, vmax), zero_f); + s1 = _mm_max_ps (_mm_min_ps (s1, vmax), zero_f); + + ProxyRwSse2 ::write_flt ( + dst_n_ptr + pos, s0, s1, mask_lsb, sign_bit, offset + ); + } + + if (! S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + + +template +__m128i Dither::generate_dith_n_vec (uint32_t &rnd_state) noexcept +{ + generate_rnd (rnd_state); + const uint32_t rnd_03 = rnd_state; + generate_rnd (rnd_state); + const uint32_t rnd_47 = rnd_state; + const auto zero = _mm_setzero_si128 (); + + if (T_FLAG) + { + generate_rnd (rnd_state); + const uint32_t rnd_03x = rnd_state; + generate_rnd (rnd_state); + const uint32_t rnd_47x = rnd_state; + const auto rnd_val = _mm_set_epi32 (rnd_47x, rnd_03x, rnd_47, rnd_03); + const auto c256_16 = _mm_set1_epi16 (0x100); + const auto x0 = _mm_unpacklo_epi8 (rnd_val, zero); + const auto x1 = _mm_unpackhi_epi8 (rnd_val, zero); + const auto dith_n = _mm_sub_epi16 (_mm_add_epi16 (x0, x1), c256_16); + return dith_n; // 8 s16 [-256 ; 255] + } + + else + { + const auto rnd_val = _mm_set_epi32 (0, 0, rnd_47, rnd_03); + const auto c128_16 = _mm_set1_epi16 (0x80); + const auto x0 = _mm_unpacklo_epi8 (rnd_val, zero); // 8 ?16 [0 ; 255] + const auto dith_n = _mm_sub_epi16 (x0, c128_16); + + return dith_n; // 8 s16 [-128 ; 127] + } +} 
+ + + +// d: 8 s16 [-128 ; 127] +// Returns: 8 s16 [-256 ; 255] +// Formula: +// f: [-1 ; +1] -> [-2 ; +2] +// x -> x + 5/8 * x^3 + 3/8 * x^33 +// as an approximation of: +// x -> 2 * sign (x) * (1 - sqrt (1 - abs (x))) +__m128i Dither::remap_tpdf_vec (__m128i d) noexcept +{ + // [-128 ; 127] to [-32767 ; +32767], representing [-1 ; 1] (15-bit scale) + auto x2 = _mm_mullo_epi16 (d , d ); + x2 = _mm_adds_epi16 (x2 , x2 ); // Saturated here because of the -min * -min overflow + auto x4 = _mm_mulhi_epi16 (x2 , x2 ); + x4 = _mm_add_epi16 (x4 , x4 ); + auto x8 = _mm_mulhi_epi16 (x4 , x4 ); + x8 = _mm_add_epi16 (x8 , x8 ); + auto x16 = _mm_mulhi_epi16 (x8 , x8 ); + x16 = _mm_add_epi16 (x16, x16); + auto x32 = _mm_mulhi_epi16 (x16, x16); + x32 = _mm_add_epi16 (x32, x32); + + // 15-bit scale + const auto c3 = _mm_set1_epi16 (0x8000 * 5 / 8); + const auto c33 = _mm_set1_epi16 (0x8000 * 3 / 8); + + // 14-bit scale, losing a bit of precision at each mul + auto sum_s14 = _mm_mulhi_epi16 (x2, c3); + sum_s14 = _mm_add_epi16 (sum_s14, _mm_mulhi_epi16 (x32, c33)); + + const auto x_s15 = _mm_slli_epi16 (d, 8); + const auto sum_s13 = _mm_mulhi_epi16 (sum_s14, x_s15); + + const auto sum_s7 = _mm_srai_epi16 (sum_s13, 13 - 7); + + d = _mm_add_epi16 (d, sum_s7); + + return d; +} + + + +#endif + + + +template +constexpr int Dither::ErrDifAddParam ::_dst_bits; +template +constexpr int Dither::ErrDifAddParam ::_src_bits; +template +constexpr int Dither::ErrDifAddParam ::_nbr_err_lines; + + + +template +void Dither::process_seg_errdif_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (ctx._y >= 0); + + typedef typename ERRDIF::SrcType SRC_TYPE; + typedef typename ERRDIF::DstType DST_TYPE; + constexpr int src_bits = ERRDIF::_src_bits; + constexpr int dst_bits = ERRDIF::_dst_bits; + + uint32_t & rnd_state = ctx._rnd_state; + ErrDifBuf 
& fstb_RESTRICT ed_buf = *ctx._ed_buf_ptr; + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + const int ae = ctx._amp._e_i; + + // Makes e1 point on the default buffer line for single-line + // error diffusor because we use it in prepare_next_line() + int e0 = 0; + int e1 = 0; + if (ERRDIF::_nbr_err_lines == 2) + { + e0 = ctx._y & 1 ; + e1 = 1 - (ctx._y & 1); + } + int16_t * err0_ptr = ed_buf.get_buf (e0); + int16_t * err1_ptr = ed_buf.get_buf (e1); + + int err_nxt0 = ed_buf.use_mem (0); + int err_nxt1 = ed_buf.use_mem (1); + + // Forward + if ((ctx._y & 1) == 0) + { + for (int x = 0; x < w; ++x) + { + int err = err_nxt0; + SRC_TYPE src_raw; + + quantize_pix_int < + S_FLAG, T_FLAG, DST_TYPE, dst_bits, SRC_TYPE, src_bits + > ( + dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, ctx._amp._n_i + ); + ERRDIF::template diffuse <1> ( + err, err_nxt0, err_nxt1, + err0_ptr + x, err1_ptr + x, src_raw + ); + } + ERRDIF::prepare_next_line (err1_ptr + w); + } + + // Backward + else + { + for (int x = w - 1; x >= 0; --x) + { + int err = err_nxt0; + SRC_TYPE src_raw; + + quantize_pix_int < + S_FLAG, T_FLAG, DST_TYPE, dst_bits, SRC_TYPE, src_bits + > ( + dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, ctx._amp._n_i + ); + ERRDIF::template diffuse <-1> ( + err, err_nxt0, err_nxt1, + err0_ptr + x, err1_ptr + x, src_raw + ); + } + ERRDIF::prepare_next_line (err1_ptr - 1); + } + + ed_buf.use_mem (0) = int16_t (err_nxt0); + ed_buf.use_mem (1) = int16_t (err_nxt1); + + if (! 
S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + + +template +void Dither::process_seg_errdif_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept +{ + assert (dst_ptr != nullptr); + assert (src_ptr != nullptr); + assert (w > 0); + assert (ctx._y >= 0); + + typedef typename ERRDIF::SrcType SRC_TYPE; + typedef typename ERRDIF::DstType DST_TYPE; + constexpr int dst_bits = ERRDIF::_dst_bits; + + uint32_t & rnd_state = ctx._rnd_state; + ErrDifBuf & fstb_RESTRICT ed_buf = *ctx._ed_buf_ptr; + + const SRC_TYPE * fstb_RESTRICT src_n_ptr = reinterpret_cast (src_ptr); + DST_TYPE * fstb_RESTRICT dst_n_ptr = reinterpret_cast < DST_TYPE *> (dst_ptr); + + const float mul = float (ctx._scale_info_ptr->_gain); + const float add = float (ctx._scale_info_ptr->_add_cst); + const float ae = float (ctx._amp._e_f); + const float an = float (ctx._amp._n_f); + + // Makes e1 point on the default buffer line for single-line + // error diffusor because we use it in prepare_next_line() + int e0 = 0; + int e1 = 0; + if (ERRDIF::_nbr_err_lines == 2) + { + e0 = ctx._y & 1 ; + e1 = 1 - (ctx._y & 1); + } + float * err0_ptr = ed_buf.get_buf (e0); + float * err1_ptr = ed_buf.get_buf (e1); + + float err_nxt0 = ed_buf.use_mem (0); + float err_nxt1 = ed_buf.use_mem (1); + + // Forward + if ((ctx._y & 1) == 0) + { + for (int x = 0; x < w; ++x) + { + float err = err_nxt0; + SRC_TYPE src_raw; + + quantize_pix_flt ( + dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, an, mul, add + ); + ERRDIF::template diffuse <1> ( + err, err_nxt0, err_nxt1, + err0_ptr + x, err1_ptr + x, src_raw + ); + } + ERRDIF::prepare_next_line (err1_ptr + w); + } + + // Backward + else + { + for (int x = w - 1; x >= 0; --x) + { + float err = err_nxt0; + SRC_TYPE src_raw; + + quantize_pix_flt ( + dst_n_ptr, src_n_ptr, src_raw, x, err, rnd_state, ae, an, mul, add + ); + ERRDIF::template diffuse <-1> ( + err, err_nxt0, err_nxt1, + err0_ptr + x, err1_ptr + x, 
src_raw + ); + } + ERRDIF::prepare_next_line (err1_ptr - 1); + } + + ed_buf.use_mem (0) = err_nxt0; + ed_buf.use_mem (1) = err_nxt1; + + if (! S_FLAG) + { + generate_rnd_eol (rnd_state); + } +} + + +
+// Advances the random generator state by one step: a multiply-add on
+// 32 bits, wrapping modulo 2^32.
+void	Dither::generate_rnd (uint32_t &state) noexcept
+{
+	constexpr uint32_t   mult = 1664525;
+	constexpr uint32_t   incr = 1013904223;
+
+	state = state * mult + incr;
+}
+
+
+
+// Scrambles the state with a different multiply-add. The segment
+// processing functions call it after their pixel loop when S_FLAG is off.
+// A second multiply-add follows only when bit 25 of the new state is set.
+void	Dither::generate_rnd_eol (uint32_t &state) noexcept
+{
+	constexpr uint32_t   bit_25 = uint32_t (1) << 25; // 0x2000000
+
+	uint32_t       s = state * uint32_t (1103515245) + 12345;
+	if ((s & bit_25) != 0)
+	{
+		s = s * uint32_t (134775813) + 1;
+	}
+
+	state = s;
+}
+
+
+
+// Returns the row of the dither pattern matching the current line _y.
+// The row index is _y masked with (_max_pat_width - 1).
+// NOTE(review): the mask only wraps _y correctly if _max_pat_width is a
+// power of two - confirm against its declaration.
+const Dither::PatRow &	Dither::SegContext::extract_pattern_row () const noexcept
+{
+	assert (_pattern_ptr != nullptr);
+	assert (_y >= 0);
+
+	const auto &   pattern = *_pattern_ptr;
+	const auto     row_idx = _y & (_max_pat_width - 1);
+
+	return (pattern [row_idx]);
+}
+ + + +template +void Dither::quantize_pix_int (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, int & fstb_RESTRICT err, uint32_t &rnd_state, int ampe_i, int ampn_i) noexcept +{ + constexpr int dif_bits = SRC_BITS - DST_BITS; + constexpr int tmp_bits = + (dif_bits < 6 && SRC_BITS < _err_res && DST_BITS < _err_res) + ? _err_res + : SRC_BITS; + constexpr int tmp_shft = tmp_bits - SRC_BITS; + constexpr int tmp_invs = tmp_bits - DST_BITS; + + const int rcst = 1 << (tmp_invs - 1); + const int vmax = (1 << DST_BITS) - 1; + + src_raw = src_ptr [x]; + const int src = src_raw << tmp_shft; + const int preq = src + err; + + int sum = preq; + if (! S_FLAG) + { + constexpr int dit_shft = _amp_bits + 8 - tmp_invs; // May be negative + + const int dith_n = generate_dith_n_scalar (rnd_state); // s8 + const int err_add = (err < 0) ?
-ampe_i : ampe_i; + const int noise = + fstb::sshift_r (dith_n * ampn_i + err_add); // s16 = s8 * s8 // s16 = s16 >> cst + + sum += noise; + } + + const int quant = (sum + rcst) >> tmp_invs; + + err = preq - (quant << tmp_invs); + const int pix = fstb::limit (quant, 0, vmax); + + dst_ptr [x] = static_cast (pix); +} + + + +template +static inline SRC_TYPE Dither_extract_src (SRC_TYPE src_read, float src) noexcept +{ + fstb::unused (src); + + return (src_read); +} + +static inline float Dither_extract_src (float src_read, float src) noexcept +{ + fstb::unused (src_read); + + return (src); +} + +template +void Dither::quantize_pix_flt (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, float & fstb_RESTRICT err, uint32_t &rnd_state, float ampe_f, float ampn_f, float mul, float add) noexcept +{ + const int vmax = (1 << DST_BITS) - 1; + + const SRC_TYPE src_read = src_ptr [x]; + const float src = float (src_read) * mul + add; + src_raw = Dither_extract_src (src_read, src); + const float preq = src + err; + + float sum = preq; + if (! S_FLAG) + { + const int dith_n = generate_dith_n_scalar (rnd_state); // s8 + const float err_add = (err < 0) ? -ampe_f : (err > 0) ? ampe_f : 0; + const float noise = float (dith_n) * ampn_f + err_add; + + sum += noise; + } + + const int quant = fstb::round_int (sum); + + err = preq - float (quant); + const int pix = fstb::limit (quant, 0, vmax); + + dst_ptr [x] = static_cast (pix); +} + + + +// Original coefficients : 7, 3, 5, 1 +// Optimised coefficients for serpentine scan: 7, 4, 5, 0 +// Source: +// Sam Hocevar and Gary Niger, +// Reinstating Floyd-Steinberg: Improved Metrics for Quality Assessment +// of Error Diffusion Algorithms, +// Lecture Notes in Computer Science LNCS 5099, pp. 
38–45, 2008 +// (Proceedings of the International Conference on Image and Signal Processing +// ICISP 2008) ISSN 0302-9743 + +#define fmtcl_Dither_FS_OPTIMIZED_SERPENTINE_COEF + +template +template +void Dither::DiffuseFloydSteinberg ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr, src_raw); + +#if defined (fmtcl_Dither_FS_OPTIMIZED_SERPENTINE_COEF) + const int e1 = 0; + const int e3 = (err * 4 + 8) >> 4; +#else + const int e1 = (err + 8) >> 4; + const int e3 = (err * 3 + 8) >> 4; +#endif + const int e5 = (err * 5 + 8) >> 4; + const int e7 = err - e1 - e3 - e5; + spread_error (e1, e3, e5, e7, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseFloydSteinberg ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr, src_raw); + +#if defined (fmtcl_Dither_FS_OPTIMIZED_SERPENTINE_COEF) + const float e1 = 0; + const float e3 = err * (4.0f / 16); +#else + const float e1 = err * (1.0f / 16); + const float e3 = err * (3.0f / 16); +#endif + const float e5 = err * (5.0f / 16); + const float e7 = err * (7.0f / 16); + spread_error (e1, e3, e5, e7, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseFloydSteinberg ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept +{ + // Nothing + fstb::unused (err_ptr); +} + +template +template +void Dither::DiffuseFloydSteinberg ::spread_error (ET e1, ET e3, ET e5, ET e7, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept +{ + err_nxt0 = err0_ptr [DIR]; + err0_ptr [-DIR] += EB (e3); + err0_ptr [ 0] += EB (e5); + err0_ptr [ DIR] = EB (e1); + err_nxt0 += e7; +} + + + +template +template +void Dither::DiffuseFilterLite ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & 
fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr, src_raw); + + const int e1 = (err + 2) >> 2; + const int e2 = err - 2 * e1; + spread_error (e1, e2, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseFilterLite ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr, src_raw); + + const float e1 = err * (1.0f / 4); + const float e2 = err * (2.0f / 4); + spread_error (e1, e2, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseFilterLite ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept +{ + err_ptr [0] = EB (0); +} + +template +template +void Dither::DiffuseFilterLite ::spread_error (ET e1, ET e2, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept +{ + err_nxt0 = err0_ptr [DIR]; + err0_ptr [-DIR] += EB (e1); + err0_ptr [ 0] = EB (e1); + err_nxt0 += e2; +} + + +
+// Integer version: splits the quantization error err into shares of
+// about 1/42, 2/42, 4/42 and 8/42 and passes them to spread_error().
+// err may be negative, so the scaling uses multiplications: left-shifting
+// a negative value is undefined before C++20.
+template
+template
+void	Dither::DiffuseStucki ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept
+{
+	fstb::unused (src_raw);
+
+	// m is err / 42 with 4 fractional bits; each share is rounded from it.
+	const int      m   = (err * 16) / 42;
+	const int      e1  = (m + 8) >> 4;
+	const int      e2  = (m + 4) >> 3;
+	const int      e4  = (m + 2) >> 2;
+//	const int      e8  = (m + 1) >> 1;
+	// sum counts e1 twice and e2, e4 four times each. e8 is then taken as
+	// half of what is left of err (rounded) instead of being derived
+	// from m like the other shares.
+	const int      sum = e1 * 2 + (e2 + e4) * 4;
+	const int      e8  = (err - sum + 1) >> 1;
+	spread_error (e1, e2, e4, e8, err_nxt0, err_nxt1, err0_ptr, err1_ptr);
+}
+ +template +template +void Dither::DiffuseStucki ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (src_raw); + + const float e1 = err * (1.0f / 42); + const float e2 = err * (2.0f /
42); + const float e4 = err * (4.0f / 42); + const float e8 = err * (8.0f / 42); + spread_error (e1, e2, e4, e8, err_nxt0, err_nxt1, err0_ptr, err1_ptr); +} + +template +template +void Dither::DiffuseStucki ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept +{ + // Nothing + fstb::unused (err_ptr); +} + +template +template +void Dither::DiffuseStucki ::spread_error (ET e1, ET e2, ET e4, ET e8, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept +{ + err_nxt0 = err_nxt1 + e8; + err_nxt1 = err1_ptr [DIR * 2] + e4; + err0_ptr [-DIR * 2] += EB (e2); + err0_ptr [-DIR ] += EB (e4); + err0_ptr [ 0 ] += EB (e8); + err0_ptr [ DIR ] += EB (e4); + err0_ptr [ DIR * 2] += EB (e2); + err1_ptr [-DIR * 2] += EB (e1); + err1_ptr [-DIR ] += EB (e2); + err1_ptr [ 0 ] += EB (e4); + err1_ptr [ DIR ] += EB (e2); + err1_ptr [ DIR * 2] = EB (e1); +} + + + +template +template +void Dither::DiffuseAtkinson ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (src_raw); + + const int e1 = (err + 4) >> 3; + spread_error (e1, err_nxt0, err_nxt1, err0_ptr, err1_ptr); +} + +template +template +void Dither::DiffuseAtkinson ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (src_raw); + + const float e1 = err * (1.0f / 8); + spread_error (e1, err_nxt0, err_nxt1, err0_ptr, err1_ptr); +} + +template +template +void Dither::DiffuseAtkinson ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept +{ + err_ptr [0] = EB (0); +} + +template +template +void Dither::DiffuseAtkinson ::spread_error (ET e1, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept +{ + err_nxt0 = 
err_nxt1 + e1; + err_nxt1 = err1_ptr [2 * DIR] + e1; + err0_ptr [-DIR] += EB (e1); + err0_ptr [ 0] += EB (e1); + err0_ptr [+DIR] += EB (e1); + err1_ptr [ 0] = EB (e1); +} + + + +constexpr int Dither::DiffuseOstromoukhovBase::_t_bits; +constexpr int Dither::DiffuseOstromoukhovBase::_t_len; +constexpr int Dither::DiffuseOstromoukhovBase::_t_mask; + + + +template +template +int Dither::DiffuseOstromoukhovBase2 ::get_index (SRC_TYPE src_raw) noexcept +{ + constexpr int dif_bits = SRC_BITS - DST_BITS; + + return (fstb::sshift_l < + int, + DiffuseOstromoukhovBase::_t_bits - dif_bits + > (src_raw) & DiffuseOstromoukhovBase::_t_mask); +} + +template +int Dither::DiffuseOstromoukhovBase2 ::get_index (float src_raw) noexcept +{ + return + fstb::round_int (src_raw * DiffuseOstromoukhovBase::_t_len) + & DiffuseOstromoukhovBase::_t_mask; +} + +// Victor Ostromoukhov, +// A Simple and Efficient Error-Diffusion Algorithm +// Proceedings of SIGGRAPH 2001, in ACM Computer Graphics, +// Annual Conference Series, pp. 567-572, 2001. 
+// Not optimised at all +template +template +void Dither::DiffuseOstromoukhov ::diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr); + + constexpr int dif_bits = SRC_BITS - DST_BITS; + + const int index = fstb::sshift_l < + int, + DiffuseOstromoukhov::_t_bits - dif_bits + > (src_raw) & DiffuseOstromoukhov::_t_mask; + const typename ThisType::TableEntry & fstb_RESTRICT te = ThisType::_table [index]; + const int d = te._sum; + + const int e1 = err * te._c0 / d; + const int e2 = err * te._c1 / d; + const int e3 = err - e1 - e2; + + spread_error (e1, e2, e3, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseOstromoukhov ::diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept +{ + fstb::unused (err_nxt1, err1_ptr); + + const int index = DiffuseOstromoukhov::get_index (src_raw); + const typename ThisType::TableEntry & fstb_RESTRICT te = ThisType::_table [index]; + const float invd = te._inv_sum; + + const float e1 = err * float (te._c0) * invd; + const float e2 = err * float (te._c1) * invd; + const float e3 = err - e1 - e2; + + spread_error (e1, e2, e3, err_nxt0, err0_ptr); +} + +template +template +void Dither::DiffuseOstromoukhov ::prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept +{ + err_ptr [0] = EB (0); +} + +template +template +void Dither::DiffuseOstromoukhov ::spread_error (ET e1, ET e2, ET e3, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept +{ + err_nxt0 = err0_ptr [DIR]; + err0_ptr [-DIR] += EB (e2); + err0_ptr [ 0] = EB (e3); + err_nxt0 += e1; +} + + + +const std::array < + Dither::DiffuseOstromoukhovBase::TableEntry, + Dither::DiffuseOstromoukhovBase::_t_len +> Dither::DiffuseOstromoukhovBase::_table = +{{ + { 13, 0, 5, 18, 1.0f / 18 }, + 
{ 13, 0, 5, 18, 1.0f / 18 }, + { 21, 0, 10, 31, 1.0f / 31 }, + { 7, 0, 4, 11, 1.0f / 11 }, + { 8, 0, 5, 13, 1.0f / 13 }, + { 47, 3, 28, 78, 1.0f / 78 }, + { 23, 3, 13, 39, 1.0f / 39 }, + { 15, 3, 8, 26, 1.0f / 26 }, + { 22, 6, 11, 39, 1.0f / 39 }, + { 43, 15, 20, 78, 1.0f / 78 }, + { 7, 3, 3, 13, 1.0f / 13 }, + { 501, 224, 211, 936, 1.0f / 936 }, + { 249, 116, 103, 468, 1.0f / 468 }, + { 165, 80, 67, 312, 1.0f / 312 }, + { 123, 62, 49, 234, 1.0f / 234 }, + { 489, 256, 191, 936, 1.0f / 936 }, + { 81, 44, 31, 156, 1.0f / 156 }, + { 483, 272, 181, 936, 1.0f / 936 }, + { 60, 35, 22, 117, 1.0f / 117 }, + { 53, 32, 19, 104, 1.0f / 104 }, + { 237, 148, 83, 468, 1.0f / 468 }, + { 471, 304, 161, 936, 1.0f / 936 }, + { 3, 2, 1, 6, 1.0f / 6 }, + { 481, 314, 185, 980, 1.0f / 980 }, + { 354, 226, 155, 735, 1.0f / 735 }, + { 1389, 866, 685, 2940, 1.0f / 2940 }, + { 227, 138, 125, 490, 1.0f / 490 }, + { 267, 158, 163, 588, 1.0f / 588 }, + { 327, 188, 220, 735, 1.0f / 735 }, + { 61, 34, 45, 140, 1.0f / 140 }, + { 627, 338, 505, 1470, 1.0f / 1470 }, + { 1227, 638, 1075, 2940, 1.0f / 2940 }, + + { 20, 10, 19, 49, 1.0f / 49 }, + { 1937, 1000, 1767, 4704, 1.0f / 4704 }, + { 977, 520, 855, 2352, 1.0f / 2352 }, + { 657, 360, 551, 1568, 1.0f / 1568 }, + { 71, 40, 57, 168, 1.0f / 168 }, + { 2005, 1160, 1539, 4704, 1.0f / 4704 }, + { 337, 200, 247, 784, 1.0f / 784 }, + { 2039, 1240, 1425, 4704, 1.0f / 4704 }, + { 257, 160, 171, 588, 1.0f / 588 }, + { 691, 440, 437, 1568, 1.0f / 1568 }, + { 1045, 680, 627, 2352, 1.0f / 2352 }, + { 301, 200, 171, 672, 1.0f / 672 }, + { 177, 120, 95, 392, 1.0f / 392 }, + { 2141, 1480, 1083, 4704, 1.0f / 4704 }, + { 1079, 760, 513, 2352, 1.0f / 2352 }, + { 725, 520, 323, 1568, 1.0f / 1568 }, + { 137, 100, 57, 294, 1.0f / 294 }, + { 2209, 1640, 855, 4704, 1.0f / 4704 }, + { 53, 40, 19, 112, 1.0f / 112 }, + { 2243, 1720, 741, 4704, 1.0f / 4704 }, + { 565, 440, 171, 1176, 1.0f / 1176 }, + { 759, 600, 209, 1568, 1.0f / 1568 }, + { 1147, 920, 285, 2352, 1.0f / 2352 
}, + { 2311, 1880, 513, 4704, 1.0f / 4704 }, + { 97, 80, 19, 196, 1.0f / 196 }, + { 335, 280, 57, 672, 1.0f / 672 }, + { 1181, 1000, 171, 2352, 1.0f / 2352 }, + { 793, 680, 95, 1568, 1.0f / 1568 }, + { 599, 520, 57, 1176, 1.0f / 1176 }, + { 2413, 2120, 171, 4704, 1.0f / 4704 }, + { 405, 360, 19, 784, 1.0f / 784 }, + { 2447, 2200, 57, 4704, 1.0f / 4704 }, + + { 11, 10, 0, 21, 1.0f / 21 }, + { 158, 151, 3, 312, 1.0f / 312 }, + { 178, 179, 7, 364, 1.0f / 364 }, + { 1030, 1091, 63, 2184, 1.0f / 2184 }, + { 248, 277, 21, 546, 1.0f / 546 }, + { 318, 375, 35, 728, 1.0f / 728 }, + { 458, 571, 63, 1092, 1.0f / 1092 }, + { 878, 1159, 147, 2184, 1.0f / 2184 }, + { 5, 7, 1, 13, 1.0f / 13 }, + { 172, 181, 37, 390, 1.0f / 390 }, + { 97, 76, 22, 195, 1.0f / 195 }, + { 72, 41, 17, 130, 1.0f / 130 }, + { 119, 47, 29, 195, 1.0f / 195 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 65, 18, 17, 100, 1.0f / 100 }, + { 95, 29, 26, 150, 1.0f / 150 }, + { 185, 62, 53, 300, 1.0f / 300 }, + { 30, 11, 9, 50, 1.0f / 50 }, + { 35, 14, 11, 60, 1.0f / 60 }, + { 85, 37, 28, 150, 1.0f / 150 }, + { 55, 26, 19, 100, 1.0f / 100 }, + { 80, 41, 29, 150, 1.0f / 150 }, + { 155, 86, 59, 300, 1.0f / 300 }, + { 5, 3, 2, 10, 1.0f / 10 }, + + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 305, 176, 119, 600, 1.0f / 600 }, + { 155, 86, 59, 300, 1.0f / 300 }, + { 105, 56, 39, 200, 1.0f / 200 }, + { 80, 41, 29, 150, 1.0f / 150 }, + { 65, 32, 23, 120, 1.0f / 120 }, + { 55, 26, 19, 100, 1.0f / 100 
}, + { 335, 152, 113, 600, 1.0f / 600 }, + { 85, 37, 28, 150, 1.0f / 150 }, + { 115, 48, 37, 200, 1.0f / 200 }, + { 35, 14, 11, 60, 1.0f / 60 }, + { 355, 136, 109, 600, 1.0f / 600 }, + { 30, 11, 9, 50, 1.0f / 50 }, + { 365, 128, 107, 600, 1.0f / 600 }, + { 185, 62, 53, 300, 1.0f / 300 }, + { 25, 8, 7, 40, 1.0f / 40 }, + { 95, 29, 26, 150, 1.0f / 150 }, + { 385, 112, 103, 600, 1.0f / 600 }, + { 65, 18, 17, 100, 1.0f / 100 }, + { 395, 104, 101, 600, 1.0f / 600 }, + { 4, 1, 1, 6, 1.0f / 6 }, + + // Symetric + { 4, 1, 1, 6, 1.0f / 6 }, + { 395, 104, 101, 600, 1.0f / 600 }, + { 65, 18, 17, 100, 1.0f / 100 }, + { 385, 112, 103, 600, 1.0f / 600 }, + { 95, 29, 26, 150, 1.0f / 150 }, + { 25, 8, 7, 40, 1.0f / 40 }, + { 185, 62, 53, 300, 1.0f / 300 }, + { 365, 128, 107, 600, 1.0f / 600 }, + { 30, 11, 9, 50, 1.0f / 50 }, + { 355, 136, 109, 600, 1.0f / 600 }, + { 35, 14, 11, 60, 1.0f / 60 }, + { 115, 48, 37, 200, 1.0f / 200 }, + { 85, 37, 28, 150, 1.0f / 150 }, + { 335, 152, 113, 600, 1.0f / 600 }, + { 55, 26, 19, 100, 1.0f / 100 }, + { 65, 32, 23, 120, 1.0f / 120 }, + { 80, 41, 29, 150, 1.0f / 150 }, + { 105, 56, 39, 200, 1.0f / 200 }, + { 155, 86, 59, 300, 1.0f / 300 }, + { 305, 176, 119, 600, 1.0f / 600 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + { 5, 3, 2, 10, 1.0f / 10 }, + + { 5, 3, 2, 10, 1.0f / 10 }, + { 155, 86, 59, 300, 1.0f / 300 }, + { 80, 41, 29, 150, 1.0f / 150 }, + { 55, 26, 19, 100, 1.0f / 100 }, + { 85, 37, 28, 150, 1.0f / 150 }, + { 35, 14, 11, 60, 1.0f / 60 }, + { 30, 11, 9, 50, 1.0f / 50 }, + { 185, 62, 53, 300, 1.0f / 300 }, + { 95, 29, 26, 150, 1.0f / 150 }, + { 65, 18, 17, 100, 1.0f / 100 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 
6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 4, 1, 1, 6, 1.0f / 6 }, + { 119, 47, 29, 195, 1.0f / 195 }, + { 72, 41, 17, 130, 1.0f / 130 }, + { 97, 76, 22, 195, 1.0f / 195 }, + { 172, 181, 37, 390, 1.0f / 390 }, + { 5, 7, 1, 13, 1.0f / 13 }, + { 878, 1159, 147, 2184, 1.0f / 2184 }, + { 458, 571, 63, 1092, 1.0f / 1092 }, + { 318, 375, 35, 728, 1.0f / 728 }, + { 248, 277, 21, 546, 1.0f / 546 }, + { 1030, 1091, 63, 2184, 1.0f / 2184 }, + { 178, 179, 7, 364, 1.0f / 364 }, + { 158, 151, 3, 312, 1.0f / 312 }, + { 11, 10, 0, 21, 1.0f / 21 }, + + { 2447, 2200, 57, 4704, 1.0f / 4704 }, + { 405, 360, 19, 784, 1.0f / 784 }, + { 2413, 2120, 171, 4704, 1.0f / 4704 }, + { 599, 520, 57, 1176, 1.0f / 1176 }, + { 793, 680, 95, 1568, 1.0f / 1568 }, + { 1181, 1000, 171, 2352, 1.0f / 2352 }, + { 335, 280, 57, 672, 1.0f / 672 }, + { 97, 80, 19, 196, 1.0f / 196 }, + { 2311, 1880, 513, 4704, 1.0f / 4704 }, + { 1147, 920, 285, 2352, 1.0f / 2352 }, + { 759, 600, 209, 1568, 1.0f / 1568 }, + { 565, 440, 171, 1176, 1.0f / 1176 }, + { 2243, 1720, 741, 4704, 1.0f / 4704 }, + { 53, 40, 19, 112, 1.0f / 112 }, + { 2209, 1640, 855, 4704, 1.0f / 4704 }, + { 137, 100, 57, 294, 1.0f / 294 }, + { 725, 520, 323, 1568, 1.0f / 1568 }, + { 1079, 760, 513, 2352, 1.0f / 2352 }, + { 2141, 1480, 1083, 4704, 1.0f / 4704 }, + { 177, 120, 95, 392, 1.0f / 392 }, + { 301, 200, 171, 672, 1.0f / 672 }, + { 1045, 680, 627, 2352, 1.0f / 2352 }, + { 691, 440, 437, 1568, 1.0f / 1568 }, + { 257, 160, 171, 588, 1.0f / 588 }, + { 2039, 1240, 1425, 4704, 1.0f / 4704 }, + { 337, 200, 247, 784, 1.0f / 784 }, + { 2005, 1160, 1539, 4704, 1.0f / 4704 }, + { 71, 40, 57, 168, 1.0f / 168 }, + { 657, 360, 551, 1568, 1.0f / 1568 }, + { 977, 520, 855, 2352, 1.0f / 2352 }, + { 1937, 1000, 1767, 4704, 1.0f / 4704 }, + { 20, 10, 19, 49, 1.0f / 49 }, + + { 1227, 638, 1075, 2940, 1.0f / 2940 }, + { 627, 338, 505, 1470, 1.0f / 1470 }, + { 
61, 34, 45, 140, 1.0f / 140 }, + { 327, 188, 220, 735, 1.0f / 735 }, + { 267, 158, 163, 588, 1.0f / 588 }, + { 227, 138, 125, 490, 1.0f / 490 }, + { 1389, 866, 685, 2940, 1.0f / 2940 }, + { 354, 226, 155, 735, 1.0f / 735 }, + { 481, 314, 185, 980, 1.0f / 980 }, + { 3, 2, 1, 6, 1.0f / 6 }, + { 471, 304, 161, 936, 1.0f / 936 }, + { 237, 148, 83, 468, 1.0f / 468 }, + { 53, 32, 19, 104, 1.0f / 104 }, + { 60, 35, 22, 117, 1.0f / 117 }, + { 483, 272, 181, 936, 1.0f / 936 }, + { 81, 44, 31, 156, 1.0f / 156 }, + { 489, 256, 191, 936, 1.0f / 936 }, + { 123, 62, 49, 234, 1.0f / 234 }, + { 165, 80, 67, 312, 1.0f / 312 }, + { 249, 116, 103, 468, 1.0f / 468 }, + { 501, 224, 211, 936, 1.0f / 936 }, + { 7, 3, 3, 13, 1.0f / 13 }, + { 43, 15, 20, 78, 1.0f / 78 }, + { 22, 6, 11, 39, 1.0f / 39 }, + { 15, 3, 8, 26, 1.0f / 26 }, + { 23, 3, 13, 39, 1.0f / 39 }, + { 47, 3, 28, 78, 1.0f / 78 }, + { 8, 0, 5, 13, 1.0f / 13 }, + { 7, 0, 4, 11, 1.0f / 11 }, + { 21, 0, 10, 31, 1.0f / 31 }, + { 13, 0, 5, 18, 1.0f / 18 }, + { 13, 0, 5, 18, 1.0f / 18 } +}}; + + + +} // namespace fmtcl + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fmtcl/Dither.h b/src/fmtcl/Dither.h new file mode 100644 index 0000000..edcc32a --- /dev/null +++ b/src/fmtcl/Dither.h @@ -0,0 +1,448 @@ +/***************************************************************************** + + Dither.h + Author: Laurent de Soras, 2021 + +--- Legal stuff --- + +This program is free software. It comes without any warranty, to +the extent permitted by applicable law. You can redistribute it +and/or modify it under the terms of the Do What The Fuck You Want +To Public License, Version 2, as published by Sam Hocevar. See +http://www.wtfpl.net/ for more details. + +*Tab=3***********************************************************************/ + + + +#pragma once +#if ! 
defined (fmtcl_Dither_HEADER_INCLUDED) +#define fmtcl_Dither_HEADER_INCLUDED + + + +/*\\\ INCLUDE FILES \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +#include "conc/ObjPool.h" +#include "fmtcl/ColorFamily.h" +#include "fmtcl/BitBltConv.h" +#include "fmtcl/ErrDifBuf.h" +#include "fmtcl/ErrDifBufFactory.h" +#include "fmtcl/SplFmt.h" +#include "fstb/def.h" +#include "fstb/ArrayAlign.h" + +#include +#include +#include + + + +namespace fmtcl +{ + + + +class Dither +{ + +/*\\\ PUBLIC \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +public: + + static constexpr int _max_nbr_planes = 3; + static constexpr int _max_pat_width = 32; // Number of pixels for halftone dithering + + enum DMode + { + DMode_ROUND_ALIAS = -1, + DMode_BAYER = 0, + DMode_ROUND, // 1 + DMode_FAST, // 2 + DMode_FILTERLITE, // 3 + DMode_STUCKI, // 4 + DMode_ATKINSON, // 5 + DMode_FLOYD, // 6 + DMode_OSTRO, // 7 + DMode_VOIDCLUST, // 8 + DMode_QUASIRND, // 9 + + DMode_NBR_ELT + }; + + explicit Dither ( + SplFmt src_fmt, int src_res, bool src_full_flag, + SplFmt dst_fmt, int dst_res, bool dst_full_flag, + ColorFamily color_fam, int nbr_planes, int w, + DMode dmode, int pat_size, double ampo, double ampn, + bool dyn_flag, bool static_noise_flag, bool correlated_planes_flag, + bool tpdfo_flag, bool tpdfn_flag, + bool sse2_flag, bool avx2_flag + ); + + void process_plane (uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, int frame_index, int plane_index); + + + +/*\\\ PROTECTED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +protected: + + + +/*\\\ PRIVATE \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + + static constexpr int _pat_period = 4; // Must be a power of 2 (because cycled with & as modulo) + static constexpr int _amp_bits = 5; // Bit depth of the amplitude fractional part. The whole thing is 7 bits, and we need a few bits for the integer part. 
+ static constexpr int _err_res = 24; // Resolution (bits) of the temporary data for error diffusion when source bitdepth is not high enough (relative to the destination bitdepth) to guarantee an accurate error diffusion. + static constexpr int _max_unk_width = 65536; // Maximum width (pixels) for variable formats + + class SclInf + { + public: + BitBltConv::ScaleInfo + _info; + BitBltConv::ScaleInfo * // 0 if _info is not used. + _ptr = 0; + }; + + typedef int16_t PatRow [_max_pat_width]; // Contains data in [-128; +127] + typedef PatRow PatData [_max_pat_width]; // [y] [x] + typedef fstb::ArrayAlign PatDataArray; + + class AmpInfo + { + public: + int _o_i = 0; // [0 ; 127], 1.0 = 1 << _amp_bits + int _n_i = 0; // [0 ; 127], 1.0 = 1 << _amp_bits + int _e_i = 0; // [0 ; 2047], 1.0 = 256 + float _e_f = 0; + float _n_f = 0; + }; + + class SegContext + { + public: + inline const PatRow & + extract_pattern_row () const noexcept; + const PatData* _pattern_ptr = nullptr; // Ordered dithering + uint32_t _rnd_state = 0; // Anything except fast mode + const BitBltConv::ScaleInfo * // Float processing + _scale_info_ptr = nullptr; + ErrDifBuf * // Error diffusion + _ed_buf_ptr = nullptr; + int _y = -1; // Ordered dithering and error diffusion + uint32_t _qrs_seed = 0; // For the quasirandom sequences + AmpInfo _amp; + }; + + void build_dither_pat (); + void build_dither_pat_round (); + void build_dither_pat_bayer (); + void build_dither_pat_void_and_cluster (int w); + void build_next_dither_pat (); + void copy_dither_pat_rotate (PatData &dst, const PatData &src, int angle) noexcept; + void init_fnc_fast () noexcept; + void init_fnc_ordered () noexcept; + void init_fnc_quasirandom () noexcept; + void init_fnc_errdiff () noexcept; + + void dither_plane (uint8_t *dst_ptr, int dst_stride, const uint8_t *src_ptr, int src_stride, int w, int h, const BitBltConv::ScaleInfo &scale_info, int frame_index, int plane_index); + + template + static void process_seg_fast_int_int_cpp 
(uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &/*ctx*/) noexcept; + template + static void process_seg_fast_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + +#if (fstb_ARCHI == fstb_ARCHI_X86) + template + static void process_seg_fast_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &/*ctx*/) noexcept; + template + static void process_seg_fast_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; +#endif + + template + static void process_seg_ord_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + template + static void process_seg_ord_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + +#if (fstb_ARCHI == fstb_ARCHI_X86) + template + static void process_seg_ord_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + template + static void process_seg_ord_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; +#endif + + template + static void process_seg_qrs_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + template + static void process_seg_qrs_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + +#if (fstb_ARCHI == fstb_ARCHI_X86) + template + static void process_seg_qrs_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + template + static void process_seg_qrs_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, 
SegContext &ctx) noexcept; +#endif + + template + static fstb_FORCEINLINE void + process_seg_common_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; + template + static fstb_FORCEINLINE void + process_seg_common_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; + template + static fstb_FORCEINLINE int + generate_dith_n_scalar (uint32_t &rnd_state) noexcept; + static fstb_FORCEINLINE int + remap_tpdf_scalar (int d) noexcept; + +#if (fstb_ARCHI == fstb_ARCHI_X86) + template + static fstb_FORCEINLINE void + process_seg_common_int_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; + template + static fstb_FORCEINLINE void + process_seg_common_flt_int_sse2 (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx, DFNC dither_fnc) noexcept; + template + static fstb_FORCEINLINE __m128i + generate_dith_n_vec (uint32_t &rnd_state) noexcept; + static fstb_FORCEINLINE __m128i + remap_tpdf_vec (__m128i d) noexcept; +#endif + + template + static void process_seg_errdif_int_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + template + static void process_seg_errdif_flt_int_cpp (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) noexcept; + + static inline void + generate_rnd (uint32_t &state) noexcept; + static inline void + generate_rnd_eol (uint32_t &state) noexcept; + + template + static inline void + quantize_pix_int (DST_TYPE * fstb_RESTRICT dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, int & fstb_RESTRICT err, uint32_t &rnd_state, int ampe_i, int ampn_i) noexcept; + template + static inline void + quantize_pix_flt (DST_TYPE * fstb_RESTRICT 
dst_ptr, const SRC_TYPE * fstb_RESTRICT src_ptr, SRC_TYPE &src_raw, int x, float & fstb_RESTRICT err, uint32_t &rnd_state, float ampe_f, float ampn_f, float mul, float add) noexcept; + + template + class ErrDifAddParam + { + public: + typedef DT DstType; + typedef ST SrcType; + static constexpr int _dst_bits = DB; + static constexpr int _src_bits = SB; + static constexpr int _nbr_err_lines = EL; + }; + + template + class DiffuseFloydSteinberg + : public ErrDifAddParam + { + public: + template + static fstb_FORCEINLINE void + diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; + private: + template + static fstb_FORCEINLINE void + spread_error (ET e1, ET e3, ET e5, ET e7, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; + }; + + template + class DiffuseFilterLite + : public ErrDifAddParam + { + public: + template + static fstb_FORCEINLINE void + diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; + private: + template + static fstb_FORCEINLINE void + spread_error (ET e1, ET e2, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; + }; + + template + class 
DiffuseStucki + : public ErrDifAddParam + { + public: + template + static fstb_FORCEINLINE void + diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; + private: + template + static fstb_FORCEINLINE void + spread_error (ET e1, ET e2, ET e4, ET e8, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept; + }; + + template + class DiffuseAtkinson + : public ErrDifAddParam + { + public: + template + static fstb_FORCEINLINE void + diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; + private: + template + static fstb_FORCEINLINE void + spread_error (ET e1, ET & fstb_RESTRICT err_nxt0, ET & fstb_RESTRICT err_nxt1, EB * fstb_RESTRICT err0_ptr, EB * fstb_RESTRICT err1_ptr) noexcept; + }; + + class DiffuseOstromoukhovBase + { + public: + struct TableEntry + { + int _c0; + int _c1; + int _c2; // Actually not used + int _sum; + float _inv_sum; // Possible optimization: store 1/_c0 and 1/_c1 instead of this field. 
+ }; + + static constexpr int _t_bits = 8; + static constexpr int _t_len = 1 << _t_bits; + static constexpr int _t_mask = _t_len - 1; + + static const std::array + _table; + }; + + template + class DiffuseOstromoukhovBase2 + : public DiffuseOstromoukhovBase + { + public: + template + static inline int + get_index (SRC_TYPE src_raw) noexcept; + static inline int + get_index (float src_raw) noexcept; + }; + + template + class DiffuseOstromoukhov + : public ErrDifAddParam + , public DiffuseOstromoukhovBase2 + { + public: + typedef DiffuseOstromoukhov ThisType; + template + static fstb_FORCEINLINE void + diffuse (int err, int & fstb_RESTRICT err_nxt0, int & fstb_RESTRICT err_nxt1, int16_t * fstb_RESTRICT err0_ptr, int16_t * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + diffuse (float err, float & fstb_RESTRICT err_nxt0, float & fstb_RESTRICT err_nxt1, float * fstb_RESTRICT err0_ptr, float * fstb_RESTRICT err1_ptr, SRC_TYPE src_raw) noexcept; + template + static fstb_FORCEINLINE void + prepare_next_line (EB * fstb_RESTRICT err_ptr) noexcept; + private: + template + static fstb_FORCEINLINE void + spread_error (ET e1, ET e2, ET e3, ET & fstb_RESTRICT err_nxt0, EB * fstb_RESTRICT err0_ptr) noexcept; + }; + + SplFmt _splfmt_src = SplFmt_ILLEGAL; + SplFmt _splfmt_dst = SplFmt_ILLEGAL; + int _src_res = 0; + int _dst_res = 0; + bool _full_range_in_flag = false; + bool _full_range_out_flag = false; + ColorFamily _color_fam = ColorFamily_INVALID; + int _nbr_planes = 0; + + std::array + _scale_info_arr; + bool _upconv_flag = false; + bool _sse2_flag = false; + bool _avx2_flag = false; + bool _range_def_flag = false; + + int _dmode = DMode_FAST; + int _pat_size = _max_pat_width; // Must be a divisor of _max_pat_width + double _ampo = 1; + double _ampn = 0; + bool _dyn_flag = false; + bool _static_noise_flag = false; + bool _correlated_planes_flag = false; + bool _tpdfo_flag = false; + bool _tpdfn_flag = false; + + bool _errdif_flag 
= false; // Indicates a dithering method using error diffusion. + bool _simple_flag = false; // Simplified implementation for ampo == 1 and ampn == 0 + PatDataArray _dither_pat_arr; // Contains levels for ordered dithering + + AmpInfo _amp; + + conc::ObjPool + _buf_pool; + std::unique_ptr + _buf_factory_uptr; + + void (* _process_seg_int_int_ptr) (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) = nullptr; + void (* _process_seg_flt_int_ptr) (uint8_t * fstb_RESTRICT dst_ptr, const uint8_t * fstb_RESTRICT src_ptr, int w, SegContext &ctx) = nullptr; + + + +/*\\\ FORBIDDEN MEMBER FUNCTIONS \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ + +private: + + Dither () = delete; + Dither (const Dither &other) = delete; + Dither (Dither &&other) = delete; + Dither & operator = (const Dither &other) = delete; + Dither & operator = (Dither &&other) = delete; + bool operator == (const Dither &other) const = delete; + bool operator != (const Dither &other) const = delete; + +}; // class Dither + + + +} // namespace fmtcl + + + +//#include "fmtcl/Dither.hpp" + + + +#endif // fmtcl_Dither_HEADER_INCLUDED + + + +/*\\\ EOF \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\*/ diff --git a/src/fmtcl/SplFmt.h b/src/fmtcl/SplFmt.h index 3b7421b..a02ac07 100644 --- a/src/fmtcl/SplFmt.h +++ b/src/fmtcl/SplFmt.h @@ -49,6 +49,8 @@ enum SplFmt +inline bool SplFmt_is_float (SplFmt fmt); +inline bool SplFmt_is_int (SplFmt fmt); inline int SplFmt_get_unit_size (SplFmt fmt); inline int SplFmt_get_data_size (SplFmt fmt); diff --git a/src/fmtcl/SplFmt.hpp b/src/fmtcl/SplFmt.hpp index d29be5b..cecea9e 100644 --- a/src/fmtcl/SplFmt.hpp +++ b/src/fmtcl/SplFmt.hpp @@ -35,6 +35,26 @@ namespace fmtcl +bool SplFmt_is_float (SplFmt fmt) +{ + assert (fmt >= 0); + assert (fmt < SplFmt_NBR_ELT); + + return (fmt == SplFmt_FLOAT); +} + + + +bool SplFmt_is_int (SplFmt fmt) +{ + assert (fmt >= 0); + assert (fmt < SplFmt_NBR_ELT); + + return (fmt != 
SplFmt_FLOAT); +} + + + int SplFmt_get_unit_size (SplFmt fmt) { assert (fmt >= 0);