From 6cae48d61aac221cbf82313f7aaa00d24fe411c1 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Mon, 17 Apr 2023 09:16:55 -0400 Subject: [PATCH] Move to using the FastMath from BasicBlocks part of the big SC/XT refactor share project I'm chipping away at and also aprt of https://github.com/surge-synthesizer/surge/issues/6532 This will make this library depend on sst-basic-blocks (and that dependency will get stronger as we move forward). Youc an auto-grab it with CPM if you want. --- CMakeLists.txt | 23 +++ include/sst/filters/CutoffWarp.h | 8 +- include/sst/filters/DiodeLadder.h | 3 +- include/sst/filters/K35Filter.h | 8 +- include/sst/filters/ResonanceWarp.h | 10 +- include/sst/filters/TriPoleFilter.h | 5 +- include/sst/filters/VintageLadders.h | 10 +- include/sst/utilities/FastMath.h | 222 --------------------------- 8 files changed, 46 insertions(+), 243 deletions(-) delete mode 100644 include/sst/utilities/FastMath.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ca80d3..8bc3705 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,14 @@ option(SST_FILTERS_BUILD_EXAMPLES "Add targets for building and running sst-filt if (SST_FILTERS_BUILD_TESTS OR SST_FILTERS_BUILD_EXAMPLES) message(STATUS "Importing SIMDE with CPM") + + if (NOT TARGET sst-basic-blocks) + CPMAddPackage(NAME sst-basic-blocks + GITHUB_REPOSITORY surge-synthesizer/sst-basic-blocks + GIT_TAG main + ) + endif() + CPMAddPackage(NAME simde GITHUB_REPOSITORY simd-everywhere/simde VERSION 0.7.2 @@ -37,6 +45,19 @@ if (SST_FILTERS_BUILD_TESTS OR SST_FILTERS_BUILD_EXAMPLES) target_include_directories(simde INTERFACE ${simde_SOURCE_DIR}) endif () +if (SST_GET_BASIC_BLOCKS) + CPMAddPackage(NAME sst-basic-blocks + GITHUB_REPOSITORY surge-synthesizer/sst-basic-blocks + GIT_TAG main + ) +endif() + +if (NOT TARGET sst-basic-blocks) + message(FATAL_ERROR "sst-basic-blocks is not available in this context. Set SST_GET_BASIC_BLOCKS=1 or add it") +else() + target_link_libraries(${PROJECT_NAME} INTERFACE sst-basic-blocks) +endif() + if (SST_FILTERS_BUILD_TESTS) add_subdirectory(tests) endif () @@ -44,3 +65,5 @@ endif () if(SST_FILTERS_BUILD_EXAMPLES) add_subdirectory(examples) endif() + + diff --git a/include/sst/filters/CutoffWarp.h b/include/sst/filters/CutoffWarp.h index 4afc658..f8cfa73 100644 --- a/include/sst/filters/CutoffWarp.h +++ b/include/sst/filters/CutoffWarp.h @@ -4,7 +4,7 @@ #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" #include "sst/utilities/basic_dsp.h" -#include "sst/utilities/FastMath.h" +#include "sst/basic-blocks/dsp/FastMath.h" /** * This namespace contains an adaptation of the filter found at @@ -89,7 +89,7 @@ static inline __m128 doNLFilter(const __m128 input, const __m128 a1, const __m12 nf = ojd_waveshaper_ps(out); break; default: // SAT_TANH; the removed SAT_SINE and others are also caught here - nf = utilities::DSP::fasttanhSSEclamped(out); + nf = basic_blocks::dsp::fasttanhSSEclamped(out); break; } @@ -136,8 +136,8 @@ void makeCoefficients(FilterCoefficientMaker *cm, float freq, fl const float normalisedFreq = 2.0f * clampedFrequency(freq, sampleRate, provider) / sampleRate; const float wc = (float)M_PI * normalisedFreq; - const float wsin = utilities::DSP::fastsin(wc); - const float wcos = utilities::DSP::fastcos(wc); + const float wsin = basic_blocks::dsp::fastsin(wc); + const float wcos = basic_blocks::dsp::fastcos(wc); const float alpha = wsin / (2.0f * q); // note we actually calculate the reciprocal of a0 because we only use a0 to normalize the diff --git a/include/sst/filters/DiodeLadder.h b/include/sst/filters/DiodeLadder.h index beec21d..3708ae9 100644 --- a/include/sst/filters/DiodeLadder.h +++ b/include/sst/filters/DiodeLadder.h @@ -2,6 +2,7 @@ #define SST_FILTERS_DIODELADDER_H #include "sst/utilities/basic_dsp.h" +#include "sst/basic-blocks/dsp/FastMath.h" #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" @@ -79,7 +80,7 @@ void makeCoefficients(FilterCoefficientMaker *cm, float freq, fl float sampleRate, float sampleRateInv, TuningProvider *provider) { const float wd = clampedFrequency(freq, sampleRate, provider) * 2.0f * (float)M_PI; - const float wa = (2.0f * sampleRate) * utilities::DSP::fasttan(wd * sampleRateInv * 0.5f); + const float wa = (2.0f * sampleRate) * basic_blocks::dsp::fasttan(wd * sampleRateInv * 0.5f); const float g = wa * sampleRateInv * 0.5f; const float G4 = 0.5f * g / (1.0f + g); diff --git a/include/sst/filters/K35Filter.h b/include/sst/filters/K35Filter.h index 6830e07..0e9cbb5 100644 --- a/include/sst/filters/K35Filter.h +++ b/include/sst/filters/K35Filter.h @@ -1,7 +1,7 @@ #ifndef SST_FILTERS_K35FILTER_H #define SST_FILTERS_K35FILTER_H -#include "sst/utilities/FastMath.h" +#include "sst/basic-blocks/dsp/FastMath.h" #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" @@ -72,7 +72,7 @@ void makeCoefficients(FilterCoefficientMaker *cm, float freq, fl float C[n_cm_coeffs]; const float wd = clampedFrequency(freq, sampleRate, provider) * 2.0f * (float)M_PI; - const float wa = (2.0f * sampleRate) * utilities::DSP::fasttan(wd * sampleRateInv * 0.5f); + const float wa = (2.0f * sampleRate) * basic_blocks::dsp::fasttan(wd * sampleRateInv * 0.5f); const float g = wa * sampleRateInv * 0.5f; const float gp1 = (1.0f + g); // g plus 1 const float G = g / gp1; @@ -120,7 +120,7 @@ inline __m128 process_lp(QuadFilterUnitState *__restrict f, __m128 input) const __m128 s35 = A(M(f->C[k35_lb], f->R[k35_2z]), M(f->C[k35_hb], f->R[k35_hz])); // alpha * (y1 + s35) const __m128 u_clean = M(f->C[k35_alpha], A(y1, s35)); - const __m128 u_driven = utilities::DSP::fasttanhSSEclamped(M(u_clean, f->C[k35_saturation])); + const __m128 u_driven = basic_blocks::dsp::fasttanhSSEclamped(M(u_clean, f->C[k35_saturation])); const __m128 u = A(M(u_clean, f->C[k35_saturation_blend_inv]), M(u_driven, f->C[k35_saturation_blend])); @@ -145,7 +145,7 @@ inline __m128 process_hp(QuadFilterUnitState *__restrict f, __m128 input) // mk * lpf2(u) const __m128 y_clean = M(f->C[k35_k], u); - const __m128 y_driven = utilities::DSP::fasttanhSSEclamped(M(y_clean, f->C[k35_saturation])); + const __m128 y_driven = basic_blocks::dsp::fasttanhSSEclamped(M(y_clean, f->C[k35_saturation])); const __m128 y = A(M(y_clean, f->C[k35_saturation_blend_inv]), M(y_driven, f->C[k35_saturation_blend])); diff --git a/include/sst/filters/ResonanceWarp.h b/include/sst/filters/ResonanceWarp.h index 0b1d133..34f058f 100644 --- a/include/sst/filters/ResonanceWarp.h +++ b/include/sst/filters/ResonanceWarp.h @@ -4,7 +4,7 @@ #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" #include "sst/utilities/basic_dsp.h" -#include "sst/utilities/FastMath.h" +#include "sst/basic-blocks/dsp/FastMath.h" /** * This contains an adaptation of the filter found at @@ -53,8 +53,8 @@ static inline __m128 doNLFilter(const __m128 input, const __m128 a1, const __m12 switch (sat) { case SAT_TANH: - z1 = utilities::DSP::fasttanhSSEclamped(z1); - z2 = utilities::DSP::fasttanhSSEclamped(z2); + z1 = basic_blocks::dsp::fasttanhSSEclamped(z1); + z2 = basic_blocks::dsp::fasttanhSSEclamped(z2); break; default: z1 = utilities::softclip_ps(z1); // note, this is a bit different to Jatin's softclipper @@ -98,8 +98,8 @@ void makeCoefficients(FilterCoefficientMaker *cm, float freq, fl const float wc = 2.0f * (float)M_PI * clampedFrequency(freq, sampleRate, provider) / sampleRate; - const float wsin = utilities::DSP::fastsin(wc); - const float wcos = utilities::DSP::fastcos(wc); + const float wsin = basic_blocks::dsp::fastsin(wc); + const float wcos = basic_blocks::dsp::fastcos(wc); const float alpha = wsin / (2.0f * q); // note we actually calculate the reciprocal of a0 because we only use a0 to normalize the diff --git a/include/sst/filters/TriPoleFilter.h b/include/sst/filters/TriPoleFilter.h index 62351c5..df3a1c4 100644 --- a/include/sst/filters/TriPoleFilter.h +++ b/include/sst/filters/TriPoleFilter.h @@ -3,6 +3,7 @@ #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" +#include "sst/basic-blocks/dsp/FastMath.h" /** * This filter is an emulation of the "Threeler" VCF by @@ -261,7 +262,7 @@ static inline __m128 res_func_ps(__m128 x) auto x_less_than = _mm_cmplt_ps(x_abs, F(max_val)); auto y = - A(N(utilities::DSP::fastexpSSE(M(F(beta_exp), N(utilities::abs_ps(A(x, F(c))))))), F(bias)); + A(N(basic_blocks::dsp::fastexpSSE(M(F(beta_exp), N(utilities::abs_ps(A(x, F(c))))))), F(bias)); y = M(sign_ps(x), M(y, F(oneOverMult))); return _mm_or_ps(_mm_and_ps(x_less_than, M(x, F(oneOverMult))), _mm_andnot_ps(x_less_than, y)); @@ -274,7 +275,7 @@ static inline __m128 res_deriv_ps(__m128 x) auto x_abs = utilities::abs_ps(x); auto x_less_than = _mm_cmplt_ps(x_abs, F(max_val)); - auto y = A(utilities::DSP::fastexpSSE(M(F(beta_exp), N(utilities::abs_ps(A(x, F(c)))))), + auto y = A(basic_blocks::dsp::fastexpSSE(M(F(beta_exp), N(utilities::abs_ps(A(x, F(c)))))), F(betaExpOverMult)); return _mm_or_ps(_mm_and_ps(x_less_than, F(one)), _mm_andnot_ps(x_less_than, y)); diff --git a/include/sst/filters/VintageLadders.h b/include/sst/filters/VintageLadders.h index 8d740f6..1e9f7ef 100644 --- a/include/sst/filters/VintageLadders.h +++ b/include/sst/filters/VintageLadders.h @@ -3,7 +3,7 @@ #include "sst/utilities/globals.h" #include "sst/utilities/basic_dsp.h" -#include "sst/utilities/FastMath.h" +#include "sst/basic-blocks/dsp/FastMath.h" #include "QuadFilterUnit.h" #include "FilterCoefficientMaker.h" @@ -365,7 +365,7 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in) auto acr = A(M(mneg39364, fc2), A(M(m18409, fc), m09968)); // auto tune = (1.0 - exp(-((2 * M_PI) * f * fcr))) / thermal; - auto tune = M(S(one, utilities::DSP::fastexpSSE(M(neg2pi, M(fr, fcr)))), oneoverthermal); + auto tune = M(S(one, basic_blocks::dsp::fastexpSSE(M(neg2pi, M(fr, fcr)))), oneoverthermal); // auto resquad = 4.0 * res * arc; auto resquad = M(four, M(res, acr)); @@ -380,7 +380,7 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in) // delay[0] = stage[0] = delay[0] + tune * (tanh(input * thermal) - stageTanh[0]); f->R[h_stage + 0] = - A(f->R[h_delay + 0], M(tune, S(utilities::DSP::fasttanhSSEclamped(M(input, thermal)), + A(f->R[h_delay + 0], M(tune, S(basic_blocks::dsp::fasttanhSSEclamped(M(input, thermal)), f->R[h_stageTanh + 0]))); f->R[h_delay + 0] = f->R[h_stage + 0]; @@ -391,11 +391,11 @@ inline __m128 process(QuadFilterUnitState *__restrict f, __m128 in) // stage[k] = delay[k] + tune * ((stageTanh[k-1] = tanh(input * thermal)) - (k != 3 ? // stageTanh[k] : tanh(delay[k] * thermal))); - f->R[h_stageTanh + k - 1] = utilities::DSP::fasttanhSSEclamped(M(input, thermal)); + f->R[h_stageTanh + k - 1] = basic_blocks::dsp::fasttanhSSEclamped(M(input, thermal)); f->R[h_stage + k] = A(f->R[h_delay + k], M(tune, S(f->R[h_stageTanh + k - 1], (k != 3 ? f->R[h_stageTanh + k] - : utilities::DSP::fasttanhSSEclamped( + : basic_blocks::dsp::fasttanhSSEclamped( M(f->R[h_delay + k], thermal)))))); // delay[k] = stage[k]; diff --git a/include/sst/utilities/FastMath.h b/include/sst/utilities/FastMath.h deleted file mode 100644 index 3b9f367..0000000 --- a/include/sst/utilities/FastMath.h +++ /dev/null @@ -1,222 +0,0 @@ -#pragma once - -#include "globals.h" -#include - -/* -** Fast Math Approximations to various Functions -** -** Documentation on each one -*/ - -/* -** These are directly copied from JUCE6 modules/juce_dsp/mathos/juce_FastMathApproximations.h -** -** Since JUCE6 is GPL3 and Surge is GPL3 that copy is fine, but I did want to explicitly -** acknowledge it -*/ - -namespace sst::filters::utilities::DSP -{ - -// JUCE6 Pade approximation of sin valid from -PI to PI with max error of 1e-5 and average error of -// 5e-7 -inline float fastsin(float x) noexcept -{ - auto x2 = x * x; - auto numerator = -x * (-(float)11511339840 + - x2 * ((float)1640635920 + x2 * (-(float)52785432 + x2 * (float)479249))); - auto denominator = - (float)11511339840 + x2 * ((float)277920720 + x2 * ((float)3177720 + x2 * (float)18361)); - return numerator / denominator; -} - -inline __m128 fastsinSSE(__m128 x) noexcept -{ -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define S(a, b) _mm_sub_ps(a, b) -#define F(a) _mm_set_ps1(a) -#define C(x) __m128 m##x = F((float)x) - - /* - auto numerator = -x * (-(float)11511339840 + - x2 * ((float)1640635920 + x2 * (-(float)52785432 + x2 * (float)479249))); - auto denominator = - (float)11511339840 + x2 * ((float)277920720 + x2 * ((float)3177720 + x2 * (float)18361)); - */ - C(11511339840); - C(1640635920); - C(52785432); - C(479249); - C(277920720); - C(3177720); - C(18361); - auto mnegone = F(-1); - - auto x2 = M(x, x); - - auto num = M(mnegone, - M(x, S(M(x2, A(m1640635920, M(x2, S(M(x2, m479249), m52785432)))), m11511339840))); - auto den = A(m11511339840, M(x2, A(m277920720, M(x2, A(m3177720, M(x2, m18361)))))); - -#undef C -#undef M -#undef A -#undef S -#undef F - return _mm_div_ps(num, den); -} - -// JUCE6 Pade approximation of cos valid from -PI to PI with max error of 1e-5 and average error of -// 5e-7 -inline float fastcos(float x) noexcept -{ - auto x2 = x * x; - auto numerator = -(-(float)39251520 + x2 * ((float)18471600 + x2 * (-1075032 + 14615 * x2))); - auto denominator = (float)39251520 + x2 * (1154160 + x2 * (16632 + x2 * 127)); - return numerator / denominator; -} - -inline __m128 fastcosSSE(__m128 x) noexcept -{ -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define S(a, b) _mm_sub_ps(a, b) -#define F(a) _mm_set_ps1(a) -#define C(x) __m128 m##x = F((float)x) - - // auto x2 = x * x; - auto x2 = M(x, x); - - C(39251520); - C(18471600); - C(1075032); - C(14615); - C(1154160); - C(16632); - C(127); - - // auto numerator = -(-(float)39251520 + x2 * ((float)18471600 + x2 * (-1075032 + 14615 * x2))); - auto num = S(m39251520, M(x2, A(m18471600, M(x2, S(M(m14615, x2), m1075032))))); - - // auto denominator = (float)39251520 + x2 * (1154160 + x2 * (16632 + x2 * 127)); - auto den = A(m39251520, M(x2, A(m1154160, M(x2, A(m16632, M(x2, m127)))))); -#undef C -#undef M -#undef A -#undef S -#undef F - return _mm_div_ps(num, den); -} - -/* -** Push x into -PI, PI periodically. There is probably a faster way to do this -*/ -inline float clampToPiRange(float x) -{ - if (x <= (float)M_PI && x >= -(float)M_PI) - return x; - float y = x + (float)M_PI; // so now I am 0,2PI - - // float p = fmod( y, 2.0 * M_PI ); - - constexpr float oo2p = 1.0f / (2.0f * (float)M_PI); - float p = y - 2.0f * (float)M_PI * (int)(y * oo2p); - - if (p < 0) - p += 2.0f * (float)M_PI; - return p - (float)M_PI; -} - -inline __m128 clampToPiRangeSSE(__m128 x) -{ - const auto mpi = _mm_set1_ps((float)M_PI); - const auto m2pi = _mm_set1_ps(2.0f * (float)M_PI); - const auto oo2p = _mm_set1_ps(1.0f / (2.0f * (float)M_PI)); - const auto mz = _mm_setzero_ps(); - - auto y = _mm_add_ps(x, mpi); - auto yip = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_mul_ps(y, oo2p))); - auto p = _mm_sub_ps(y, _mm_mul_ps(m2pi, yip)); - auto off = _mm_and_ps(_mm_cmplt_ps(p, mz), m2pi); - p = _mm_add_ps(p, off); - - return _mm_sub_ps(p, mpi); -} - -/* -** Valid in range -5, 5 -*/ -inline float fasttanh(float x) noexcept -{ - auto x2 = x * x; - auto numerator = x * (135135 + x2 * (17325 + x2 * (378 + x2))); - auto denominator = 135135 + x2 * (62370 + x2 * (3150 + 28 * x2)); - return numerator / denominator; -} - -// Valid in range (-PI/2, PI/2) -inline float fasttan(float x) noexcept -{ - auto x2 = x * x; - auto numerator = x * (-135135 + x2 * (17325 + x2 * (-378 + x2))); - auto denominator = -135135 + x2 * (62370 + x2 * (-3150 + 28 * x2)); - return numerator / denominator; -} - -inline __m128 fasttanhSSE(__m128 x) -{ - const __m128 m135135 = _mm_set_ps1(135135), m17325 = _mm_set_ps1(17325), - m378 = _mm_set_ps1(378), m62370 = _mm_set_ps1(62370), - m3150 = _mm_set_ps1(3150), m28 = _mm_set_ps1(28); - -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) - - auto x2 = M(x, x); - auto num = M(x, A(m135135, M(x2, A(m17325, M(x2, A(m378, x2)))))); - auto den = A(m135135, M(x2, A(m62370, M(x2, A(m3150, M(m28, x2)))))); - -#undef M -#undef A - - return _mm_div_ps(num, den); -} - -inline __m128 fasttanhSSEclamped(__m128 x) -{ - auto xc = _mm_min_ps(_mm_set_ps1(5), _mm_max_ps(_mm_set_ps1(-5), x)); - return fasttanhSSE(xc); -} - -/* -** Valid in range -6, 4 -*/ -inline float fastexp(float x) noexcept -{ - auto numerator = 1680 + x * (840 + x * (180 + x * (20 + x))); - auto denominator = 1680 + x * (-840 + x * (180 + x * (-20 + x))); - return numerator / denominator; -} - -inline __m128 fastexpSSE(__m128 x) noexcept -{ -#define M(a, b) _mm_mul_ps(a, b) -#define A(a, b) _mm_add_ps(a, b) -#define F(a) _mm_set_ps1(a) - - const __m128 m1680 = F(1680), m840 = F(840), mneg840 = F(-840), m180 = F(180), - m20 = F(20), mneg20 = F(-20); - - auto num = A(m1680, M(x, A(m840, M(x, A(m180, M(x, A(m20, x))))))); - auto den = A(m1680, M(x, A(mneg840, M(x, A(m180, M(x, A(mneg20, x))))))); - - return _mm_div_ps(num, den); - -#undef M -#undef A -#undef F -} - -} // namespace sst::filters::utilities::DSP