diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 7a52491f..82023dba 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -44,6 +44,8 @@ add_subdirectory(sst/sst-cpputils) # set(SST_PLUGININFRA_FILESYSTEM_FORCE_PLATFORM ON CACHE BOOL "Force platform filesystem") set(SST_PLUGININFRA_PROVIDE_TINYXML ON CACHE BOOL "Get TinyXML") # need this for UserDefaults set(SST_PLUGININFRA_PROVIDE_MINIZ ON CACHE BOOL "Get MiniZ") # need this for UserDefaults +set(SST_BASIC_BLOCKS_SIMD_OMIT_NATIVE_ALIASES ON CACHE BOOL "No Native Aliases for SCXT") # Makes ARM64EC use neon basically + add_subdirectory(sst/sst-basic-blocks) add_subdirectory(sst/sst-plugininfra) add_subdirectory(sst/sst-filters) diff --git a/libs/sst/sst-basic-blocks b/libs/sst/sst-basic-blocks index 9157ccc2..ee4a7505 160000 --- a/libs/sst/sst-basic-blocks +++ b/libs/sst/sst-basic-blocks @@ -1 +1 @@ -Subproject commit 9157ccc2eb7ddb7e028f1f848b4f24359701977d +Subproject commit ee4a75054f5982fd91b7d8a1dbe8a2ceb629a620 diff --git a/libs/sst/sst-effects b/libs/sst/sst-effects index 0f7c3291..94cc27df 160000 --- a/libs/sst/sst-effects +++ b/libs/sst/sst-effects @@ -1 +1 @@ -Subproject commit 0f7c329122cebcca379fe795ef47c0bbcefe7647 +Subproject commit 94cc27df03b38f001615f344149be399e9710f66 diff --git a/libs/sst/sst-filters b/libs/sst/sst-filters index 6514385c..fa419fd4 160000 --- a/libs/sst/sst-filters +++ b/libs/sst/sst-filters @@ -1 +1 @@ -Subproject commit 6514385c5ba5d5bd7143a4cbfccf3e42af514ba7 +Subproject commit fa419fd4d03f4984f4169fd13646baa259962803 diff --git a/libs/sst/sst-waveshapers b/libs/sst/sst-waveshapers index 86a2a81c..2615805a 160000 --- a/libs/sst/sst-waveshapers +++ b/libs/sst/sst-waveshapers @@ -1 +1 @@ -Subproject commit 86a2a81c446347a074b5d5d2be9d5d2d5cb5a8c4 +Subproject commit 2615805adfd7fc8db8eeafff6f4549b653d93cdf diff --git a/src/configuration.h b/src/configuration.h index 1690633a..c5b4b49f 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -28,8 +28,8 @@ #ifndef SCXT_SRC_CONFIGURATION_H #define SCXT_SRC_CONFIGURATION_H -#include -#include "infrastructure/sse_include.h" +#include // for size_t +#include // for uint16_t etc... namespace scxt { diff --git a/src/dsp/generator.cpp b/src/dsp/generator.cpp index 3e79e462..abc03f5b 100644 --- a/src/dsp/generator.cpp +++ b/src/dsp/generator.cpp @@ -26,7 +26,8 @@ */ #include "generator.h" -#include "infrastructure/sse_include.h" + +#include "sst/basic-blocks/simd/setup.h" #include "resampling.h" #include "data_tables.h" @@ -99,7 +100,7 @@ namespace scxt::dsp { constexpr float I16InvScale = (1.f / (16384.f * 32768.f)); constexpr float I16InvScale2 = (1.f / (32768.f)); -const __m128 I16InvScale_m128 = _mm_set1_ps(I16InvScale); +const auto I16InvScale_m128 = SIMD_MM(set1_ps)(I16InvScale); inline float getFadeGainToAmp(float g) { @@ -327,42 +328,48 @@ void KernelOp::Process( #endif // float32 path (SSE) - __m128 lipol0, tmp[4], sL4, sR4; - lipol0 = _mm_setzero_ps(); - lipol0 = _mm_cvtsi32_ss(lipol0, ks.SampleSubPos & 0xffff); - lipol0 = _mm_shuffle_ps(lipol0, lipol0, _MM_SHUFFLE(0, 0, 0, 0)); - tmp[0] = _mm_add_ps(_mm_mul_ps(*((__m128 *)&sincTable.SincOffsetF32[m0]), lipol0), - *((__m128 *)&sincTable.SincTableF32[m0])); - tmp[1] = _mm_add_ps(_mm_mul_ps(*((__m128 *)&sincTable.SincOffsetF32[m0 + 4]), lipol0), - *((__m128 *)&sincTable.SincTableF32[m0 + 4])); - tmp[2] = _mm_add_ps(_mm_mul_ps(*((__m128 *)&sincTable.SincOffsetF32[m0 + 8]), lipol0), - *((__m128 *)&sincTable.SincTableF32[m0 + 8])); - tmp[3] = _mm_add_ps(_mm_mul_ps(*((__m128 *)&sincTable.SincOffsetF32[m0 + 12]), lipol0), - *((__m128 *)&sincTable.SincTableF32[m0 + 12])); - sL4 = _mm_mul_ps(tmp[0], _mm_loadu_ps(readSampleL)); - sL4 = _mm_add_ps(sL4, _mm_mul_ps(tmp[1], _mm_loadu_ps(readSampleL + 4))); - sL4 = _mm_add_ps(sL4, _mm_mul_ps(tmp[2], _mm_loadu_ps(readSampleL + 8))); - sL4 = _mm_add_ps(sL4, _mm_mul_ps(tmp[3], _mm_loadu_ps(readSampleL + 12))); + SIMD_M128 lipol0, tmp[4], sL4, sR4; + lipol0 = SIMD_MM(setzero_ps)(); + lipol0 = SIMD_MM(cvtsi32_ss)(lipol0, ks.SampleSubPos & 0xffff); + lipol0 = SIMD_MM(shuffle_ps)(lipol0, lipol0, SIMD_MM_SHUFFLE(0, 0, 0, 0)); + tmp[0] = SIMD_MM(add_ps)(SIMD_MM(mul_ps)(*((SIMD_M128 *)&sincTable.SincOffsetF32[m0]), lipol0), + *((SIMD_M128 *)&sincTable.SincTableF32[m0])); + tmp[1] = + SIMD_MM(add_ps)(SIMD_MM(mul_ps)(*((SIMD_M128 *)&sincTable.SincOffsetF32[m0 + 4]), lipol0), + *((SIMD_M128 *)&sincTable.SincTableF32[m0 + 4])); + tmp[2] = + SIMD_MM(add_ps)(SIMD_MM(mul_ps)(*((SIMD_M128 *)&sincTable.SincOffsetF32[m0 + 8]), lipol0), + *((SIMD_M128 *)&sincTable.SincTableF32[m0 + 8])); + tmp[3] = + SIMD_MM(add_ps)(SIMD_MM(mul_ps)(*((SIMD_M128 *)&sincTable.SincOffsetF32[m0 + 12]), lipol0), + *((SIMD_M128 *)&sincTable.SincTableF32[m0 + 12])); + sL4 = SIMD_MM(mul_ps)(tmp[0], SIMD_MM(loadu_ps)(readSampleL)); + sL4 = SIMD_MM(add_ps)(sL4, SIMD_MM(mul_ps)(tmp[1], SIMD_MM(loadu_ps)(readSampleL + 4))); + sL4 = SIMD_MM(add_ps)(sL4, SIMD_MM(mul_ps)(tmp[2], SIMD_MM(loadu_ps)(readSampleL + 8))); + sL4 = SIMD_MM(add_ps)(sL4, SIMD_MM(mul_ps)(tmp[3], SIMD_MM(loadu_ps)(readSampleL + 12))); // sL4 = sst::basic_blocks::mechanics::sum_ps_to_ss(sL4); - sL4 = _mm_hadd_ps(sL4, sL4); - sL4 = _mm_hadd_ps(sL4, sL4); + sL4 = SIMD_MM(hadd_ps)(sL4, sL4); + sL4 = SIMD_MM(hadd_ps)(sL4, sL4); - _mm_store_ss(&OutputL[i], sL4); + SIMD_MM(store_ss)(&OutputL[i], sL4); if constexpr (LOOP_ACTIVE) { if (ks.fadeActive) { - sR4 = _mm_mul_ps(tmp[0], _mm_loadu_ps(readFadeSampleL)); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[1], _mm_loadu_ps(readFadeSampleL + 4))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[2], _mm_loadu_ps(readFadeSampleL + 8))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[3], _mm_loadu_ps(readFadeSampleL + 12))); + sR4 = SIMD_MM(mul_ps)(tmp[0], SIMD_MM(loadu_ps)(readFadeSampleL)); + sR4 = SIMD_MM(add_ps)(sR4, + SIMD_MM(mul_ps)(tmp[1], SIMD_MM(loadu_ps)(readFadeSampleL + 4))); + sR4 = SIMD_MM(add_ps)(sR4, + SIMD_MM(mul_ps)(tmp[2], SIMD_MM(loadu_ps)(readFadeSampleL + 8))); + sR4 = SIMD_MM(add_ps)(sR4, + SIMD_MM(mul_ps)(tmp[3], SIMD_MM(loadu_ps)(readFadeSampleL + 12))); // sR4 = sst::basic_blocks::mechanics::sum_ps_to_ss(sR4); - sR4 = _mm_hadd_ps(sR4, sR4); - sR4 = _mm_hadd_ps(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); float fadeVal{0.f}; - _mm_store_ss(&fadeVal, sR4); + SIMD_MM(store_ss)(&fadeVal, sR4); auto fadeGain( getFadeGain(ks.SamplePos, GD->loopUpperBound - ks.loopFade, GD->loopUpperBound)); auto aOut = getFadeGainToAmp(1.f - fadeGain); @@ -378,30 +385,33 @@ void KernelOp::Process( auto readFadeSampleR{ks.ReadFadeSample[1]}; auto OutputR{ks.Output[1]}; - sR4 = _mm_mul_ps(tmp[0], _mm_loadu_ps(readSampleR)); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[1], _mm_loadu_ps(readSampleR + 4))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[2], _mm_loadu_ps(readSampleR + 8))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[3], _mm_loadu_ps(readSampleR + 12))); + sR4 = SIMD_MM(mul_ps)(tmp[0], SIMD_MM(loadu_ps)(readSampleR)); + sR4 = SIMD_MM(add_ps)(sR4, SIMD_MM(mul_ps)(tmp[1], SIMD_MM(loadu_ps)(readSampleR + 4))); + sR4 = SIMD_MM(add_ps)(sR4, SIMD_MM(mul_ps)(tmp[2], SIMD_MM(loadu_ps)(readSampleR + 8))); + sR4 = SIMD_MM(add_ps)(sR4, SIMD_MM(mul_ps)(tmp[3], SIMD_MM(loadu_ps)(readSampleR + 12))); // sR4 = sst::basic_blocks::mechanics::sum_ps_to_ss(sR4); - sR4 = _mm_hadd_ps(sR4, sR4); - sR4 = _mm_hadd_ps(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); - _mm_store_ss(&OutputR[i], sR4); + SIMD_MM(store_ss)(&OutputR[i], sR4); if constexpr (LOOP_ACTIVE) { if (ks.fadeActive) { - sR4 = _mm_mul_ps(tmp[0], _mm_loadu_ps(readFadeSampleR)); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[1], _mm_loadu_ps(readFadeSampleR + 4))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[2], _mm_loadu_ps(readFadeSampleR + 8))); - sR4 = _mm_add_ps(sR4, _mm_mul_ps(tmp[3], _mm_loadu_ps(readFadeSampleR + 12))); + sR4 = SIMD_MM(mul_ps)(tmp[0], SIMD_MM(loadu_ps)(readFadeSampleR)); + sR4 = SIMD_MM(add_ps)( + sR4, SIMD_MM(mul_ps)(tmp[1], SIMD_MM(loadu_ps)(readFadeSampleR + 4))); + sR4 = SIMD_MM(add_ps)( + sR4, SIMD_MM(mul_ps)(tmp[2], SIMD_MM(loadu_ps)(readFadeSampleR + 8))); + sR4 = SIMD_MM(add_ps)( + sR4, SIMD_MM(mul_ps)(tmp[3], SIMD_MM(loadu_ps)(readFadeSampleR + 12))); // sR4 = sst::basic_blocks::mechanics::sum_ps_to_ss(sR4); - sR4 = _mm_hadd_ps(sR4, sR4); - sR4 = _mm_hadd_ps(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); + sR4 = SIMD_MM(hadd_ps)(sR4, sR4); float fadeVal{0.f}; - _mm_store_ss(&fadeVal, sR4); + SIMD_MM(store_ss)(&fadeVal, sR4); auto fadeGain(getFadeGain(ks.SamplePos, GD->loopUpperBound - ks.loopFade, GD->loopUpperBound)); auto aOut = getFadeGainToAmp(1.f - fadeGain); @@ -435,41 +445,43 @@ void KernelOp::Process( // int16 // SSE2 path - __m128i lipol0, tmp, sL8A, sR8A, tmp2, sL8B, sR8B; - __m128 fL = _mm_setzero_ps(), fR = _mm_setzero_ps(); - lipol0 = _mm_set1_epi16(ks.SampleSubPos & 0xffff); - - tmp = _mm_add_epi16(_mm_mulhi_epi16(*((__m128i *)&sincTable.SincOffsetI16[m0]), lipol0), - *((__m128i *)&sincTable.SincTableI16[m0])); - sL8A = _mm_madd_epi16(tmp, _mm_loadu_si128((__m128i *)readSampleL)); + SIMD_M128I lipol0, tmp, sL8A, sR8A, tmp2, sL8B, sR8B; + auto fL = SIMD_MM(setzero_ps)(), fR = SIMD_MM(setzero_ps)(); + lipol0 = SIMD_MM(set1_epi16)(ks.SampleSubPos & 0xffff); + + tmp = SIMD_MM(add_epi16)( + SIMD_MM(mulhi_epi16)(*((SIMD_M128I *)&sincTable.SincOffsetI16[m0]), lipol0), + *((SIMD_M128I *)&sincTable.SincTableI16[m0])); + sL8A = SIMD_MM(madd_epi16)(tmp, SIMD_MM(loadu_si128)((SIMD_M128I *)readSampleL)); if constexpr (stereo) - sR8A = _mm_madd_epi16(tmp, _mm_loadu_si128((__m128i *)readSampleR)); + sR8A = SIMD_MM(madd_epi16)(tmp, SIMD_MM(loadu_si128)((SIMD_M128I *)readSampleR)); - tmp2 = _mm_add_epi16(_mm_mulhi_epi16(*((__m128i *)&sincTable.SincOffsetI16[m0 + 8]), lipol0), - *((__m128i *)&sincTable.SincTableI16[m0 + 8])); - sL8B = _mm_madd_epi16(tmp2, _mm_loadu_si128((__m128i *)(readSampleL + 8))); + tmp2 = SIMD_MM(add_epi16)( + SIMD_MM(mulhi_epi16)(*((SIMD_M128I *)&sincTable.SincOffsetI16[m0 + 8]), lipol0), + *((SIMD_M128I *)&sincTable.SincTableI16[m0 + 8])); + sL8B = SIMD_MM(madd_epi16)(tmp2, SIMD_MM(loadu_si128)((SIMD_M128I *)(readSampleL + 8))); if constexpr (stereo) - sR8B = _mm_madd_epi16(tmp2, _mm_loadu_si128((__m128i *)(readSampleR + 8))); + sR8B = SIMD_MM(madd_epi16)(tmp2, SIMD_MM(loadu_si128)((SIMD_M128I *)(readSampleR + 8))); - sL8A = _mm_add_epi32(sL8A, sL8B); + sL8A = SIMD_MM(add_epi32)(sL8A, sL8B); if constexpr (stereo) - sR8A = _mm_add_epi32(sR8A, sR8B); + sR8A = SIMD_MM(add_epi32)(sR8A, sR8B); int l alignas(16)[4], r alignas(16)[4]; - _mm_store_si128((__m128i *)&l, sL8A); + SIMD_MM(store_si128)((SIMD_M128I *)&l, sL8A); if constexpr (stereo) - _mm_store_si128((__m128i *)&r, sR8A); + SIMD_MM(store_si128)((SIMD_M128I *)&r, sR8A); l[0] = (l[0] + l[1]) + (l[2] + l[3]); if constexpr (stereo) r[0] = (r[0] + r[1]) + (r[2] + r[3]); - fL = _mm_mul_ss(_mm_cvtsi32_ss(fL, l[0]), I16InvScale_m128); + fL = SIMD_MM(mul_ss)(SIMD_MM(cvtsi32_ss)(fL, l[0]), I16InvScale_m128); if constexpr (stereo) - fR = _mm_mul_ss(_mm_cvtsi32_ss(fR, r[0]), I16InvScale_m128); + fR = SIMD_MM(mul_ss)(SIMD_MM(cvtsi32_ss)(fR, r[0]), I16InvScale_m128); - _mm_store_ss(&OutputL[i], fL); + SIMD_MM(store_ss)(&OutputL[i], fL); if constexpr (stereo) - _mm_store_ss(&OutputR[i], fR); + SIMD_MM(store_ss)(&OutputR[i], fR); if constexpr (LOOP_ACTIVE) { @@ -481,34 +493,37 @@ void KernelOp::Process( readFadeSampleR = ks.ReadFadeSample[1]; } - sL8A = _mm_madd_epi16(tmp, _mm_loadu_si128((__m128i *)readFadeSampleL)); + sL8A = SIMD_MM(madd_epi16)(tmp, SIMD_MM(loadu_si128)((SIMD_M128I *)readFadeSampleL)); if constexpr (stereo) - sR8A = _mm_madd_epi16(tmp, _mm_loadu_si128((__m128i *)readFadeSampleR)); - sL8B = _mm_madd_epi16(tmp2, _mm_loadu_si128((__m128i *)(readFadeSampleL + 8))); + sR8A = + SIMD_MM(madd_epi16)(tmp, SIMD_MM(loadu_si128)((SIMD_M128I *)readFadeSampleR)); + sL8B = SIMD_MM(madd_epi16)(tmp2, + SIMD_MM(loadu_si128)((SIMD_M128I *)(readFadeSampleL + 8))); if constexpr (stereo) - sR8B = _mm_madd_epi16(tmp2, _mm_loadu_si128((__m128i *)(readFadeSampleR + 8))); + sR8B = SIMD_MM(madd_epi16)( + tmp2, SIMD_MM(loadu_si128)((SIMD_M128I *)(readFadeSampleR + 8))); - sL8A = _mm_add_epi32(sL8A, sL8B); + sL8A = SIMD_MM(add_epi32)(sL8A, sL8B); if constexpr (stereo) - sR8A = _mm_add_epi32(sR8A, sR8B); + sR8A = SIMD_MM(add_epi32)(sR8A, sR8B); int l alignas(16)[4], r alignas(16)[4]; - _mm_store_si128((__m128i *)&l, sL8A); + SIMD_MM(store_si128)((SIMD_M128I *)&l, sL8A); if constexpr (stereo) - _mm_store_si128((__m128i *)&r, sR8A); + SIMD_MM(store_si128)((SIMD_M128I *)&r, sR8A); l[0] = (l[0] + l[1]) + (l[2] + l[3]); if constexpr (stereo) r[0] = (r[0] + r[1]) + (r[2] + r[3]); - fL = _mm_mul_ss(_mm_cvtsi32_ss(fL, l[0]), I16InvScale_m128); + fL = SIMD_MM(mul_ss)(SIMD_MM(cvtsi32_ss)(fL, l[0]), I16InvScale_m128); if constexpr (stereo) - fR = _mm_mul_ss(_mm_cvtsi32_ss(fR, r[0]), I16InvScale_m128); + fR = SIMD_MM(mul_ss)(SIMD_MM(cvtsi32_ss)(fR, r[0]), I16InvScale_m128); float fadeValL{0.f}; float fadeValR{0.f}; - _mm_store_ss(&fadeValL, fL); + SIMD_MM(store_ss)(&fadeValL, fL); if constexpr (stereo) - _mm_store_ss(&fadeValR, fR); + SIMD_MM(store_ss)(&fadeValR, fR); auto fadeGain( getFadeGain(ks.SamplePos, GD->loopUpperBound - ks.loopFade, GD->loopUpperBound)); diff --git a/src/dsp/processor/processor.h b/src/dsp/processor/processor.h index da7595e0..40b25ac7 100644 --- a/src/dsp/processor/processor.h +++ b/src/dsp/processor/processor.h @@ -76,8 +76,9 @@ #include #include #include +#include -#include "infrastructure/sse_include.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/block-ops.h" #include "datamodel/metadata.h" diff --git a/src/dsp/processor/processor_defs.h b/src/dsp/processor/processor_defs.h index f895862a..7e72fb47 100644 --- a/src/dsp/processor/processor_defs.h +++ b/src/dsp/processor/processor_defs.h @@ -30,7 +30,8 @@ #include "processor.h" #include "datamodel/metadata.h" -#include "infrastructure/sse_include.h" + +#include "sst/basic-blocks/simd/setup.h" #include #include "sst/basic-blocks/dsp/BlockInterpolators.h" diff --git a/src/engine/bus.cpp b/src/engine/bus.cpp index 34a49fe4..f9957ae7 100644 --- a/src/engine/bus.cpp +++ b/src/engine/bus.cpp @@ -27,13 +27,13 @@ #include "bus.h" +#include "sst/basic-blocks/simd/setup.h" + #include "configuration.h" #include "dsp/data_tables.h" #include "engine.h" -#include "infrastructure/sse_include.h" - #include "dsp/data_tables.h" #include "tuning/equal.h" diff --git a/src/engine/group.cpp b/src/engine/group.cpp index ecb966e1..dc3f92f5 100644 --- a/src/engine/group.cpp +++ b/src/engine/group.cpp @@ -29,7 +29,7 @@ #include "bus.h" #include "part.h" -#include "infrastructure/sse_include.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/block-ops.h" #include "sst/basic-blocks/dsp/PanLaws.h" diff --git a/src/engine/group.h b/src/engine/group.h index cc83eb75..d16fb07c 100644 --- a/src/engine/group.h +++ b/src/engine/group.h @@ -27,8 +27,6 @@ #ifndef SCXT_SRC_ENGINE_GROUP_H #define SCXT_SRC_ENGINE_GROUP_H -#include - #include #include #include diff --git a/src/engine/part.cpp b/src/engine/part.cpp index d4a191ed..955557a5 100644 --- a/src/engine/part.cpp +++ b/src/engine/part.cpp @@ -33,7 +33,7 @@ #include "selection/selection_manager.h" -#include "infrastructure/sse_include.h" +#include "sst/basic-blocks/simd/setup.h" #include "sst/basic-blocks/mechanics/block-ops.h" namespace scxt::engine