From be2b4dd3b5b5a9612ee5903f8740546626ae0664 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Fri, 16 Jul 2021 01:28:13 +0900 Subject: [PATCH] implemented (#24), fixed (#27) --- .github/workflows/pull_request_test.yml | 20 +- .github/workflows/release.yml | 20 +- Benchmark.vcxproj | 377 +++++++- Benchmark.vcxproj.filters | 23 +- BenchmarkMv.vcxproj | 370 +++++++- BenchmarkMv.vcxproj.filters | 2 +- EigenRand.sln | 146 +++- EigenRand/Core.h | 14 +- EigenRand/Dists/Basic.h | 206 ++++- EigenRand/Dists/NormalExp.h | 14 +- EigenRand/EigenRand | 4 +- EigenRand/Macro.h | 6 +- EigenRand/MorePacketMath.h | 1057 +---------------------- EigenRand/PacketFilter.h | 2 +- EigenRand/PacketRandomEngine.h | 4 +- EigenRand/RandUtils.h | 12 +- EigenRand/arch/AVX/MorePacketMath.h | 601 +++++++++++++ EigenRand/arch/NEON/MorePacketMath.h | 65 ++ EigenRand/arch/SSE/MorePacketMath.h | 487 +++++++++++ README.md | 5 + TestAccuracy.vcxproj | 366 +++++++- TestAccuracy.vcxproj.filters | 2 +- {test => benchmark}/accuracy.cpp | 95 +- {test => benchmark}/benchmark.cpp | 4 +- {test => benchmark}/benchmark_mv.cpp | 0 {test => benchmark}/comp_scipy.py | 0 doxygen/Doxyfile | 2 +- test/packages.config | 4 + test/test.cpp | 72 ++ test/test.vcxproj | 130 +++ 30 files changed, 2954 insertions(+), 1156 deletions(-) create mode 100644 EigenRand/arch/AVX/MorePacketMath.h create mode 100644 EigenRand/arch/NEON/MorePacketMath.h create mode 100644 EigenRand/arch/SSE/MorePacketMath.h rename {test => benchmark}/accuracy.cpp (99%) rename {test => benchmark}/benchmark.cpp (99%) rename {test => benchmark}/benchmark_mv.cpp (100%) rename {test => benchmark}/comp_scipy.py (100%) create mode 100644 test/packages.config create mode 100644 test/test.cpp create mode 100644 test/test.vcxproj diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index c4b0235..d875de0 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -26,20 +26,20 @@ jobs: mv eigen include - name: Build Bench run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark.cpp -o bench.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark.cpp -o bench.out - name: Run Bench run: | cat /proc/cpuinfo ./bench.out - name: Build BenchMv run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark_mv.cpp -o benchmv.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark_mv.cpp -o benchmv.out - name: Run BenchMv run: | ./benchmv.out - name: Build Accuracy run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/accuracy.cpp -o accuracy.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/accuracy.cpp -o accuracy.out - name: Run Accuracy run: | ./accuracy.out @@ -61,7 +61,7 @@ jobs: mv eigen-${{ matrix.eigenversion }} include - name: Build Bench run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark.cpp -o bench.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark.cpp -o bench.out - name: Run Bench continue-on-error: true run: | @@ -69,14 +69,14 @@ jobs: ./bench.out - name: Build BenchMv run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG 
-I./ -I./include -Wno-ignored-attributes test/benchmark_mv.cpp -o benchmv.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark_mv.cpp -o benchmv.out - name: Run BenchMv continue-on-error: true run: | ./benchmv.out - name: Build Accuracy run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/accuracy.cpp -o accuracy.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/accuracy.cpp -o accuracy.out - name: Run Accuracy run: | ./accuracy.out @@ -99,20 +99,20 @@ jobs: - uses: ilammy/msvc-dev-cmd@v1 - name: Build Bench run: | - cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:bench.exe .\test\benchmark.cpp + cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:bench.exe .\benchmark\benchmark.cpp - name: Run Bench run: | bash -c "cat /proc/cpuinfo" .\bench.exe - name: Build BenchMv run: | - cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:benchmv.exe .\test\benchmark_mv.cpp + cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:benchmv.exe .\benchmark\benchmark_mv.cpp - name: Run BenchMv run: | .\benchmv.exe - name: Build Accuracy run: | - cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:accuracy.exe .\test\accuracy.cpp + cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:accuracy.exe .\benchmark\accuracy.cpp - name: Run Accuracy run: | .\accuracy.exe @@ -134,4 +134,4 @@ jobs: - name: Test Reference run: | pip install scipy - python test/comp_scipy.py + python benchmark/comp_scipy.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5be30d5..836ac88 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,10 +33,10 @@ jobs: - name: Test Reference run: | pip install scipy - python test/comp_scipy.py + python benchmark/comp_scipy.py - name: Build Bench run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark.cpp -o bench.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark.cpp -o bench.out - name: Run Bench run: | cat /proc/cpuinfo @@ -44,7 +44,7 @@ jobs: ./bench.out - name: Build BenchMv run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark_mv.cpp -o benchmv.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark_mv.cpp -o benchmv.out - name: Run BenchMv run: | ./benchmv.out @@ -72,10 +72,10 @@ jobs: - name: Test Reference run: | pip install scipy - python test/comp_scipy.py + python benchmark/comp_scipy.py - name: Build Bench run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark.cpp -o bench.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark.cpp -o bench.out - name: Run Bench continue-on-error: true run: | @@ -84,7 +84,7 @@ jobs: ./bench.out - name: Build BenchMv run: | - g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes test/benchmark_mv.cpp -o benchmv.out + g++ -std=c++11 -g -O3 -${{ matrix.arch }} -DNDEBUG -I./ -I./include -Wno-ignored-attributes benchmark/benchmark_mv.cpp -o benchmv.out - name: Run BenchMv continue-on-error: true run: | @@ -113,18 +113,18 @@ jobs: - name: Test 
Reference run: | pip install scipy - python test/comp_scipy.py + python benchmark/comp_scipy.py - uses: ilammy/msvc-dev-cmd@v1 - name: Build Bench run: | - cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:bench.exe .\test\benchmark.cpp + cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:bench.exe .\benchmark\benchmark.cpp - name: Run Bench run: | bash -c "cat /proc/cpuinfo" .\bench.exe - name: Build BenchMv run: | - cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:benchmv.exe .\test\benchmark_mv.cpp + cl.exe /O2 ${{ matrix.arch }} /I.\ /I.\include /D "NDEBUG" /Fe:benchmv.exe .\benchmark\benchmark_mv.cpp - name: Run BenchMv run: | .\benchmv.exe @@ -146,4 +146,4 @@ jobs: - name: Test Reference run: | pip install scipy - python test/comp_scipy.py + python benchmark/comp_scipy.py diff --git a/Benchmark.vcxproj b/Benchmark.vcxproj index 974047d..571ce67 100644 --- a/Benchmark.vcxproj +++ b/Benchmark.vcxproj @@ -1,10 +1,34 @@ + + Debug + ARM + + + Debug + ARM64 + Debug Win32 + + RelAVX2 + ARM + + + RelAVX2 + ARM64 + + + RelAVX + ARM + + + RelAVX + ARM64 + RelAVX Win32 @@ -21,6 +45,14 @@ RelAVX2 x64 + + RelNoVect + ARM + + + RelNoVect + ARM64 + RelNoVect Win32 @@ -29,6 +61,14 @@ RelNoVect x64 + + RelSSE2 + ARM + + + RelSSE2 + ARM64 + RelSSE2 Win32 @@ -43,9 +83,9 @@ - - - + + + @@ -68,6 +108,9 @@ + + + 15.0 {F45E39EE-2863-4550-8531-31723AD0BC09} @@ -117,6 +160,18 @@ v142 Unicode + + Application + true + v142 + Unicode + + + Application + true + v142 + Unicode + Application false @@ -124,6 +179,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -131,6 +200,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -138,6 +221,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -145,6 +242,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + @@ -168,18 +279,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true @@ -189,6 +330,14 @@ true $(SolutionDir);E:\AddInclude;$(IncludePath) + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) @@ -209,18 +358,50 @@ false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + @@ -252,6 +433,38 @@ true + + + + + Level3 + Disabled + true + USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + + + + + + + Level3 + Disabled + true + USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + + @@ -349,6 +562,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + 
NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + StreamingSIMDExtensions2 + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + StreamingSIMDExtensions2 + + + Console + true + true + true + + @@ -368,6 +621,44 @@ true + + + + + Level3 + MaxSpeed + true + true + true + EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + @@ -388,6 +679,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + + + Console + true + true + true + + @@ -408,6 +739,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + true + true + + diff --git a/Benchmark.vcxproj.filters b/Benchmark.vcxproj.filters index 1711a81..4bf2f21 100644 --- a/Benchmark.vcxproj.filters +++ b/Benchmark.vcxproj.filters @@ -19,6 +19,18 @@ {9aa8288d-cadf-40d0-9665-a7e61ad37e47} + + {2dd5c265-07cd-4265-a1c4-fdd2908d029f} + + + {316761d9-7742-4fd3-b64d-4ccb959247d2} + + + {a01afb4b-04af-4be7-aa91-8670aff6591c} + + + {5ba81498-594f-45cf-90f7-230750bfecd4} + @@ -63,12 +75,21 @@ EigenRand\MvDists + + EigenRand\arch\SSE + + + EigenRand\arch\AVX + + + EigenRand\arch\NEON + - + Source Files diff --git a/BenchmarkMv.vcxproj b/BenchmarkMv.vcxproj index 38ceb6e..15e9539 100644 --- a/BenchmarkMv.vcxproj +++ b/BenchmarkMv.vcxproj @@ -1,10 +1,34 @@ + + Debug + ARM + + + Debug + ARM64 + Debug Win32 + + RelAVX2 + ARM + + + RelAVX2 + ARM64 + + + RelAVX + ARM + + + RelAVX + ARM64 + RelAVX Win32 @@ -21,6 +45,14 @@ RelAVX2 x64 + + RelNoVect + ARM + + + RelNoVect + ARM64 + RelNoVect Win32 @@ -29,6 +61,14 @@ RelNoVect x64 + + RelSSE2 + ARM + + + RelSSE2 + ARM64 + RelSSE2 Win32 @@ -46,7 +86,7 @@ - + 15.0 @@ -97,6 +137,18 @@ v142 Unicode + + Application + true + v142 + Unicode + + + Application + true + v142 + Unicode + Application false @@ -104,6 +156,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -111,6 +177,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -118,6 +198,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -125,6 +219,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + @@ -148,18 +256,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true @@ -169,6 +307,14 @@ true $(SolutionDir);E:\AddInclude;$(IncludePath) + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) @@ -189,18 +335,50 @@ false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + 
$(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + @@ -232,6 +410,38 @@ true + + + + + Level3 + Disabled + true + USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + + + + + + + Level3 + Disabled + true + USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + + @@ -329,6 +539,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + StreamingSIMDExtensions2 + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + StreamingSIMDExtensions2 + + + Console + true + true + true + + @@ -348,6 +598,44 @@ true + + + + + Level3 + MaxSpeed + true + true + true + EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + @@ -368,6 +656,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions + + + Console + true + true + true + + @@ -388,6 +716,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + AdvancedVectorExtensions2 + + + Console + true + true + true + + diff --git a/BenchmarkMv.vcxproj.filters b/BenchmarkMv.vcxproj.filters index 168a64f..b615f7d 100644 --- a/BenchmarkMv.vcxproj.filters +++ b/BenchmarkMv.vcxproj.filters @@ -14,7 +14,7 @@ - + Source Files diff --git a/EigenRand.sln b/EigenRand.sln index 862c2f0..d8f4ca9 100644 --- a/EigenRand.sln +++ b/EigenRand.sln @@ -5,84 +5,228 @@ VisualStudioVersion = 16.0.30517.126 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Benchmark", "Benchmark.vcxproj", "{F45E39EE-2863-4550-8531-31723AD0BC09}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TestAccuracy", "TestAccuracy.vcxproj", "{8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Accuracy", "TestAccuracy.vcxproj", "{8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BenchmarkMv", "BenchmarkMv.vcxproj", "{85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test\test.vcxproj", "{643D8602-FE0D-4EAF-841C-E690EE6E53FD}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|ARM = Debug|ARM 
+ Debug|ARM64 = Debug|ARM64 Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 + RelAVX|ARM = RelAVX|ARM + RelAVX|ARM64 = RelAVX|ARM64 RelAVX|x64 = RelAVX|x64 RelAVX|x86 = RelAVX|x86 + RelAVX2|ARM = RelAVX2|ARM + RelAVX2|ARM64 = RelAVX2|ARM64 RelAVX2|x64 = RelAVX2|x64 RelAVX2|x86 = RelAVX2|x86 + Release|ARM = Release|ARM + Release|ARM64 = Release|ARM64 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + RelNoVect|ARM = RelNoVect|ARM + RelNoVect|ARM64 = RelNoVect|ARM64 RelNoVect|x64 = RelNoVect|x64 RelNoVect|x86 = RelNoVect|x86 + RelSSE2|ARM = RelSSE2|ARM + RelSSE2|ARM64 = RelSSE2|ARM64 RelSSE2|x64 = RelSSE2|x64 RelSSE2|x86 = RelSSE2|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|ARM.ActiveCfg = Debug|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|ARM.Build.0 = Debug|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|ARM64.Build.0 = Debug|ARM64 {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|x64.ActiveCfg = Debug|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|x64.Build.0 = Debug|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|x86.ActiveCfg = Debug|Win32 {F45E39EE-2863-4550-8531-31723AD0BC09}.Debug|x86.Build.0 = Debug|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|ARM.ActiveCfg = RelAVX|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|ARM.Build.0 = RelAVX|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|ARM64.ActiveCfg = RelAVX|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|ARM64.Build.0 = RelAVX|ARM64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|x64.ActiveCfg = RelAVX|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|x64.Build.0 = RelAVX|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|x86.ActiveCfg = RelAVX|Win32 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX|x86.Build.0 = RelAVX|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|ARM.ActiveCfg = RelAVX2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|ARM.Build.0 = RelAVX2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|ARM64.ActiveCfg = RelAVX2|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|ARM64.Build.0 = RelAVX2|ARM64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|x64.ActiveCfg = RelAVX2|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|x64.Build.0 = RelAVX2|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|x86.ActiveCfg = RelAVX2|Win32 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelAVX2|x86.Build.0 = RelAVX2|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|ARM.ActiveCfg = RelSSE2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|ARM.Build.0 = RelSSE2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|ARM64.ActiveCfg = RelSSE2|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|ARM64.Build.0 = RelSSE2|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|x64.ActiveCfg = RelSSE2|x64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|x64.Build.0 = RelSSE2|x64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|x86.ActiveCfg = RelSSE2|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.Release|x86.Build.0 = RelSSE2|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|ARM.ActiveCfg = RelNoVect|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|ARM.Build.0 = RelNoVect|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|ARM64.ActiveCfg = RelNoVect|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|ARM64.Build.0 = RelNoVect|ARM64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|x64.ActiveCfg = RelNoVect|x64 
{F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|x64.Build.0 = RelNoVect|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|x86.ActiveCfg = RelNoVect|Win32 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelNoVect|x86.Build.0 = RelNoVect|Win32 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|ARM.ActiveCfg = RelSSE2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|ARM.Build.0 = RelSSE2|ARM + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|ARM64.ActiveCfg = RelSSE2|ARM64 + {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|ARM64.Build.0 = RelSSE2|ARM64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|x64.ActiveCfg = RelSSE2|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|x64.Build.0 = RelSSE2|x64 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|x86.ActiveCfg = RelSSE2|Win32 {F45E39EE-2863-4550-8531-31723AD0BC09}.RelSSE2|x86.Build.0 = RelSSE2|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|ARM.ActiveCfg = Debug|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|ARM.Build.0 = Debug|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|ARM64.Build.0 = Debug|ARM64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|x64.ActiveCfg = Debug|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|x64.Build.0 = Debug|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|x86.ActiveCfg = Debug|Win32 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Debug|x86.Build.0 = Debug|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|ARM.ActiveCfg = RelAVX|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|ARM.Build.0 = RelAVX|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|ARM64.ActiveCfg = RelAVX|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|ARM64.Build.0 = RelAVX|ARM64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|x64.ActiveCfg = RelAVX|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|x64.Build.0 = RelAVX|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|x86.ActiveCfg = RelAVX|Win32 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX|x86.Build.0 = RelAVX|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|ARM.ActiveCfg = RelAVX2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|ARM.Build.0 = RelAVX2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|ARM64.ActiveCfg = RelAVX2|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|ARM64.Build.0 = RelAVX2|ARM64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|x64.ActiveCfg = RelAVX2|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|x64.Build.0 = RelAVX2|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|x86.ActiveCfg = RelAVX2|Win32 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelAVX2|x86.Build.0 = RelAVX2|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|ARM.ActiveCfg = RelSSE2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|ARM.Build.0 = RelSSE2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|ARM64.ActiveCfg = RelSSE2|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|ARM64.Build.0 = RelSSE2|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|x64.ActiveCfg = RelSSE2|x64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|x64.Build.0 = RelSSE2|x64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|x86.ActiveCfg = RelSSE2|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.Release|x86.Build.0 = RelSSE2|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|ARM.ActiveCfg = RelNoVect|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|ARM.Build.0 = RelNoVect|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|ARM64.ActiveCfg = RelNoVect|ARM64 + 
{8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|ARM64.Build.0 = RelNoVect|ARM64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|x64.ActiveCfg = RelNoVect|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|x64.Build.0 = RelNoVect|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|x86.ActiveCfg = RelNoVect|Win32 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelNoVect|x86.Build.0 = RelNoVect|Win32 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|ARM.ActiveCfg = RelSSE2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|ARM.Build.0 = RelSSE2|ARM + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|ARM64.ActiveCfg = RelSSE2|ARM64 + {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|ARM64.Build.0 = RelSSE2|ARM64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|x64.ActiveCfg = RelSSE2|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|x64.Build.0 = RelSSE2|x64 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|x86.ActiveCfg = RelSSE2|Win32 {8A7F8C9A-2E06-4767-8BBA-E1DE1CB341AB}.RelSSE2|x86.Build.0 = RelSSE2|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|ARM.ActiveCfg = Debug|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|ARM.Build.0 = Debug|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|ARM64.ActiveCfg = Debug|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|ARM64.Build.0 = Debug|ARM64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|x64.ActiveCfg = Debug|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|x64.Build.0 = Debug|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|x86.ActiveCfg = Debug|Win32 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Debug|x86.Build.0 = Debug|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|ARM.ActiveCfg = RelAVX|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|ARM.Build.0 = RelAVX|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|ARM64.ActiveCfg = RelAVX|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|ARM64.Build.0 = RelAVX|ARM64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|x64.ActiveCfg = RelAVX|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|x64.Build.0 = RelAVX|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|x86.ActiveCfg = RelAVX|Win32 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX|x86.Build.0 = RelAVX|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|ARM.ActiveCfg = RelAVX2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|ARM.Build.0 = RelAVX2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|ARM64.ActiveCfg = RelAVX2|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|ARM64.Build.0 = RelAVX2|ARM64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|x64.ActiveCfg = RelAVX2|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|x64.Build.0 = RelAVX2|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|x86.ActiveCfg = RelAVX2|Win32 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelAVX2|x86.Build.0 = RelAVX2|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|ARM.ActiveCfg = RelSSE2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|ARM.Build.0 = RelSSE2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|ARM64.ActiveCfg = RelSSE2|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|ARM64.Build.0 = RelSSE2|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|x64.ActiveCfg = RelSSE2|x64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|x64.Build.0 = RelSSE2|x64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|x86.ActiveCfg = RelSSE2|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.Release|x86.Build.0 = RelSSE2|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|ARM.ActiveCfg = RelNoVect|ARM + 
{85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|ARM.Build.0 = RelNoVect|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|ARM64.ActiveCfg = RelNoVect|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|ARM64.Build.0 = RelNoVect|ARM64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|x64.ActiveCfg = RelNoVect|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|x64.Build.0 = RelNoVect|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|x86.ActiveCfg = RelNoVect|Win32 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelNoVect|x86.Build.0 = RelNoVect|Win32 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|ARM.ActiveCfg = RelSSE2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|ARM.Build.0 = RelSSE2|ARM + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|ARM64.ActiveCfg = RelSSE2|ARM64 + {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|ARM64.Build.0 = RelSSE2|ARM64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|x64.ActiveCfg = RelSSE2|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|x64.Build.0 = RelSSE2|x64 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|x86.ActiveCfg = RelSSE2|Win32 {85BACD1D-1D4F-4084-8C6A-5C3AC938FE50}.RelSSE2|x86.Build.0 = RelSSE2|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|ARM.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|ARM64.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|x64.ActiveCfg = Debug|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|x64.Build.0 = Debug|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|x86.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Debug|x86.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|ARM.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|ARM.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|ARM64.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|ARM64.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|x64.ActiveCfg = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|x64.Build.0 = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|x86.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX|x86.Build.0 = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|ARM.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|ARM.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|ARM64.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|ARM64.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|x64.ActiveCfg = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|x64.Build.0 = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|x86.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelAVX2|x86.Build.0 = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|ARM.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|ARM64.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|x64.ActiveCfg = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|x64.Build.0 = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|x86.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.Release|x86.Build.0 = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|ARM.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|ARM.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|ARM64.ActiveCfg = Debug|Win32 + 
{643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|ARM64.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|x64.ActiveCfg = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|x64.Build.0 = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|x86.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelNoVect|x86.Build.0 = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|ARM.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|ARM.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|ARM64.ActiveCfg = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|ARM64.Build.0 = Debug|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|x64.ActiveCfg = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|x64.Build.0 = Release|x64 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|x86.ActiveCfg = Release|Win32 + {643D8602-FE0D-4EAF-841C-E690EE6E53FD}.RelSSE2|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/EigenRand/Core.h b/EigenRand/Core.h index dd23126..ad6c704 100644 --- a/EigenRand/Core.h +++ b/EigenRand/Core.h @@ -13,15 +13,15 @@ #ifndef EIGENRAND_CORE_H #define EIGENRAND_CORE_H -#include +#include "RandUtils.h" -#include -#include -#include -#include +#include "Dists/Basic.h" +#include "Dists/Discrete.h" +#include "Dists/NormalExp.h" +#include "Dists/GammaPoisson.h" -#include -#include +#include "MvDists/MvNormal.h" +#include "MvDists/Multinomial.h" namespace Eigen { diff --git a/EigenRand/Dists/Basic.h b/EigenRand/Dists/Basic.h index ccd609c..8022522 100644 --- a/EigenRand/Dists/Basic.h +++ b/EigenRand/Dists/Basic.h @@ -238,6 +238,31 @@ namespace Eigen using OptCacheStore = CacheStore; + template + struct ExtractFirstUint; + + template<> + struct ExtractFirstUint + { + template + auto operator()(Packet v) -> decltype(Eigen::internal::pfirst(v)) + { + return Eigen::internal::pfirst(v); + } + }; + + template<> + struct ExtractFirstUint + { + template + auto operator()(Packet v) -> uint64_t + { + uint64_t arr[sizeof(Packet) / 8]; + Eigen::internal::pstoreu((Packet*)arr, v); + return arr[0]; + } + }; + /** * @brief Generator of random bits for integral scalars * @@ -296,13 +321,52 @@ namespace Eigen } }; + /** + * @brief Generator of reals in a range `[a, b]` + * + * @tparam _Scalar any real type + */ + template + class Balanced2Gen : public GenBase, _Scalar> + { + static_assert(std::is_floating_point<_Scalar>::value, "balanced needs floating point types."); + _Scalar slope = 2, bias = -1; + public: + using Scalar = _Scalar; + + /** + * @brief Construct a new balanced generator + * + * @param _a,_b left and right boundary + */ + Balanced2Gen(_Scalar _a = -1, _Scalar _b = 1) + : slope{ _b - _a }, bias{ _a } + { + } + + template + EIGEN_STRONG_INLINE const _Scalar operator() (Rng&& rng) + { + using namespace Eigen::internal; + return ((_Scalar)((int32_t)pfirst(std::forward(rng)()) & 0x7FFFFFFF) / 0x7FFFFFFF) * slope + bias; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(Rng&& rng) + { + using namespace Eigen::internal; + using RUtils = RandUtils; + return RUtils{}.balanced(std::forward(rng), slope, bias); + } + }; + /** * @brief Generator of reals in a range `[0, 1)` * * @tparam _Scalar any real type */ template - class UniformRealGen : public GenBase, _Scalar> + class StdUniformRealGen : public GenBase, _Scalar> { static_assert(std::is_floating_point<_Scalar>::value, 
"uniformReal needs floating point types."); @@ -313,14 +377,14 @@ namespace Eigen EIGEN_STRONG_INLINE const _Scalar operator() (Rng&& rng) { using namespace Eigen::internal; - return bit_scalar<_Scalar>{}.to_ur(pfirst(std::forward(rng)())); + return BitScalar<_Scalar>{}.to_ur(ExtractFirstUint<_Scalar>{}(std::forward(rng)())); } template EIGEN_STRONG_INLINE const _Scalar nzur_scalar(Rng&& rng) { using namespace Eigen::internal; - return bit_scalar<_Scalar>{}.to_nzur(pfirst(std::forward(rng)())); + return BitScalar<_Scalar>{}.to_nzur(ExtractFirstUint<_Scalar>{}(std::forward(rng)())); } template @@ -332,6 +396,44 @@ namespace Eigen } }; + template + class UniformRealGen : public GenBase, _Scalar> + { + static_assert(std::is_floating_point<_Scalar>::value, "uniformReal needs floating point types."); + _Scalar bias, slope; + + public: + using Scalar = _Scalar; + + UniformRealGen(_Scalar _min = 0, _Scalar _max = 1) + : bias{ _min }, slope{ _max - _min } + { + } + + UniformRealGen(const UniformRealGen&) = default; + UniformRealGen(UniformRealGen&&) = default; + + UniformRealGen& operator=(const UniformRealGen&) = default; + UniformRealGen& operator=(UniformRealGen&&) = default; + + template + EIGEN_STRONG_INLINE const _Scalar operator() (Rng&& rng) + { + using namespace Eigen::internal; + return bias + BitScalar<_Scalar>{}.to_ur(pfirst(std::forward(rng)())) * slope; + } + + template + EIGEN_STRONG_INLINE const Packet packetOp(Rng&& rng) + { + using namespace Eigen::internal; + using RUtils = RandUtils; + return padd(pmul( + RUtils{}.uniform_real(std::forward(rng)), pset1(slope) + ), pset1(bias)); + } + }; + /** * @brief Generator of Bernoulli distribution @@ -468,7 +570,53 @@ namespace Eigen } template - using UniformRealType = CwiseNullaryOp, typename Derived::Scalar, Urng, true>, const Derived>; + using Balanced2Type = CwiseNullaryOp, typename Derived::Scalar, Urng, true>, const Derived>; + + /** + * @brief generates reals in a range `[a, b]` + * + * @tparam Derived a type of Eigen::DenseBase + * @tparam Urng + * @param rows the number of rows being generated + * @param cols the number of columns being generated + * @param urng c++11-style random number generator + * @param a,b left and right boundary + * @return a random matrix expression with a shape (`rows`, `cols`) + * + * @see Eigen::Rand::BalancedGen + */ + template + inline const Balanced2Type + balanced(Index rows, Index cols, Urng&& urng, typename Derived::Scalar a, typename Derived::Scalar b) + { + return { + rows, cols, { std::forward(urng), Balanced2Gen{a, b} } + }; + } + + /** + * @brief generates reals in a range `[a, b]` + * + * @tparam Derived + * @tparam Urng + * @param o an instance of any type of Eigen::DenseBase + * @param urng c++11-style random number generator + * @param a,b left and right boundary + * @return a random matrix expression of the same shape as `o` + * + * @see Eigen::Rand::BalancedGen + */ + template + inline const Balanced2Type + balancedLike(const Derived& o, Urng&& urng, typename Derived::Scalar a, typename Derived::Scalar b) + { + return { + o.rows(), o.cols(), { std::forward(urng), Balanced2Gen{a, b} } + }; + } + + template + using StdUniformRealType = CwiseNullaryOp, typename Derived::Scalar, Urng, true>, const Derived>; /** * @brief generates reals in a range `[0, 1)` @@ -483,7 +631,7 @@ namespace Eigen * @see Eigen::Rand::UniformRealGen */ template - inline const UniformRealType + inline const StdUniformRealType uniformReal(Index rows, Index cols, Urng&& urng) { return { @@ -503,7 +651,7 @@ namespace 
Eigen * @see Eigen::Rand::UniformRealGen */ template - inline const UniformRealType + inline const StdUniformRealType uniformRealLike(Derived& o, Urng&& urng) { return { @@ -511,6 +659,52 @@ namespace Eigen }; } + template + using UniformRealType = CwiseNullaryOp, typename Derived::Scalar, Urng, true>, const Derived>; + + /** + * @brief generates reals in a range `[min, max)` + * + * @tparam Derived a type of Eigen::DenseBase + * @tparam Urng + * @param rows the number of rows being generated + * @param cols the number of columns being generated + * @param urng c++11-style random number generator + * @param min, max the range of reals being generated + * @return a random matrix expression with a shape (`rows`, `cols`) + * + * @see Eigen::Rand::UniformRealGen + */ + template + inline const UniformRealType + uniformReal(Index rows, Index cols, Urng&& urng, typename Derived::Scalar min, typename Derived::Scalar max) + { + return { + rows, cols, { std::forward(urng), UniformRealGen{ min, max } } + }; + } + + /** + * @brief generates reals in a range `[min, max)` + * + * @tparam Derived + * @tparam Urng + * @param o an instance of any type of Eigen::DenseBase + * @param urng c++11-style random number generator + * @param min, max the range of reals being generated + * @return a random matrix expression of the same shape as `o` + * + * @see Eigen::Rand::UniformRealGen + */ + template + inline const UniformRealType + uniformRealLike(Derived& o, Urng&& urng, typename Derived::Scalar min, typename Derived::Scalar max) + { + return { + o.rows(), o.cols(), { std::forward(urng), UniformRealGen{ min, max } } + }; + } + template using BernoulliType = CwiseNullaryOp, typename Derived::Scalar, Urng, true>, const Derived>; diff --git a/EigenRand/Dists/NormalExp.h b/EigenRand/Dists/NormalExp.h index 6e90a98..d6c9d07 100644 --- a/EigenRand/Dists/NormalExp.h +++ b/EigenRand/Dists/NormalExp.h @@ -27,7 +27,7 @@ namespace Eigen { static_assert(std::is_floating_point<_Scalar>::value, "normalDist needs floating point types."); bool valid = false; - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; public: using Scalar = _Scalar; @@ -186,7 +186,7 @@ namespace Eigen class StudentTGen : public GenBase, _Scalar> { static_assert(std::is_floating_point<_Scalar>::value, "studentT needs floating point types."); - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar n; public: @@ -256,7 +256,7 @@ namespace Eigen { friend GammaGen<_Scalar>; static_assert(std::is_floating_point<_Scalar>::value, "expDist needs floating point types."); - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar lambda = 1; public: @@ -481,7 +481,7 @@ namespace Eigen class WeibullGen : public GenBase, _Scalar> { static_assert(std::is_floating_point<_Scalar>::value, "weilbullDist needs floating point types."); - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar a = 1, b = 1; public: @@ -530,7 +530,7 @@ namespace Eigen class ExtremeValueGen : public GenBase, _Scalar> { static_assert(std::is_floating_point<_Scalar>::value, "extremeValueDist needs floating point types."); - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar a = 0, b = 1; public: @@ -622,7 +622,7 @@ namespace Eigen class CauchyGen : public GenBase, _Scalar> { static_assert(std::is_floating_point<_Scalar>::value, "cauchyDist needs floating point types."); - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar a = 0, b = 1; public: @@ -679,7 +679,7 @@ namespace Eigen friend FisherFGen<_Scalar>; 
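// Illustrative usage of the range-based factory functions added above in Dists/Basic.h.
// This is a minimal sketch, not taken from the patch itself: the matrix type, the seed,
// and the choice of std::mt19937_64 as the "c++11-style random number generator" are assumptions.
//
//   #include <Eigen/Dense>
//   #include <EigenRand/EigenRand>
//   #include <random>
//
//   std::mt19937_64 urng{ 42 };
//   // reals in [-2, 3), backed by the new UniformRealGen<float>{ min, max }
//   Eigen::ArrayXXf u = Eigen::Rand::uniformReal<Eigen::ArrayXXf>(4, 4, urng, -2.0f, 3.0f);
//   // reals in [1, 2], backed by the new Balanced2Gen<float>{ a, b }
//   Eigen::ArrayXXf b = Eigen::Rand::balanced<Eigen::ArrayXXf>(4, 4, urng, 1.0f, 2.0f);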
static_assert(std::is_floating_point<_Scalar>::value, "betaDist needs floating point types."); int cache_rest_cnt = 0; - UniformRealGen<_Scalar> ur; + StdUniformRealGen<_Scalar> ur; _Scalar a, b; GammaGen<_Scalar> gd1, gd2; diff --git a/EigenRand/EigenRand b/EigenRand/EigenRand index 28d218c..b278786 100644 --- a/EigenRand/EigenRand +++ b/EigenRand/EigenRand @@ -13,7 +13,7 @@ #define EIGENRAND_EIGENRAND_H #include -#include -#include +#include "Macro.h" +#include "Core.h" #endif \ No newline at end of file diff --git a/EigenRand/Macro.h b/EigenRand/Macro.h index 2836f6a..18cf375 100644 --- a/EigenRand/Macro.h +++ b/EigenRand/Macro.h @@ -2,8 +2,8 @@ * @file Macro.h * @author bab2min (bab2min@gmail.com) * @brief - * @version 0.3.4 - * @date 2021-04-25 + * @version 0.3.5 + * @date 2021-07-16 * * @copyright Copyright (c) 2020-2021 * @@ -14,7 +14,7 @@ #define EIGENRAND_WORLD_VERSION 0 #define EIGENRAND_MAJOR_VERSION 3 -#define EIGENRAND_MINOR_VERSION 4 +#define EIGENRAND_MINOR_VERSION 5 #if EIGEN_VERSION_AT_LEAST(3,3,4) #else diff --git a/EigenRand/MorePacketMath.h b/EigenRand/MorePacketMath.h index 3558dc3..61a0404 100644 --- a/EigenRand/MorePacketMath.h +++ b/EigenRand/MorePacketMath.h @@ -30,43 +30,8 @@ namespace Eigen template struct HalfPacket; -#ifdef EIGEN_VECTORIZE_AVX2 - template<> - struct IsIntPacket : std::true_type {}; - - template<> - struct HalfPacket - { - using type = Packet4i; - }; -#endif -#ifdef EIGEN_VECTORIZE_AVX - template<> - struct IsFloatPacket : std::true_type {}; - - template<> - struct IsDoublePacket : std::true_type {}; -#endif -#ifdef EIGEN_VECTORIZE_SSE2 - template<> - struct IsIntPacket : std::true_type {}; - - template<> - struct IsFloatPacket : std::true_type {}; - - template<> - struct IsDoublePacket : std::true_type {}; - - template<> - struct HalfPacket - { - using type = uint64_t; - }; -#endif template - struct reinterpreter - { - }; + struct reinterpreter{}; template inline auto reinterpret_to_float(const Packet& x) @@ -89,6 +54,9 @@ namespace Eigen return reinterpreter{}.to_int(x); } + template + EIGEN_STRONG_INLINE void split_two(const Packet& p, typename HalfPacket::type& a, typename HalfPacket::type& b); + template EIGEN_STRONG_INLINE Packet pseti64(uint64_t a); @@ -222,10 +190,10 @@ namespace Eigen } template - struct bit_scalar; + struct BitScalar; template<> - struct bit_scalar + struct BitScalar { float to_ur(uint32_t x) { @@ -245,7 +213,7 @@ namespace Eigen }; template<> - struct bit_scalar + struct BitScalar { double to_ur(uint64_t x) { @@ -272,7 +240,7 @@ namespace Eigen EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x) { - bit_scalar bs; + BitScalar bs; float2 ret; ret.f[0] = bs.to_ur(x & 0xFFFFFFFF); ret.f[1] = bs.to_ur(x >> 32); @@ -566,1017 +534,16 @@ namespace Eigen } #ifdef EIGEN_VECTORIZE_AVX -#include - -namespace Eigen -{ - namespace internal - { - template<> - struct reinterpreter - { - EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x) - { - return _mm256_castsi256_ps(x); - } - - EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x) - { - return _mm256_castsi256_pd(x); - } - - EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x) - { - return x; - } - }; - - template<> - struct reinterpreter - { - EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x) - { - return x; - } - - EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x) - { - return _mm256_castps_pd(x); - } - - EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x) - { - return _mm256_castps_si256(x); - } - }; - - template<> - struct reinterpreter - { - 
EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x) - { - return _mm256_castpd_ps(x); - } - - EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x) - { - return x; - } - - EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x) - { - return _mm256_castpd_si256(x); - } - }; - - EIGEN_STRONG_INLINE void split_two(const Packet8i& x, Packet4i& a, Packet4i& b) - { - a = _mm256_extractf128_si256(x, 0); - b = _mm256_extractf128_si256(x, 1); - } - - EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b) - { - return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1); - } - - EIGEN_STRONG_INLINE void split_two(const Packet8f& x, Packet4f& a, Packet4f& b) - { - a = _mm256_extractf128_ps(x, 0); - b = _mm256_extractf128_ps(x, 1); - } - - EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b) - { - return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1); - } - - - EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7))); -#else - auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2)); - return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i pseti64(uint64_t a) - { - return _mm256_set1_epi64x(a); - } - - template<> - EIGEN_STRONG_INLINE Packet8i padd64(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_add_epi64(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i psub64(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_sub_epi64(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i pcmpeq(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cmpeq_epi32(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i psll(const Packet8i& a, int b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(a, b); -#else - Packet4i a1, a2; - split_two(a, a1, a2); - return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i psrl(const Packet8i& a, int b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_srli_epi32(a, b); -#else - Packet4i a1, a2; - split_two(a, a1, a2); - return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i psll64(const Packet8i& a, int b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi64(a, b); -#else - Packet4i a1, a2; - split_two(a, a1, a2); - return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i psrl64(const Packet8i& a, int b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_srli_epi64(a, b); -#else - Packet4i a1, a2; - 
split_two(a, a1, a2); - return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b)); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_add_epi32(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2)); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_sub_epi32(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2)); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_and_si256(a, b); -#else - return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b))); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_andnot_si256(a, b); -#else - return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b))); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_or_si256(a, b); -#else - return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b))); -#endif - } - - template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_xor_si256(a, b); -#else - return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b))); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i pcmplt(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cmpgt_epi32(b, a); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i pcmplt64(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cmpgt_epi64(b, a); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8f pcmplt(const Packet8f& a, const Packet8f& b) - { - return _mm256_cmp_ps(a, b, _CMP_LT_OQ); - } - - template<> - EIGEN_STRONG_INLINE Packet8f pcmple(const Packet8f& a, const Packet8f& b) - { - return _mm256_cmp_ps(a, b, _CMP_LE_OQ); - } - - template<> - EIGEN_STRONG_INLINE Packet4d pcmplt(const Packet4d& a, const Packet4d& b) - { - return _mm256_cmp_pd(a, b, _CMP_LT_OQ); - } - - template<> - EIGEN_STRONG_INLINE Packet4d pcmple(const Packet4d& a, const Packet4d& b) - { - return _mm256_cmp_pd(a, b, _CMP_LE_OQ); - } - - template<> - EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) - { - return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket); - } - - template<> - EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) - { - return pblendv(_mm256_castsi256_ps(ifPacket), 
thenPacket, elsePacket); - } - - template<> - EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket) - { - return _mm256_castps_si256(_mm256_blendv_ps( - _mm256_castsi256_ps(elsePacket), - _mm256_castsi256_ps(thenPacket), - _mm256_castsi256_ps(ifPacket) - )); - } - - template<> - EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) - { - return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket); - } - - template<> - EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) - { - return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket); - } - - template<> - EIGEN_STRONG_INLINE Packet8i pgather(const int* addr, const Packet8i& index) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_i32gather_epi32(addr, index, 4); -#else - uint32_t u[8]; - _mm256_storeu_si256((Packet8i*)u, index); - return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]], - addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8f pgather(const float *addr, const Packet8i& index) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_i32gather_ps(addr, index, 4); -#else - uint32_t u[8]; - _mm256_storeu_si256((Packet8i*)u, index); - return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]], - addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4d pgather(const double *addr, const Packet8i& index, bool upperhalf) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8); -#else - uint32_t u[8]; - _mm256_storeu_si256((Packet8i*)u, index); - if (upperhalf) - { - return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); - } - else - { - return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); - } -#endif - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet8f& a) - { - return _mm256_movemask_ps(a); - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet4d& a) - { - return _mm256_movemask_pd(a); - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet8i& a) - { - return pmovemask(_mm256_castsi256_ps(a)); - } - - template<> - EIGEN_STRONG_INLINE Packet8f ptruncate(const Packet8f& a) - { - return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - } - - template<> - EIGEN_STRONG_INLINE Packet4d ptruncate(const Packet4d& a) - { - return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); - } - - template<> - EIGEN_STRONG_INLINE Packet8i pcmpeq64(const Packet8i& a, const Packet8i& b) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cmpeq_epi64(a, b); -#else - Packet4i a1, a2, b1, b2; - split_two(a, a1, a2); - split_two(b, b1, b2); - return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet8i pmuluadd64(const Packet8i& a, uint64_t b, uint64_t c) - { - uint64_t u[4]; - _mm256_storeu_si256((__m256i*)u, a); - u[0] = u[0] * b + c; - u[1] = u[1] * b + c; - u[2] = u[2] * b + c; - u[3] = u[3] * b + c; - return _mm256_loadu_si256((__m256i*)u); - } - - EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) { - auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000)); - return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000)); - } - - EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i 
x) { - x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); - return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000)); - } - - EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) { - x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000)); - return psub64( - _mm256_castpd_si256(x), - _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) - ); - } - - template<> - EIGEN_STRONG_INLINE Packet8i pcast64(const Packet4d& a) - { - return double_to_int64(a); - } - - template<> - EIGEN_STRONG_INLINE Packet4d pcast64(const Packet8i& a) - { - return int64_to_double(a); - } - - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED - Packet4d psin(const Packet4d& x) - { - return _psin(x); - } - - template <> - EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d - plog(const Packet4d& _x) { - Packet4d x = _x; - _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); - - auto inv_mant_mask = _mm256_castsi256_pd(pseti64(~0x7ff0000000000000)); - auto min_norm_pos = _mm256_castsi256_pd(pseti64(0x10000000000000)); - auto minus_inf = _mm256_castsi256_pd(pseti64(0xfff0000000000000)); - - // Polynomial coefficients. - _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4); - _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375); - - Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater equal is true if x is NaN - Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ); - - // Truncate input values to the minimum positive normal. - x = pmax(x, min_norm_pos); - - Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52)); - Packet4d e = psub(emm0, pset1(1022)); - - // Set the exponents to -1, i.e. x are in the range [0.5,1). - x = _mm256_and_pd(x, inv_mant_mask); - x = _mm256_or_pd(x, p4d_half); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ); - Packet4d tmp = _mm256_and_pd(x, mask); - x = psub(x, p4d_1); - e = psub(e, _mm256_and_pd(p4d_1, mask)); - x = padd(x, tmp); - - Packet4d x2 = pmul(x, x); - Packet4d x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. 
- Packet4d y, y1, y2; - y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1); - y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4); - y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7); - y = pmadd(y, x, p4d_cephes_log_p2); - y1 = pmadd(y1, x, p4d_cephes_log_p5); - y2 = pmadd(y2, x, p4d_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, p4d_cephes_log_q1); - tmp = pmul(x2, p4d_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p4d_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask)); - } - -#if EIGEN_VERSION_AT_LEAST(3,3,5) -#else - template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); - } - - template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); - } -#endif - } -} +#include "arch/AVX/MorePacketMath.h" #endif #ifdef EIGEN_VECTORIZE_SSE2 -#include - -namespace Eigen -{ - namespace internal - { - template<> - struct reinterpreter - { - EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x) - { - return _mm_castsi128_ps(x); - } - - EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x) - { - return _mm_castsi128_pd(x); - } - - EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x) - { - return x; - } - }; - - template<> - struct reinterpreter - { - EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x) - { - return x; - } - - EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x) - { - return _mm_castps_pd(x); - } - - EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x) - { - return _mm_castps_si128(x); - } - }; - - template<> - struct reinterpreter - { - EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x) - { - return _mm_castpd_ps(x); - } - - EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x) - { - return x; - } - - EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x) - { - return _mm_castpd_si128(x); - } - }; - - EIGEN_STRONG_INLINE void split_two(const Packet4i& x, uint64_t& a, uint64_t& b) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - a = _mm_extract_epi64(x, 0); - b = _mm_extract_epi64(x, 1); -#else - uint64_t u[2]; - _mm_storeu_si128((__m128i*)u, x); - a = u[0]; - b = u[1]; +#include "arch/SSE/MorePacketMath.h" #endif - } - - EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b) - { - auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); - auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1)); - sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0)); - sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1)); - return _mm_or_si128(sa, sb); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pseti64(uint64_t a) - { - return _mm_set1_epi64x(a); - } - - template<> - EIGEN_STRONG_INLINE Packet4i padd64(const Packet4i& a, const Packet4i& b) - { - return _mm_add_epi64(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i psub64(const Packet4i& a, const Packet4i& b) - { - return _mm_sub_epi64(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pcmpeq(const Packet4i& a, const Packet4i& b) - { - return _mm_cmpeq_epi32(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i psll(const Packet4i& a, int b) - { - return _mm_slli_epi32(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i psrl(const Packet4i& a, int b) - { - return _mm_srli_epi32(a, b); - } - - template<> - 
EIGEN_STRONG_INLINE Packet4i psll64(const Packet4i& a, int b) - { - return _mm_slli_epi64(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i psrl64(const Packet4i& a, int b) - { - return _mm_srli_epi64(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pcmplt(const Packet4i& a, const Packet4i& b) - { - return _mm_cmplt_epi32(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pcmplt64(const Packet4i& a, const Packet4i& b) - { -#ifdef EIGEN_VECTORIZE_SSE4_2 - return _mm_cmpgt_epi64(b, a); -#else - int64_t u[2], v[2]; - _mm_storeu_si128((__m128i*)u, a); - _mm_storeu_si128((__m128i*)v, b); - return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4f pcmplt(const Packet4f& a, const Packet4f& b) - { - return _mm_cmplt_ps(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4f pcmple(const Packet4f& a, const Packet4f& b) - { - return _mm_cmple_ps(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet2d pcmplt(const Packet2d& a, const Packet2d& b) - { - return _mm_cmplt_pd(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet2d pcmple(const Packet2d& a, const Packet2d& b) - { - return _mm_cmple_pd(a, b); - } - - template<> - EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_ps(elsePacket, thenPacket, ifPacket); -#else - return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket)); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) - { - return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket))); -#else - return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket)); +#ifdef EIGEN_VECTORIZE_NEON +#include "arch/NEON/MorePacketMath.h" #endif - } - template<> - EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_blendv_pd(elsePacket, thenPacket, ifPacket); -#else - return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket)); #endif - } - - template<> - EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) - { - return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pgather(const int* addr, const Packet4i& index) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm_i32gather_epi32(addr, index, 4); -#else - uint32_t u[4]; - _mm_storeu_si128((__m128i*)u, index); - return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4f pgather(const float* addr, const Packet4i& index) - { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm_i32gather_ps(addr, index, 4); -#else - uint32_t u[4]; - _mm_storeu_si128((__m128i*)u, index); - return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet2d pgather(const double* addr, const Packet4i& index, bool upperhalf) - { 
-#ifdef EIGEN_VECTORIZE_AVX2 - return _mm_i32gather_pd(addr, index, 8); -#else - uint32_t u[4]; - _mm_storeu_si128((__m128i*)u, index); - if (upperhalf) - { - return _mm_setr_pd(addr[u[2]], addr[u[3]]); - } - else - { - return _mm_setr_pd(addr[u[0]], addr[u[1]]); - } -#endif - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet4f& a) - { - return _mm_movemask_ps(a); - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet2d& a) - { - return _mm_movemask_pd(a); - } - - template<> - EIGEN_STRONG_INLINE int pmovemask(const Packet4i& a) - { - return pmovemask((Packet4f)_mm_castsi128_ps(a)); - } - - template<> - EIGEN_STRONG_INLINE Packet4f ptruncate(const Packet4f& a) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -#else - auto round = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); - auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); - _MM_SET_ROUNDING_MODE(round); - return ret; -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet2d ptruncate(const Packet2d& a) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); -#else - auto round = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); - auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a)); - _MM_SET_ROUNDING_MODE(round); - return ret; -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4i pcmpeq64(const Packet4i& a, const Packet4i& b) - { -#ifdef EIGEN_VECTORIZE_SSE4_1 - return _mm_cmpeq_epi64(a, b); -#else - Packet4i c = _mm_cmpeq_epi32(a, b); - return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1))); -#endif - } - - template<> - EIGEN_STRONG_INLINE Packet4i pmuluadd64(const Packet4i& a, uint64_t b, uint64_t c) - { - uint64_t u[2]; - _mm_storeu_si128((__m128i*)u, a); - u[0] = u[0] * b + c; - u[1] = u[1] * b + c; - return _mm_loadu_si128((__m128i*)u); - } - - EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) { - x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))); - return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000)); - } - - EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) { - x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))); - return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000)); - } - - EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) { - x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000)); - return _mm_sub_epi64( - _mm_castpd_si128(x), - _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)) - ); - } - - template<> - EIGEN_STRONG_INLINE Packet4i pcast64(const Packet2d& a) - { - return double_to_int64(a); - } - - template<> - EIGEN_STRONG_INLINE Packet2d pcast64(const Packet4i& a) - { - return int64_to_double(a); - } - - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED - Packet2d psin(const Packet2d& x) - { - return _psin(x); - } - - template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED - Packet2d plog(const Packet2d& _x) - { - Packet2d x = _x; - _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f); - _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); - - auto inv_mant_mask = _mm_castsi128_pd(pseti64(~0x7ff0000000000000)); - auto min_norm_pos = _mm_castsi128_pd(pseti64(0x10000000000000)); - auto minus_inf = _mm_castsi128_pd(pseti64(0xfff0000000000000)); - - /* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 - */ - _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 
0.707106781186547524); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375); - - - Packet4i emm0; - - Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); // not greater equal is true if x is NaN - Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd()); - - x = pmax(x, min_norm_pos); /* cut off denormalized stuff */ - emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52); - - /* keep only the fractional part */ - x = _mm_and_pd(x, inv_mant_mask); - x = _mm_or_pd(x, p2d_half); - - Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1(1022)); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF); - Packet2d tmp = pand(x, mask); - x = psub(x, p2d_1); - e = psub(e, pand(p2d_1, mask)); - x = padd(x, tmp); - - Packet2d x2 = pmul(x, x); - Packet2d x3 = pmul(x2, x); - - Packet2d y, y1, y2; - y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1); - y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4); - y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7); - y = pmadd(y, x, p2d_cephes_log_p2); - y1 = pmadd(y1, x, p2d_cephes_log_p5); - y2 = pmadd(y2, x, p2d_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y1 = pmul(e, p2d_cephes_log_q1); - tmp = pmul(x2, p2d_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, p2d_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - // negative arg will be NAN, 0 will be -INF - return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask)); - } - } -} -#endif - -#endif diff --git a/EigenRand/PacketFilter.h b/EigenRand/PacketFilter.h index 692bb31..bf6093d 100644 --- a/EigenRand/PacketFilter.h +++ b/EigenRand/PacketFilter.h @@ -13,7 +13,7 @@ #define EIGENRAND_PACKET_FILTER_H #include -#include +#include "MorePacketMath.h" namespace Eigen { diff --git a/EigenRand/PacketRandomEngine.h b/EigenRand/PacketRandomEngine.h index 055f0a7..20bf9c6 100644 --- a/EigenRand/PacketRandomEngine.h +++ b/EigenRand/PacketRandomEngine.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include "MorePacketMath.h" #include namespace Eigen @@ -537,7 +537,7 @@ namespace Eigen float uniform_real() { - internal::bit_scalar bs; + internal::BitScalar bs; return bs.to_ur(this->operator()()); } }; diff --git a/EigenRand/RandUtils.h b/EigenRand/RandUtils.h index 155a5a2..afca40c 100644 --- a/EigenRand/RandUtils.h +++ b/EigenRand/RandUtils.h @@ -12,9 +12,9 @@ #ifndef EIGENRAND_RAND_UTILS_H #define EIGENRAND_RAND_UTILS_H -#include -#include -#include +#include "MorePacketMath.h" +#include "PacketFilter.h" +#include "PacketRandomEngine.h" namespace Eigen { @@ -38,6 +38,12 @@ namespace Eigen return psub(pmul(this->zero_to_one(rng), pset1(2)), pset1(1)); } + template + EIGEN_STRONG_INLINE PacketType balanced(Rng& rng, Scalar slope, Scalar bias) + { + return padd(pmul(this->zero_to_one(rng), pset1(slope)), 
pset1(bias)); + } + EIGEN_STRONG_INLINE PacketType nonzero_uniform_real(Rng& rng) { constexpr auto epsilon = std::numeric_limits::type>::epsilon() / 8; diff --git a/EigenRand/arch/AVX/MorePacketMath.h b/EigenRand/arch/AVX/MorePacketMath.h new file mode 100644 index 0000000..ce48db8 --- /dev/null +++ b/EigenRand/arch/AVX/MorePacketMath.h @@ -0,0 +1,601 @@ +/** + * @file MorePacketMath.h + * @author bab2min (bab2min@gmail.com) + * @brief + * @version 0.3.5 + * @date 2021-07-16 + * + * @copyright Copyright (c) 2020-2021 + * + */ + +#ifndef EIGENRAND_MORE_PACKET_MATH_AVX_H +#define EIGENRAND_MORE_PACKET_MATH_AVX_H + +#include + +namespace Eigen +{ + namespace internal + { +#ifdef EIGEN_VECTORIZE_AVX2 + template<> + struct IsIntPacket : std::true_type {}; + + template<> + struct HalfPacket + { + using type = Packet4i; + }; + + template<> + struct HalfPacket + { + using type = Packet4f; + }; +#endif + template<> + struct IsFloatPacket : std::true_type {}; + + template<> + struct IsDoublePacket : std::true_type {}; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x) + { + return _mm256_castsi256_ps(x); + } + + EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x) + { + return _mm256_castsi256_pd(x); + } + + EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x) + { + return x; + } + }; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x) + { + return x; + } + + EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x) + { + return _mm256_castps_pd(x); + } + + EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x) + { + return _mm256_castps_si256(x); + } + }; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x) + { + return _mm256_castpd_ps(x); + } + + EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x) + { + return x; + } + + EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x) + { + return _mm256_castpd_si256(x); + } + }; + + template<> + EIGEN_STRONG_INLINE void split_two(const Packet8i& x, Packet4i& a, Packet4i& b) + { + a = _mm256_extractf128_si256(x, 0); + b = _mm256_extractf128_si256(x, 1); + } + + EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b) + { + return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1); + } + + template<> + EIGEN_STRONG_INLINE void split_two(const Packet8f& x, Packet4f& a, Packet4f& b) + { + a = _mm256_extractf128_ps(x, 0); + b = _mm256_extractf128_ps(x, 1); + } + + EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b) + { + return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1); + } + + + EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7))); +#else + auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2)); + return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i pseti64(uint64_t a) + { + return _mm256_set1_epi64x(a); + } + + template<> + EIGEN_STRONG_INLINE Packet8i padd64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi64(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2)); +#endif + 
} + + template<> + EIGEN_STRONG_INLINE Packet8i psub64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi64(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcmpeq(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi32(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i psll(const Packet8i& a, int b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(a, b); +#else + Packet4i a1, a2; + split_two(a, a1, a2); + return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i psrl(const Packet8i& a, int b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi32(a, b); +#else + Packet4i a1, a2; + split_two(a, a1, a2); + return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i psll64(const Packet8i& a, int b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi64(a, b); +#else + Packet4i a1, a2; + split_two(a, a1, a2); + return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i psrl64(const Packet8i& a, int b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_srli_epi64(a, b); +#else + Packet4i a1, a2; + split_two(a, a1, a2); + return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b)); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_add_epi32(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2)); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi32(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2)); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_and_si256(a, b); +#else + return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b))); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_andnot_si256(a, b); +#else + return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b))); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_or_si256(a, b); +#else + return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b))); +#endif + } + + template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_xor_si256(a, b); +#else + return 
reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b))); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcmplt(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpgt_epi32(b, a); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcmplt64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpgt_epi64(b, a); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8f pcmplt(const Packet8f& a, const Packet8f& b) + { + return _mm256_cmp_ps(a, b, _CMP_LT_OQ); + } + + template<> + EIGEN_STRONG_INLINE Packet8f pcmple(const Packet8f& a, const Packet8f& b) + { + return _mm256_cmp_ps(a, b, _CMP_LE_OQ); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pcmplt(const Packet4d& a, const Packet4d& b) + { + return _mm256_cmp_pd(a, b, _CMP_LT_OQ); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pcmple(const Packet4d& a, const Packet4d& b) + { + return _mm256_cmp_pd(a, b, _CMP_LE_OQ); + } + + template<> + EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) + { + return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket); + } + + template<> + EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) + { + return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket); + } + + template<> + EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket) + { + return _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(elsePacket), + _mm256_castsi256_ps(thenPacket), + _mm256_castsi256_ps(ifPacket) + )); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) + { + return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) + { + return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket); + } + + template<> + EIGEN_STRONG_INLINE Packet8i pgather(const int* addr, const Packet8i& index) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_i32gather_epi32(addr, index, 4); +#else + uint32_t u[8]; + _mm256_storeu_si256((Packet8i*)u, index); + return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]], + addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8f pgather(const float* addr, const Packet8i& index) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_i32gather_ps(addr, index, 4); +#else + uint32_t u[8]; + _mm256_storeu_si256((Packet8i*)u, index); + return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]], + addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4d pgather(const double* addr, const Packet8i& index, bool upperhalf) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8); +#else + uint32_t u[8]; + _mm256_storeu_si256((Packet8i*)u, index); + if 
(upperhalf) + { + return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]); + } + else + { + return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); + } +#endif + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet8f& a) + { + return _mm256_movemask_ps(a); + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet4d& a) + { + return _mm256_movemask_pd(a); + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet8i& a) + { + return pmovemask(_mm256_castsi256_ps(a)); + } + + template<> + EIGEN_STRONG_INLINE Packet8f ptruncate(const Packet8f& a) + { + return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + } + + template<> + EIGEN_STRONG_INLINE Packet4d ptruncate(const Packet4d& a) + { + return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcmpeq64(const Packet8i& a, const Packet8i& b) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cmpeq_epi64(a, b); +#else + Packet4i a1, a2, b1, b2; + split_two(a, a1, a2); + split_two(b, b1, b2); + return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet8i pmuluadd64(const Packet8i& a, uint64_t b, uint64_t c) + { + uint64_t u[4]; + _mm256_storeu_si256((__m256i*)u, a); + u[0] = u[0] * b + c; + u[1] = u[1] * b + c; + u[2] = u[2] * b + c; + u[3] = u[3] * b + c; + return _mm256_loadu_si256((__m256i*)u); + } + + EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) { + auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000)); + return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000)); + } + + EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) { + x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); + return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000)); + } + + EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) { + x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000)); + return psub64( + _mm256_castpd_si256(x), + _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)) + ); + } + + template<> + EIGEN_STRONG_INLINE Packet8i pcast64(const Packet4d& a) + { + return double_to_int64(a); + } + + template<> + EIGEN_STRONG_INLINE Packet4d pcast64(const Packet8i& a) + { + return int64_to_double(a); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet4d psin(const Packet4d& x) + { + return _psin(x); + } + + template <> + EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d + plog(const Packet4d& _x) { + Packet4d x = _x; + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + auto inv_mant_mask = _mm256_castsi256_pd(pseti64(~0x7ff0000000000000)); + auto min_norm_pos = _mm256_castsi256_pd(pseti64(0x10000000000000)); + auto minus_inf = _mm256_castsi256_pd(pseti64(0xfff0000000000000)); + + // Polynomial coefficients. 
+ _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375); + + Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater equal is true if x is NaN + Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ); + + // Truncate input values to the minimum positive normal. + x = pmax(x, min_norm_pos); + + Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52)); + Packet4d e = psub(emm0, pset1(1022)); + + // Set the exponents to -1, i.e. x are in the range [0.5,1). + x = _mm256_and_pd(x, inv_mant_mask); + x = _mm256_or_pd(x, p4d_half); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ); + Packet4d tmp = _mm256_and_pd(x, mask); + x = psub(x, p4d_1); + e = psub(e, _mm256_and_pd(p4d_1, mask)); + x = padd(x, tmp); + + Packet4d x2 = pmul(x, x); + Packet4d x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet4d y, y1, y2; + y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1); + y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4); + y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7); + y = pmadd(y, x, p4d_cephes_log_p2); + y1 = pmadd(y1, x, p4d_cephes_log_p5); + y2 = pmadd(y2, x, p4d_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, p4d_cephes_log_q1); + tmp = pmul(x2, p4d_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4d_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. 
+ return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask)); + } + +#if EIGEN_VERSION_AT_LEAST(3,3,5) +#else + template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); + } + + template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); + } +#endif + } +} + +#endif \ No newline at end of file diff --git a/EigenRand/arch/NEON/MorePacketMath.h b/EigenRand/arch/NEON/MorePacketMath.h new file mode 100644 index 0000000..b73b828 --- /dev/null +++ b/EigenRand/arch/NEON/MorePacketMath.h @@ -0,0 +1,65 @@ +/** + * @file MorePacketMath.h + * @author bab2min (bab2min@gmail.com) + * @brief + * @version 0.4.0 + * @date 2021-04-26 + * + * @copyright Copyright (c) 2020-2021 + * + */ + +#ifndef EIGENRAND_MORE_PACKET_MATH_NEON_H +#define EIGENRAND_MORE_PACKET_MATH_NEON_H + +#include + +namespace Eigen +{ + namespace internal + { + template<> + struct IsIntPacket : std::true_type {}; + + template<> + struct IsFloatPacket : std::true_type {}; + + template<> + struct HalfPacket + { + using type = uint64_t; + }; + + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x) + { + return vreinterpretq_f32_s32(x); + } + + EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x) + { + return x; + } + }; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x) + { + return x; + } + + EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x) + { + return vreinterpretq_s32_f32(x); + } + }; + + } +} + +#endif \ No newline at end of file diff --git a/EigenRand/arch/SSE/MorePacketMath.h b/EigenRand/arch/SSE/MorePacketMath.h new file mode 100644 index 0000000..646f6ea --- /dev/null +++ b/EigenRand/arch/SSE/MorePacketMath.h @@ -0,0 +1,487 @@ +/** + * @file MorePacketMath.h + * @author bab2min (bab2min@gmail.com) + * @brief + * @version 0.3.5 + * @date 2021-07-16 + * + * @copyright Copyright (c) 2020-2021 + * + */ + +#ifndef EIGENRAND_MORE_PACKET_MATH_SSE_H +#define EIGENRAND_MORE_PACKET_MATH_SSE_H + +#include + +namespace Eigen +{ + namespace internal + { + template<> + struct IsIntPacket : std::true_type {}; + + template<> + struct IsFloatPacket : std::true_type {}; + + template<> + struct IsDoublePacket : std::true_type {}; + + template<> + struct HalfPacket + { + using type = uint64_t; + }; + +#ifdef EIGEN_VECTORIZE_AVX +#else + template<> + struct HalfPacket + { + //using type = Packet2f; + }; +#endif + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x) + { + return _mm_castsi128_ps(x); + } + + EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x) + { + return _mm_castsi128_pd(x); + } + + EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x) + { + return x; + } + }; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x) + { + return x; + } + + EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x) + { + return _mm_castps_pd(x); + } + + EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x) + { + return _mm_castps_si128(x); + } + }; + + template<> + struct reinterpreter + { + EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x) + { + return _mm_castpd_ps(x); + } + + EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x) + { + return x; + } + + EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x) + { + return _mm_castpd_si128(x); + } + }; + + template<> + EIGEN_STRONG_INLINE void split_two(const Packet4i& x, uint64_t& a, uint64_t& b) + { +#ifdef 
EIGEN_VECTORIZE_SSE4_1 + a = _mm_extract_epi64(x, 0); + b = _mm_extract_epi64(x, 1); +#else + uint64_t u[2]; + _mm_storeu_si128((__m128i*)u, x); + a = u[0]; + b = u[1]; +#endif + } + + EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b) + { + auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); + auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1)); + sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0)); + sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1)); + return _mm_or_si128(sa, sb); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pseti64(uint64_t a) + { + return _mm_set1_epi64x(a); + } + + template<> + EIGEN_STRONG_INLINE Packet4i padd64(const Packet4i& a, const Packet4i& b) + { + return _mm_add_epi64(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i psub64(const Packet4i& a, const Packet4i& b) + { + return _mm_sub_epi64(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcmpeq(const Packet4i& a, const Packet4i& b) + { + return _mm_cmpeq_epi32(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i psll(const Packet4i& a, int b) + { + return _mm_slli_epi32(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i psrl(const Packet4i& a, int b) + { + return _mm_srli_epi32(a, b); + } + + + template<> + EIGEN_STRONG_INLINE Packet4i psll64(const Packet4i& a, int b) + { + return _mm_slli_epi64(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i psrl64(const Packet4i& a, int b) + { + return _mm_srli_epi64(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcmplt(const Packet4i& a, const Packet4i& b) + { + return _mm_cmplt_epi32(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcmplt64(const Packet4i& a, const Packet4i& b) + { +#ifdef EIGEN_VECTORIZE_SSE4_2 + return _mm_cmpgt_epi64(b, a); +#else + int64_t u[2], v[2]; + _mm_storeu_si128((__m128i*)u, a); + _mm_storeu_si128((__m128i*)v, b); + return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? 
-1 : 0); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4f pcmplt(const Packet4f& a, const Packet4f& b) + { + return _mm_cmplt_ps(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4f pcmple(const Packet4f& a, const Packet4f& b) + { + return _mm_cmple_ps(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet2d pcmplt(const Packet2d& a, const Packet2d& b) + { + return _mm_cmplt_pd(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet2d pcmple(const Packet2d& a, const Packet2d& b) + { + return _mm_cmple_pd(a, b); + } + + template<> + EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(elsePacket, thenPacket, ifPacket); +#else + return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) + { + return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket))); +#else + return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket)); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(elsePacket, thenPacket, ifPacket); +#else + return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket)); +#endif + } + + + template<> + EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) + { + return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pgather(const int* addr, const Packet4i& index) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm_i32gather_epi32(addr, index, 4); +#else + uint32_t u[4]; + _mm_storeu_si128((__m128i*)u, index); + return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4f pgather(const float* addr, const Packet4i& index) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm_i32gather_ps(addr, index, 4); +#else + uint32_t u[4]; + _mm_storeu_si128((__m128i*)u, index); + return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet2d pgather(const double* addr, const Packet4i& index, bool upperhalf) + { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm_i32gather_pd(addr, index, 8); +#else + uint32_t u[4]; + _mm_storeu_si128((__m128i*)u, index); + if (upperhalf) + { + return _mm_setr_pd(addr[u[2]], addr[u[3]]); + } + else + { + return _mm_setr_pd(addr[u[0]], addr[u[1]]); + } +#endif + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet4f& a) + { + return _mm_movemask_ps(a); + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet2d& a) + { + return _mm_movemask_pd(a); + } + + template<> + EIGEN_STRONG_INLINE int pmovemask(const Packet4i& a) + { + return pmovemask((Packet4f)_mm_castsi128_ps(a)); + } + + template<> + EIGEN_STRONG_INLINE Packet4f ptruncate(const Packet4f& a) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + 
return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + auto round = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); + _MM_SET_ROUNDING_MODE(round); + return ret; +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet2d ptruncate(const Packet2d& a) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + auto round = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); + auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a)); + _MM_SET_ROUNDING_MODE(round); + return ret; +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcmpeq64(const Packet4i& a, const Packet4i& b) + { +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_cmpeq_epi64(a, b); +#else + Packet4i c = _mm_cmpeq_epi32(a, b); + return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1))); +#endif + } + + template<> + EIGEN_STRONG_INLINE Packet4i pmuluadd64(const Packet4i& a, uint64_t b, uint64_t c) + { + uint64_t u[2]; + _mm_storeu_si128((__m128i*)u, a); + u[0] = u[0] * b + c; + u[1] = u[1] * b + c; + return _mm_loadu_si128((__m128i*)u); + } + + EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) { + x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))); + return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000)); + } + + EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) { + x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))); + return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000)); + } + + EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) { + x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000)); + return _mm_sub_epi64( + _mm_castpd_si128(x), + _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)) + ); + } + + template<> + EIGEN_STRONG_INLINE Packet4i pcast64(const Packet2d& a) + { + return double_to_int64(a); + } + + template<> + EIGEN_STRONG_INLINE Packet2d pcast64(const Packet4i& a) + { + return int64_to_double(a); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet2d psin(const Packet2d& x) + { + return _psin(x); + } + + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED + Packet2d plog(const Packet2d& _x) + { + Packet2d x = _x; + _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + + auto inv_mant_mask = _mm_castsi128_pd(pseti64(~0x7ff0000000000000)); + auto min_norm_pos = _mm_castsi128_pd(pseti64(0x10000000000000)); + auto minus_inf = _mm_castsi128_pd(pseti64(0xfff0000000000000)); + + /* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 + */ + _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375); + + 
+ Packet4i emm0; + + Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); // not greater equal is true if x is NaN + Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd()); + + x = pmax(x, min_norm_pos); /* cut off denormalized stuff */ + emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52); + + /* keep only the fractional part */ + x = _mm_and_pd(x, inv_mant_mask); + x = _mm_or_pd(x, p2d_half); + + Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1(1022)); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF); + Packet2d tmp = pand(x, mask); + x = psub(x, p2d_1); + e = psub(e, pand(p2d_1, mask)); + x = padd(x, tmp); + + Packet2d x2 = pmul(x, x); + Packet2d x3 = pmul(x2, x); + + Packet2d y, y1, y2; + y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1); + y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4); + y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7); + y = pmadd(y, x, p2d_cephes_log_p2); + y1 = pmadd(y1, x, p2d_cephes_log_p5); + y2 = pmadd(y2, x, p2d_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y1 = pmul(e, p2d_cephes_log_q1); + tmp = pmul(x2, p2d_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p2d_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + // negative arg will be NAN, 0 will be -INF + return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask)); + } + } +} + +#endif diff --git a/README.md b/README.md index 6ce0fe0..cd5ac45 100644 --- a/README.md +++ b/README.md @@ -347,6 +347,11 @@ The results of EigenRand and C++ std appear to be equivalent within the margin o MIT License ## History +### 0.3.5 (2021-07-16) +* Now `UniformRealGen` generates accurate double values. +* Fixed a bug where non-vectorized double-type `NormalGen` would get stuck in an infinite loop. +* New overloading functions `balanced` and `balancedLike` which generate values over `[a, b]` were added. + ### 0.3.4 (2021-04-25) * Now Eigen 3.3.4 - 3.3.6 versions are additionally supported. 
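For the `balanced` / `balancedLike` additions noted in the 0.3.5 history above, a minimal usage sketch follows. It is not part of the patch: the umbrella `<EigenRand/EigenRand>` include, the matrix sizes, and the `[0.5, 2]` range are illustrative assumptions, but the call shapes mirror the cases exercised by the new `test/test.cpp` later in this patch.

```cpp
// Sketch only: generating values over [-1, 1] and over a caller-chosen interval [a, b].
// Assumes the umbrella EigenRand header; sizes and ranges here are illustrative.
#include <iostream>
#include <Eigen/Dense>
#include <EigenRand/EigenRand>

int main()
{
    Eigen::Rand::Vmt19937_64 gen{ 42 };

    // Pre-0.3.5 form: uniform values over [-1, 1].
    Eigen::MatrixXf a = Eigen::Rand::balanced<Eigen::MatrixXf>(4, 4, gen);

    // New in 0.3.5: uniform values over [a, b], here [0.5, 2],
    // matching the five-argument calls in test/test.cpp.
    Eigen::MatrixXf b = Eigen::Rand::balanced<Eigen::MatrixXf>(4, 4, gen, 0.5f, 2.f);

    std::cout << a << "\n\n" << b << std::endl;
    return 0;
}
```

Under the hood this appears to map to the new packet-level `balanced(Rng&, Scalar slope, Scalar bias)` routine added to `RandUtils.h` earlier in this patch, which scales a uniform draw as `slope * u + bias`; presumably `slope = b - a` and `bias = a` for the `[a, b]` overload.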
diff --git a/TestAccuracy.vcxproj b/TestAccuracy.vcxproj index a21dced..92eefe4 100644 --- a/TestAccuracy.vcxproj +++ b/TestAccuracy.vcxproj @@ -1,10 +1,34 @@ + + Debug + ARM + + + Debug + ARM64 + Debug Win32 + + RelAVX2 + ARM + + + RelAVX2 + ARM64 + + + RelAVX + ARM + + + RelAVX + ARM64 + RelAVX Win32 @@ -21,6 +45,14 @@ RelAVX2 x64 + + RelNoVect + ARM + + + RelNoVect + ARM64 + RelNoVect Win32 @@ -29,6 +61,14 @@ RelNoVect x64 + + RelSSE2 + ARM + + + RelSSE2 + ARM64 + RelSSE2 Win32 @@ -43,7 +83,7 @@ - + 15.0 @@ -51,7 +91,7 @@ Win32Proj EigenAddOn 10.0 - TestAccuracy + Accuracy @@ -94,6 +134,18 @@ v142 Unicode + + Application + true + v142 + Unicode + + + Application + true + v142 + Unicode + Application false @@ -101,6 +153,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -108,6 +174,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -115,6 +195,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + Application false @@ -122,6 +216,20 @@ true Unicode + + Application + false + v142 + true + Unicode + + + Application + false + v142 + true + Unicode + @@ -145,18 +253,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true @@ -166,6 +304,14 @@ true $(SolutionDir);E:\AddInclude;$(IncludePath) + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + true + $(SolutionDir);E:\AddInclude;$(IncludePath) + false E:\AddInclude;$(IncludePath) @@ -186,18 +332,50 @@ false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + false $(SolutionDir);E:\AddInclude;$(IncludePath) + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + + + false + $(SolutionDir);E:\AddInclude;$(IncludePath) + @@ -229,6 +407,36 @@ true + + + + + Level3 + Disabled + true + __ARM_NEON;USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + + + Level3 + Disabled + true + __ARM_NEON;USE_ADDON;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + @@ -326,6 +534,46 @@ true + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + NotSet + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + NotSet + + + Console + true + true + true + + @@ -345,6 +593,44 @@ true + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;EIGEN_DONT_VECTORIZE;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + @@ -365,6 +651,44 @@ true + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + 
__ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + @@ -385,6 +709,44 @@ true + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + Level3 + MaxSpeed + true + true + true + __ARM_NEON;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + diff --git a/TestAccuracy.vcxproj.filters b/TestAccuracy.vcxproj.filters index 1a9c43c..7ff5bf8 100644 --- a/TestAccuracy.vcxproj.filters +++ b/TestAccuracy.vcxproj.filters @@ -6,7 +6,7 @@ - + Source Files diff --git a/test/accuracy.cpp b/benchmark/accuracy.cpp similarity index 99% rename from test/accuracy.cpp rename to benchmark/accuracy.cpp index fa450ae..fc06289 100644 --- a/test/accuracy.cpp +++ b/benchmark/accuracy.cpp @@ -182,7 +182,6 @@ std::map test_eigenrand_cont(size_t size, size_t step, size { std::map ret; Eigen::ArrayXf arr{ size }; - Eigen::ArrayXd arrd{ size }; Rng urng{ seed }; arr = Eigen::Rand::balancedLike(arr, urng); @@ -191,99 +190,101 @@ std::map test_eigenrand_cont(size_t size, size_t step, size arr = Eigen::Rand::uniformRealLike(arr, urng); ret["uniformReal"] = calc_emd_with_cdf(arr, ur_cdf, step); - arrd = Eigen::Rand::uniformRealLike(arrd, urng); - ret["uniformReal/double"] = calc_emd_with_cdf(arrd, ur_cdf, step); - arr = Eigen::Rand::normalLike(arr, urng); ret["normal"] = calc_emd_with_cdf(arr, normal_cdf, step); - arrd = Eigen::Rand::normalLike(arrd, urng); - ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step); - arr = Eigen::Rand::lognormalLike(arr, urng); ret["lognormal"] = calc_emd_with_cdf(arr, lognormal_cdf, step); - arrd = Eigen::Rand::lognormalLike(arrd, urng); - ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step); - arr = Eigen::Rand::gammaLike(arr, urng, 1, 1); ret["gamma(1,1)"] = calc_emd_with_pdf(arr, gamma11_pdf, step); - arrd = Eigen::Rand::gammaLike(arrd, urng, 1, 1); - ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step); - arr = Eigen::Rand::gammaLike(arr, urng, 5, 1); ret["gamma(5,1)"] = calc_emd_with_pdf(arr, gamma51_pdf, step); - arrd = Eigen::Rand::gammaLike(arrd, urng, 5, 1); - ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step); - arr = Eigen::Rand::gammaLike(arr, urng, 0.2, 1); ret["gamma(0.2,1)"] = calc_emd_with_pdf(arr, gamma21_pdf, step); - arrd = Eigen::Rand::gammaLike(arrd, urng, 0.2, 1); - ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step); - arr = Eigen::Rand::exponentialLike(arr, urng); ret["exponential"] = calc_emd_with_cdf(arr, exp_cdf, step); - arrd = Eigen::Rand::exponentialLike(arrd, urng); - ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step); - arr = Eigen::Rand::weibullLike(arr, urng, 2); ret["weibull(2,1)"] = calc_emd_with_cdf(arr, weibull_cdf, step); - arrd = Eigen::Rand::weibullLike(arrd, urng, 2); - ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step); - arr = Eigen::Rand::extremeValueLike(arr, urng, 1, 1); ret["extremeValue(1,1)"] = calc_emd_with_cdf(arr, extreme_value_cdf, step); - arrd = Eigen::Rand::extremeValueLike(arrd, urng, 1, 1); - ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step); - arr = Eigen::Rand::chiSquaredLike(arr, urng, 7); ret["chiSquared(7)"] = calc_emd_with_pdf(arr, chisquared_pdf, step); - arrd = Eigen::Rand::chiSquaredLike(arrd, urng, 7); - ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step); - arr = 
Eigen::Rand::cauchyLike(arr, urng); ret["cauchy"] = calc_emd_with_cdf(arr, cauchy_cdf, step); - arrd = Eigen::Rand::cauchyLike(arrd, urng); - ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); - arr = Eigen::Rand::studentTLike(arr, urng, 1); ret["studentT(1)"] = calc_emd_with_cdf(arr, cauchy_cdf, step); - arrd = Eigen::Rand::studentTLike(arrd, urng, 1); - ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); - arr = Eigen::Rand::studentTLike(arr, urng, 5); ret["studentT(5)"] = calc_emd_with_pdf(arr, student5_pdf, step); - arrd = Eigen::Rand::studentTLike(arrd, urng, 5); - ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step); - arr = Eigen::Rand::studentTLike(arr, urng, 20); ret["studentT(20)"] = calc_emd_with_pdf(arr, student20_pdf, step); - arrd = Eigen::Rand::studentTLike(arrd, urng, 20); - ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step); - arr = Eigen::Rand::fisherFLike(arr, urng, 1, 1); ret["fisherF(1,1)"] = calc_emd_with_cdf(arr, fisher11_cdf, step); - arrd = Eigen::Rand::fisherFLike(arrd, urng, 1, 1); - ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step); - arr = Eigen::Rand::fisherFLike(arr, urng, 5, 5); ret["fisherF(5,5)"] = calc_emd_with_pdf(arr, fisher55_pdf, step); +#ifdef TEST_DOUBLE + Eigen::ArrayXd arrd{ size }; + arrd = Eigen::Rand::uniformRealLike(arrd, urng); + ret["uniformReal/double"] = calc_emd_with_cdf(arrd, ur_cdf, step); + + arrd = Eigen::Rand::normalLike(arrd, urng); + ret["normal/double"] = calc_emd_with_cdf(arrd, normal_cdf, step); + + arrd = Eigen::Rand::lognormalLike(arrd, urng); + ret["lognormal/double"] = calc_emd_with_cdf(arrd, lognormal_cdf, step); + + arrd = Eigen::Rand::gammaLike(arrd, urng, 1, 1); + ret["gamma(1,1)/double"] = calc_emd_with_pdf(arrd, gamma11_pdf, step); + + arrd = Eigen::Rand::gammaLike(arrd, urng, 5, 1); + ret["gamma(5,1)/double"] = calc_emd_with_pdf(arrd, gamma51_pdf, step); + + arrd = Eigen::Rand::gammaLike(arrd, urng, 0.2, 1); + ret["gamma(0.2,1)/double"] = calc_emd_with_pdf(arrd, gamma21_pdf, step); + + arrd = Eigen::Rand::exponentialLike(arrd, urng); + ret["exponential/double"] = calc_emd_with_cdf(arrd, exp_cdf, step); + + arrd = Eigen::Rand::weibullLike(arrd, urng, 2); + ret["weibull(2,1)/double"] = calc_emd_with_cdf(arrd, weibull_cdf, step); + + arrd = Eigen::Rand::extremeValueLike(arrd, urng, 1, 1); + ret["extremeValue(1,1)/double"] = calc_emd_with_cdf(arrd, extreme_value_cdf, step); + + arrd = Eigen::Rand::chiSquaredLike(arrd, urng, 7); + ret["chiSquared(7)/double"] = calc_emd_with_pdf(arrd, chisquared_pdf, step); + + arrd = Eigen::Rand::cauchyLike(arrd, urng); + ret["cauchy/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + + arrd = Eigen::Rand::studentTLike(arrd, urng, 1); + ret["studentT(1)/double"] = calc_emd_with_cdf(arrd, cauchy_cdf, step); + + arrd = Eigen::Rand::studentTLike(arrd, urng, 5); + ret["studentT(5)/double"] = calc_emd_with_pdf(arrd, student5_pdf, step); + + arrd = Eigen::Rand::studentTLike(arrd, urng, 20); + ret["studentT(20)/double"] = calc_emd_with_pdf(arrd, student20_pdf, step); + + arrd = Eigen::Rand::fisherFLike(arrd, urng, 1, 1); + ret["fisherF(1,1)/double"] = calc_emd_with_cdf(arrd, fisher11_cdf, step); + arrd = Eigen::Rand::fisherFLike(arrd, urng, 5, 5); ret["fisherF(5,5)/double"] = calc_emd_with_pdf(arrd, fisher55_pdf, step); - +#endif return ret; } diff --git a/test/benchmark.cpp b/benchmark/benchmark.cpp similarity index 99% rename from test/benchmark.cpp rename to benchmark/benchmark.cpp index 
d1996de..8ecb924 100644 --- a/test/benchmark.cpp +++ b/benchmark/benchmark.cpp @@ -820,7 +820,7 @@ int main(int argc, char** argv) for (size_t i = 0; i < repeat; ++i) { - /*for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results)) + for (auto& p : test_rng(std::mt19937{}, size, "rng\tmt19937", results)) { time[p.first] += p.second; timeSq[p.first] += p.second * p.second; @@ -866,7 +866,7 @@ int main(int argc, char** argv) { time[p.first] += p.second; timeSq[p.first] += p.second * p.second; - }*/ + } for (auto& p : test_eigenrand(size, "\t:ERand", results)) { diff --git a/test/benchmark_mv.cpp b/benchmark/benchmark_mv.cpp similarity index 100% rename from test/benchmark_mv.cpp rename to benchmark/benchmark_mv.cpp diff --git a/test/comp_scipy.py b/benchmark/comp_scipy.py similarity index 100% rename from test/comp_scipy.py rename to benchmark/comp_scipy.py diff --git a/doxygen/Doxyfile b/doxygen/Doxyfile index 7465b17..1f9444d 100644 --- a/doxygen/Doxyfile +++ b/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "EigenRand" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 0.3.4 +PROJECT_NUMBER = 0.3.5 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/test/packages.config b/test/packages.config new file mode 100644 index 0000000..434bb12 --- /dev/null +++ b/test/packages.config @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/test/test.cpp b/test/test.cpp new file mode 100644 index 0000000..55e9c2e --- /dev/null +++ b/test/test.cpp @@ -0,0 +1,72 @@ +#include +#include +#include + +template +class ContinuousDistTest : public testing::Test +{ +}; + +using ETypes = testing::Types; + +TYPED_TEST_CASE(ContinuousDistTest, ETypes); + +TYPED_TEST(ContinuousDistTest, balanced) +{ + using Matrix = Eigen::Matrix; + Eigen::Rand::Vmt19937_64 gen{ 42 }; + Matrix mat; + + mat = Eigen::Rand::balanced(8, 8, gen); + mat = Eigen::Rand::balanced(3, 3, gen); + mat = Eigen::Rand::balanced(5, 5, gen); + std::cout << mat << std::endl; +} + +TYPED_TEST(ContinuousDistTest, balanced2) +{ + using Matrix = Eigen::Matrix; + Eigen::Rand::Vmt19937_64 gen{ 42 }; + Matrix mat; + + mat = Eigen::Rand::balanced(8, 8, gen, 0.5, 2); + mat = Eigen::Rand::balanced(3, 3, gen, 0.5, 2); + mat = Eigen::Rand::balanced(5, 5, gen, 0.5, 2); + std::cout << mat << std::endl; +} + +TYPED_TEST(ContinuousDistTest, stdNormal) +{ + using Matrix = Eigen::Matrix; + Eigen::Rand::Vmt19937_64 gen{ 42 }; + Matrix mat; + + mat = Eigen::Rand::normal(8, 8, gen); + mat = Eigen::Rand::normal(3, 3, gen); + mat = Eigen::Rand::normal(5, 5, gen); + std::cout << mat << std::endl; +} + +TYPED_TEST(ContinuousDistTest, normal) +{ + using Matrix = Eigen::Matrix; + Eigen::Rand::Vmt19937_64 gen{ 42 }; + Matrix mat; + + mat = Eigen::Rand::normal(8, 8, gen, 1, 2); + mat = Eigen::Rand::normal(3, 3, gen, 1, 2); + mat = Eigen::Rand::normal(5, 5, gen, 1, 2); + std::cout << mat << std::endl; +} + +TYPED_TEST(ContinuousDistTest, exponential) +{ + using Matrix = Eigen::Matrix; + Eigen::Rand::Vmt19937_64 gen{ 42 }; + Matrix mat; + + mat = Eigen::Rand::exponential(8, 8, gen, 2); + mat = Eigen::Rand::exponential(3, 3, gen, 2); + mat = Eigen::Rand::exponential(5, 5, gen, 2); + std::cout << mat << std::endl; +} diff --git a/test/test.vcxproj b/test/test.vcxproj new file mode 100644 index 0000000..8d78d63 --- /dev/null +++ b/test/test.vcxproj @@ -0,0 +1,130 @@ + + + 
+ + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {643d8602-fe0d-4eaf-841c-e690ee6e53fd} + Win32Proj + 10.0.19041.0 + Application + v142 + Unicode + + + + + + + + + $(SolutionDir);E:\AddInclude;$(VC_IncludePath);$(WindowsSDK_IncludePath); + + + $(SolutionDir);E:\AddInclude;$(VC_IncludePath);$(WindowsSDK_IncludePath); + + + $(SolutionDir);E:\AddInclude;$(VC_IncludePath);$(WindowsSDK_IncludePath); + + + $(SolutionDir);E:\AddInclude;$(VC_IncludePath);$(WindowsSDK_IncludePath); + + + + + + + + + + + + + + + NotUsing + + + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + Level3 + + + true + Console + + + + + NotUsing + + + Disabled + X64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + Level3 + + + true + Console + + + + + NotUsing + + + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Level3 + ProgramDatabase + + + true + Console + true + true + + + + + NotUsing + + + X64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + Level3 + ProgramDatabase + + + true + Console + true + true + + + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. + + + + \ No newline at end of file