From 1291677e64c3495b7cd7c58347d5fb296eb767ab Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 24 Sep 2023 10:43:01 +0200 Subject: [PATCH 01/31] repack: jquery isn't anymore --- debian/copyright | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/debian/copyright b/debian/copyright index 6f8231a2..67f7a55a 100644 --- a/debian/copyright +++ b/debian/copyright @@ -2,8 +2,7 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: mold Upstream-Contact: ruiu@cs.stanford.edu Source: https://github.com/rui314/mold -Files-Excluded: third-party/mimalloc/docs/jquery.js - third-party/mimalloc/bin/mimalloc-redirect.dll +Files-Excluded: third-party/mimalloc/bin/mimalloc-redirect.dll third-party/mimalloc/bin/mimalloc-redirect32.dll third-party/mimalloc/bin/minject.exe third-party/mimalloc/bin/minject32.exe From 19333acb4ad7f09f60339e10963f85017b92e05a Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 24 Sep 2023 10:43:14 +0200 Subject: [PATCH 02/31] New upstream version 2.2.0+dfsg --- .github/workflows/ci.yml | 4 +- CMakeLists.txt | 39 +- README.md | 7 +- common/Dockerfile | 8 +- common/common.h | 94 +- common/sha.h | 81 - elf/arch-arm32.cc | 101 +- elf/arch-arm64.cc | 98 +- elf/arch-i386.cc | 90 +- elf/arch-loongarch.cc | 158 +- elf/arch-mips64.cc | 673 ---- elf/arch-ppc32.cc | 34 +- elf/arch-ppc64v1.cc | 53 +- elf/arch-ppc64v2.cc | 62 +- elf/arch-riscv.cc | 265 +- elf/arch-s390x.cc | 44 +- elf/arch-sparc64.cc | 29 +- elf/arch-x86-64.cc | 191 +- elf/cmdline.cc | 17 +- elf/elf.cc | 1914 ++++++----- elf/elf.h | 356 +-- elf/icf.cc | 32 +- elf/input-files.cc | 112 +- elf/input-sections.cc | 56 +- elf/linker-script.cc | 11 +- elf/lto-unix.cc | 3 + elf/lto.h | 4 +- elf/main.cc | 20 +- elf/mold.h | 256 +- elf/output-chunks.cc | 276 +- elf/passes.cc | 179 +- elf/relocatable.cc | 2 +- elf/thunks.cc | 58 +- elf/tls.cc | 31 +- install-build-deps.sh | 20 +- install-cross-tools.sh | 19 + test/elf/CMakeLists.txt | 2 - 
test/elf/abs-error.sh | 2 - .../arm_range-extension-thunk-disassembly.sh | 2 +- test/elf/as-needed-dso.sh | 11 +- test/elf/as-needed2.sh | 32 - test/elf/common.inc | 10 + test/elf/compress-debug-sections-zstd.sh | 1 - test/elf/copyrel-alignment.sh | 1 - test/elf/copyrel-protected.sh | 1 - test/elf/copyrel-relro2.sh | 41 + test/elf/exception.sh | 2 +- test/elf/glibc-2.22-bug.sh | 1 - test/elf/hash-style.sh | 1 - test/elf/large-max-page-size-strip.sh | 1 - test/elf/lto-nostdlib.sh | 9 + test/elf/nocopyreloc.sh | 1 - test/elf/physical-image-base.sh | 1 - test/elf/relocatable-c++.sh | 46 + test/elf/riscv64_attributes.sh | 8 +- test/elf/riscv64_relax-hi20.sh | 50 + test/elf/riscv64_weak-undef.sh | 2 + test/elf/tls-gd-dlopen.sh | 33 + test/elf/tlsdesc-dlopen.sh | 35 + test/elf/tlsdesc-import.sh | 12 +- test/elf/tlsdesc-initial-exec.sh | 39 + test/elf/tlsdesc-local-dynamic.sh | 39 + test/elf/tlsdesc-static.sh | 21 +- test/elf/tlsdesc.sh | 42 +- test/elf/version-script19.sh | 16 + .../blake3/.github/workflows/build_b3sum.py | 38 + third-party/blake3/.github/workflows/ci.yml | 330 ++ third-party/blake3/.github/workflows/tag.yml | 45 + .../workflows/upload_github_release_asset.py | 73 + third-party/blake3/.gitignore | 2 + third-party/blake3/CONTRIBUTING.md | 31 + third-party/blake3/Cargo.toml | 101 + third-party/blake3/LICENSE | 330 ++ third-party/blake3/README.md | 221 ++ third-party/blake3/b3sum/.gitignore | 1 + third-party/blake3/b3sum/Cargo.lock | 690 ++++ third-party/blake3/b3sum/Cargo.toml | 27 + third-party/blake3/b3sum/README.md | 71 + third-party/blake3/b3sum/src/main.rs | 617 ++++ third-party/blake3/b3sum/src/unit_tests.rs | 189 ++ third-party/blake3/b3sum/tests/cli_tests.rs | 613 ++++ .../blake3/b3sum/what_does_check_do.md | 174 + third-party/blake3/benches/bench.rs | 517 +++ third-party/blake3/build.rs | 277 ++ third-party/blake3/c/.gitignore | 4 + third-party/blake3/c/CMakeLists.txt | 177 ++ third-party/blake3/c/Makefile.testing | 82 + 
third-party/blake3/c/README.md | 321 ++ third-party/blake3/c/blake3-config.cmake.in | 5 + third-party/blake3/c/blake3.c | 616 ++++ third-party/blake3/c/blake3.h | 82 + third-party/blake3/c/blake3_avx2.c | 326 ++ .../blake3/c/blake3_avx2_x86-64_unix.S | 1815 +++++++++++ .../blake3/c/blake3_avx2_x86-64_windows_gnu.S | 1817 +++++++++++ .../c/blake3_avx2_x86-64_windows_msvc.asm | 1828 +++++++++++ third-party/blake3/c/blake3_avx512.c | 1220 +++++++ .../blake3/c/blake3_avx512_x86-64_unix.S | 2585 +++++++++++++++ .../c/blake3_avx512_x86-64_windows_gnu.S | 2615 +++++++++++++++ .../c/blake3_avx512_x86-64_windows_msvc.asm | 2634 ++++++++++++++++ .../c/blake3_c_rust_bindings/Cargo.toml | 29 + .../blake3/c/blake3_c_rust_bindings/README.md | 4 + .../c/blake3_c_rust_bindings/benches/bench.rs | 393 +++ .../blake3/c/blake3_c_rust_bindings/build.rs | 190 ++ .../c/blake3_c_rust_bindings/cross_test.sh | 31 + .../c/blake3_c_rust_bindings/src/lib.rs | 306 ++ .../c/blake3_c_rust_bindings/src/test.rs | 570 ++++ third-party/blake3/c/blake3_dispatch.c | 276 ++ third-party/blake3/c/blake3_impl.h | 281 ++ third-party/blake3/c/blake3_neon.c | 368 +++ third-party/blake3/c/blake3_portable.c | 160 + third-party/blake3/c/blake3_sse2.c | 566 ++++ .../blake3/c/blake3_sse2_x86-64_unix.S | 2291 ++++++++++++++ .../blake3/c/blake3_sse2_x86-64_windows_gnu.S | 2332 ++++++++++++++ .../c/blake3_sse2_x86-64_windows_msvc.asm | 2350 ++++++++++++++ third-party/blake3/c/blake3_sse41.c | 560 ++++ .../blake3/c/blake3_sse41_x86-64_unix.S | 2028 ++++++++++++ .../c/blake3_sse41_x86-64_windows_gnu.S | 2069 ++++++++++++ .../c/blake3_sse41_x86-64_windows_msvc.asm | 2089 ++++++++++++ third-party/blake3/c/example.c | 37 + third-party/blake3/c/libblake3.pc.in | 12 + third-party/blake3/c/main.c | 166 + third-party/blake3/c/test.py | 97 + third-party/blake3/media/B3.svg | 70 + third-party/blake3/media/BLAKE3.svg | 85 + third-party/blake3/media/speed.svg | 1474 +++++++++ third-party/blake3/reference_impl/Cargo.toml | 8 + 
third-party/blake3/reference_impl/README.md | 14 + .../blake3/reference_impl/reference_impl.rs | 383 +++ third-party/blake3/src/ffi_avx2.rs | 63 + third-party/blake3/src/ffi_avx512.rs | 114 + third-party/blake3/src/ffi_neon.rs | 82 + third-party/blake3/src/ffi_sse2.rs | 114 + third-party/blake3/src/ffi_sse41.rs | 114 + third-party/blake3/src/guts.rs | 101 + third-party/blake3/src/join.rs | 92 + third-party/blake3/src/lib.rs | 1479 +++++++++ third-party/blake3/src/platform.rs | 487 +++ third-party/blake3/src/portable.rs | 198 ++ third-party/blake3/src/rust_avx2.rs | 474 +++ third-party/blake3/src/rust_sse2.rs | 775 +++++ third-party/blake3/src/rust_sse41.rs | 766 +++++ third-party/blake3/src/test.rs | 630 ++++ third-party/blake3/src/traits.rs | 227 ++ third-party/blake3/test_vectors/Cargo.toml | 18 + third-party/blake3/test_vectors/cross_test.sh | 25 + .../blake3/test_vectors/src/bin/generate.rs | 4 + third-party/blake3/test_vectors/src/lib.rs | 352 +++ .../blake3/test_vectors/test_vectors.json | 217 ++ .../blake3/tools/compiler_version/Cargo.toml | 7 + .../blake3/tools/compiler_version/build.rs | 6 + .../blake3/tools/compiler_version/src/main.rs | 27 + .../tools/instruction_set_support/Cargo.toml | 6 + .../tools/instruction_set_support/src/main.rs | 10 + third-party/blake3/tools/release.md | 16 + third-party/mimalloc/src/prim/unix/prim.c | 1 + third-party/tbb/.github/labeler.yml | 18 + third-party/tbb/.github/workflows/ci.yml | 11 +- third-party/tbb/.github/workflows/labeler.yml | 26 + third-party/tbb/CMakeLists.txt | 24 +- third-party/tbb/README.md | 1 + third-party/tbb/RELEASE_NOTES.md | 31 +- third-party/tbb/SECURITY.md | 7 + third-party/tbb/SYSTEM_REQUIREMENTS.md | 52 +- third-party/tbb/WASM_Support.md | 31 + third-party/tbb/cmake/compilers/Clang.cmake | 12 +- third-party/tbb/cmake/compilers/GNU.cmake | 6 + third-party/tbb/cmake/compilers/Intel.cmake | 8 +- third-party/tbb/cmake/compilers/MSVC.cmake | 8 +- third-party/tbb/cmake/config_generation.cmake | 71 +- 
third-party/tbb/cmake/memcheck.cmake | 14 +- third-party/tbb/cmake/packaging.cmake | 7 +- .../scripts/cmake_gen_github_configs.cmake | 7 +- .../tbb/cmake/suppressions/tsan.suppressions | 1 + .../tbb/cmake/templates/TBBConfig.cmake.in | 42 +- .../tbb/cmake/toolchains/riscv64.cmake | 34 + third-party/tbb/cmake/utils.cmake | 7 +- third-party/tbb/doc/GSG/get_started.rst | 25 +- third-party/tbb/doc/GSG/integrate.rst | 68 + third-party/tbb/doc/GSG/intro.rst | 29 + third-party/tbb/doc/GSG/next_steps.rst | 151 + third-party/tbb/doc/GSG/samples.rst | 49 + third-party/tbb/doc/conf.py | 13 +- third-party/tbb/doc/index/toctree.rst | 6 +- .../tbb/doc/main/_templates/layout.html | 9 +- .../tbb/doc/main/reference/reference.rst | 1 - .../main/reference/task_group_extensions.rst | 6 +- .../main/tbb_userguide/Flow_Graph_Tips.rst | 4 +- .../doc/main/tbb_userguide/Graph_Object.rst | 7 + .../Migration_Guide/Task_Scheduler_Init.rst | 3 +- .../attach_flow_graph_to_arena.rst | 2 +- .../design_patterns/Lazy_Initialization.rst | 2 +- .../tbb/doc/main/tbb_userguide/std_invoke.rst | 217 ++ .../tbb/doc/main/tbb_userguide/title.rst | 1 + .../tbb/include/oneapi/tbb/concurrent_queue.h | 132 +- .../tbb/detail/_concurrent_unordered_base.h | 18 +- .../tbb/include/oneapi/tbb/detail/_config.h | 10 +- .../oneapi/tbb/detail/_flow_graph_body_impl.h | 10 +- .../oneapi/tbb/detail/_flow_graph_node_impl.h | 8 +- .../detail/_flow_graph_tagged_buffer_impl.h | 10 +- .../tbb/include/oneapi/tbb/detail/_machine.h | 12 +- .../oneapi/tbb/detail/_pipeline_filters.h | 10 +- .../tbb/detail/_pipeline_filters_deduction.h | 10 +- .../tbb/include/oneapi/tbb/detail/_task.h | 4 +- .../oneapi/tbb/detail/_template_helpers.h | 4 +- .../tbb/include/oneapi/tbb/detail/_utils.h | 21 +- .../tbb/include/oneapi/tbb/flow_graph.h | 34 +- .../tbb/include/oneapi/tbb/parallel_for.h | 17 +- .../include/oneapi/tbb/parallel_for_each.h | 23 +- .../tbb/include/oneapi/tbb/parallel_reduce.h | 32 +- .../tbb/include/oneapi/tbb/parallel_scan.h | 25 
+- .../tbb/include/oneapi/tbb/partitioner.h | 6 +- .../tbb/include/oneapi/tbb/profiling.h | 6 +- third-party/tbb/include/oneapi/tbb/version.h | 6 +- .../tbb/integration/pkg-config/tbb.pc.in | 4 +- .../tbb/integration/windows/env/vars.bat | 8 +- third-party/tbb/python/TBB.py | 4 +- third-party/tbb/python/setup.py | 4 +- third-party/tbb/python/tbb/__init__.py | 4 +- third-party/tbb/python/tbb/__main__.py | 4 +- third-party/tbb/python/tbb/pool.py | 4 +- third-party/tbb/python/tbb/test.py | 4 +- third-party/tbb/src/tbb/CMakeLists.txt | 4 +- third-party/tbb/src/tbb/dynamic_link.cpp | 7 +- third-party/tbb/src/tbb/tbb.rc | 4 +- third-party/tbb/src/tbbbind/CMakeLists.txt | 10 +- third-party/tbb/src/tbbbind/tbb_bind.rc | 4 +- third-party/tbb/src/tbbmalloc/CMakeLists.txt | 8 +- third-party/tbb/src/tbbmalloc/Customize.h | 4 +- third-party/tbb/src/tbbmalloc/backend.cpp | 11 +- third-party/tbb/src/tbbmalloc/backref.cpp | 8 +- third-party/tbb/src/tbbmalloc/frontend.cpp | 16 +- .../tbb/src/tbbmalloc/large_objects.cpp | 29 +- third-party/tbb/src/tbbmalloc/large_objects.h | 4 +- third-party/tbb/src/tbbmalloc/tbbmalloc.cpp | 3 +- third-party/tbb/src/tbbmalloc/tbbmalloc.rc | 4 +- .../tbb/src/tbbmalloc/tbbmalloc_internal.h | 6 +- .../src/tbbmalloc_proxy/tbbmalloc_proxy.rc | 4 +- third-party/tbb/test/CMakeLists.txt | 11 +- .../test/common/concurrent_unordered_common.h | 20 +- third-party/tbb/test/common/doctest.h | 2806 +++++++++-------- third-party/tbb/test/common/test_invoke.h | 145 + third-party/tbb/test/common/utils_assert.h | 4 +- .../conformance/conformance_async_node.cpp | 34 +- .../conformance_concurrent_queue.cpp | 208 +- .../conformance/conformance_function_node.cpp | 47 +- .../conformance/conformance_join_node.cpp | 53 +- .../conformance_multifunction_node.cpp | 55 +- .../conformance/conformance_parallel_for.cpp | 128 +- .../conformance_parallel_for_each.cpp | 71 +- .../conformance_parallel_pipeline.cpp | 45 +- .../conformance_parallel_reduce.cpp | 48 +- 
.../conformance/conformance_parallel_scan.cpp | 39 +- .../conformance_sequencer_node.cpp | 55 +- .../test/conformance/conformance_version.cpp | 4 +- .../tbb/test_concurrent_unordered_map.cpp | 8 +- .../tbb/test_concurrent_unordered_set.cpp | 8 +- .../test/tbb/test_join_node_key_matching.cpp | 35 +- .../test_join_node_key_matching_n_args.cpp | 54 + third-party/tbb/test/tbb/test_task.cpp | 4 +- third-party/zlib/.github/workflows/cmake.yml | 89 + .../zlib/.github/workflows/configure.yml | 136 + third-party/zlib/.github/workflows/fuzz.yml | 25 + third-party/zlib/.gitignore | 26 + third-party/zlib/CMakeLists.txt | 66 +- third-party/zlib/ChangeLog | 44 +- third-party/zlib/FAQ | 2 +- third-party/zlib/LICENSE | 22 + third-party/zlib/Makefile.in | 28 +- third-party/zlib/README | 19 +- third-party/zlib/adler32.c | 32 +- third-party/zlib/compress.c | 21 +- third-party/zlib/configure | 117 +- third-party/zlib/contrib/README.contrib | 2 +- third-party/zlib/contrib/ada/readme.txt | 4 +- third-party/zlib/contrib/ada/test.adb | 4 +- third-party/zlib/contrib/ada/zlib-streams.ads | 2 +- third-party/zlib/contrib/ada/zlib.adb | 2 +- third-party/zlib/contrib/ada/zlib.ads | 2 +- third-party/zlib/contrib/delphi/ZLib.pas | 2 +- third-party/zlib/contrib/infback9/infback9.c | 24 +- third-party/zlib/contrib/infback9/infback9.h | 16 +- third-party/zlib/contrib/infback9/inftree9.c | 17 +- third-party/zlib/contrib/infback9/inftree9.h | 8 +- .../contrib/minizip/MiniZip64_Changes.txt | 2 +- third-party/zlib/contrib/minizip/configure.ac | 2 +- third-party/zlib/contrib/minizip/crypt.h | 14 +- third-party/zlib/contrib/minizip/ioapi.c | 82 +- third-party/zlib/contrib/minizip/ioapi.h | 36 +- third-party/zlib/contrib/minizip/iowin32.c | 70 +- third-party/zlib/contrib/minizip/iowin32.h | 8 +- third-party/zlib/contrib/minizip/miniunz.c | 72 +- third-party/zlib/contrib/minizip/minizip.c | 66 +- third-party/zlib/contrib/minizip/mztools.c | 8 +- third-party/zlib/contrib/minizip/unzip.c | 513 ++- 
third-party/zlib/contrib/minizip/unzip.h | 136 +- third-party/zlib/contrib/minizip/zip.c | 322 +- third-party/zlib/contrib/minizip/zip.h | 299 +- third-party/zlib/contrib/pascal/zlibpas.pas | 2 +- third-party/zlib/contrib/puff/README | 2 +- third-party/zlib/contrib/puff/puff.c | 4 +- third-party/zlib/contrib/puff/pufftest.c | 2 +- third-party/zlib/contrib/testzlib/testzlib.c | 2 +- third-party/zlib/contrib/untgz/untgz.c | 63 +- third-party/zlib/contrib/vstudio/readme.txt | 5 +- .../vstudio/vc10/miniunz.vcxproj.filters | 2 +- .../vstudio/vc10/minizip.vcxproj.filters | 2 +- .../contrib/vstudio/vc10/testzlib.vcxproj | 24 +- .../vstudio/vc10/testzlib.vcxproj.filters | 5 +- .../vstudio/vc10/testzlibdll.vcxproj.filters | 2 +- third-party/zlib/contrib/vstudio/vc10/zlib.rc | 8 +- .../contrib/vstudio/vc10/zlibstat.vcxproj | 50 +- .../vstudio/vc10/zlibstat.vcxproj.filters | 3 - .../zlib/contrib/vstudio/vc10/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc10/zlibvc.vcxproj | 58 +- .../vstudio/vc10/zlibvc.vcxproj.filters | 3 - .../contrib/vstudio/vc11/testzlib.vcxproj | 24 +- third-party/zlib/contrib/vstudio/vc11/zlib.rc | 8 +- .../contrib/vstudio/vc11/zlibstat.vcxproj | 34 +- .../zlib/contrib/vstudio/vc11/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc11/zlibvc.vcxproj | 58 +- .../contrib/vstudio/vc12/testzlib.vcxproj | 24 +- third-party/zlib/contrib/vstudio/vc12/zlib.rc | 8 +- .../contrib/vstudio/vc12/zlibstat.vcxproj | 34 +- .../zlib/contrib/vstudio/vc12/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc12/zlibvc.vcxproj | 58 +- .../contrib/vstudio/vc14/testzlib.vcxproj | 24 +- third-party/zlib/contrib/vstudio/vc14/zlib.rc | 8 +- .../contrib/vstudio/vc14/zlibstat.vcxproj | 34 +- .../zlib/contrib/vstudio/vc14/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc14/zlibvc.vcxproj | 58 +- .../zlib/contrib/vstudio/vc9/miniunz.vcproj | 2 +- .../zlib/contrib/vstudio/vc9/minizip.vcproj | 2 +- .../zlib/contrib/vstudio/vc9/testzlib.vcproj | 66 +- .../contrib/vstudio/vc9/testzlibdll.vcproj | 2 +- 
third-party/zlib/contrib/vstudio/vc9/zlib.rc | 8 +- .../zlib/contrib/vstudio/vc9/zlibstat.vcproj | 76 +- .../zlib/contrib/vstudio/vc9/zlibvc.def | 2 +- .../zlib/contrib/vstudio/vc9/zlibvc.vcproj | 82 +- third-party/zlib/crc32.c | 255 +- third-party/zlib/deflate.c | 753 ++--- third-party/zlib/deflate.h | 20 +- third-party/zlib/examples/enough.c | 2 +- third-party/zlib/examples/fitblk.c | 6 +- third-party/zlib/examples/gun.c | 2 +- third-party/zlib/examples/gzappend.c | 4 +- third-party/zlib/examples/gzlog.h | 2 +- third-party/zlib/examples/zlib_how.html | 26 +- third-party/zlib/examples/zran.c | 738 +++-- third-party/zlib/examples/zran.h | 69 +- third-party/zlib/gzclose.c | 4 +- third-party/zlib/gzguts.h | 23 +- third-party/zlib/gzlib.c | 101 +- third-party/zlib/gzread.c | 96 +- third-party/zlib/gzwrite.c | 84 +- third-party/zlib/infback.c | 47 +- third-party/zlib/inffast.c | 5 +- third-party/zlib/inffast.h | 2 +- third-party/zlib/inflate.c | 136 +- third-party/zlib/inftrees.c | 17 +- third-party/zlib/inftrees.h | 8 +- third-party/zlib/make_vms.com | 4 +- third-party/zlib/os400/README400 | 6 +- third-party/zlib/os400/bndsrc | 8 + third-party/zlib/os400/zlib.inc | 8 +- third-party/zlib/qnx/package.qpg | 10 +- third-party/zlib/test/example.c | 106 +- third-party/zlib/test/infcover.c | 5 +- third-party/zlib/test/minigzip.c | 174 +- third-party/zlib/treebuild.xml | 4 +- third-party/zlib/trees.c | 629 ++-- third-party/zlib/uncompr.c | 16 +- third-party/zlib/win32/README-WIN32.txt | 4 +- third-party/zlib/win32/zlib1.rc | 2 +- third-party/zlib/zconf.h.cmakein | 27 +- third-party/zlib/zconf.h.in | 27 +- third-party/zlib/zconf.h.included | 551 ++++ third-party/zlib/zlib.3 | 6 +- third-party/zlib/zlib.3.pdf | Bin 8848 -> 19505 bytes third-party/zlib/zlib.h | 389 +-- third-party/zlib/zlib2ansi | 152 - third-party/zlib/zutil.c | 62 +- third-party/zlib/zutil.h | 19 +- third-party/zstd/.circleci/config.yml | 3 +- .../zstd/.circleci/images/primary/Dockerfile | 2 +- 
third-party/zstd/.cirrus.yml | 3 +- .../zstd/.github/ISSUE_TEMPLATE/bug_report.md | 4 +- third-party/zstd/.github/dependabot.yml | 6 + .../zstd/.github/workflows/dev-long-tests.yml | 189 +- .../.github/workflows/dev-short-tests.yml | 383 ++- .../workflows/publish-release-artifacts.yml | 11 +- .../zstd/.github/workflows/scorecards.yml | 64 + .../.github/workflows/windows-artifacts.yml | 51 + third-party/zstd/.travis.yml | 6 +- third-party/zstd/CHANGELOG | 79 + third-party/zstd/CONTRIBUTING.md | 20 +- third-party/zstd/LICENSE | 8 +- third-party/zstd/Makefile | 62 +- third-party/zstd/README.md | 48 +- third-party/zstd/TESTING.md | 2 +- .../zstd/build/VS2008/zstd/zstd.vcproj | 4 + .../build/VS2010/libzstd-dll/libzstd-dll.rc | 4 +- .../VS2010/libzstd-dll/libzstd-dll.vcxproj | 1 - .../zstd/build/VS2010/libzstd/libzstd.vcxproj | 1 - third-party/zstd/build/VS2010/zstd/zstd.rc | 4 +- .../zstd/build/VS2010/zstd/zstd.vcxproj | 6 + third-party/zstd/build/cmake/CMakeLists.txt | 29 +- .../AddZstdCompilationFlags.cmake | 76 +- .../build/cmake/CMakeModules/JoinPaths.cmake | 23 + third-party/zstd/build/cmake/README.md | 6 +- .../zstd/build/cmake/contrib/CMakeLists.txt | 2 +- .../cmake/contrib/gen_html/CMakeLists.txt | 2 +- .../build/cmake/contrib/pzstd/CMakeLists.txt | 2 +- .../zstd/build/cmake/lib/CMakeLists.txt | 43 +- .../zstd/build/cmake/programs/CMakeLists.txt | 6 +- .../zstd/build/cmake/tests/CMakeLists.txt | 23 +- .../build/meson/contrib/pzstd/meson.build | 3 +- third-party/zstd/build/meson/lib/meson.build | 36 +- third-party/zstd/build/meson/meson.build | 27 +- .../zstd/build/meson/meson_options.txt | 2 +- .../zstd/build/meson/programs/meson.build | 36 +- .../zstd/build/meson/tests/meson.build | 63 +- .../zstd/build/meson/tests/valgrindTest.py | 10 +- .../zstd/build/single_file_libs/README.md | 4 +- .../single_file_libs/build_decoder_test.sh | 2 +- .../single_file_libs/build_library_test.sh | 4 +- .../zstd/build/single_file_libs/combine.py | 234 ++ 
.../zstd/build/single_file_libs/combine.sh | 60 +- .../create_single_file_decoder.sh | 9 +- .../create_single_file_library.sh | 9 +- .../zstd/build/single_file_libs/zstd-in.c | 10 +- .../build/single_file_libs/zstddeclib-in.c | 10 +- .../zstd/contrib/VS2005/zstd/zstd.vcproj | 4 + .../zstd/contrib/diagnose_corruption/Makefile | 2 +- .../diagnose_corruption/check_flipped_bits.c | 2 +- third-party/zstd/contrib/docker/Dockerfile | 4 +- .../externalSequenceProducer/.gitignore | 2 + .../contrib/externalSequenceProducer/Makefile | 40 + .../externalSequenceProducer/README.md | 14 + .../contrib/externalSequenceProducer/main.c | 107 + .../sequence_producer.c | 80 + .../sequence_producer.h | 26 + .../contrib/freestanding_lib/freestanding.py | 33 +- third-party/zstd/contrib/gen_html/Makefile | 2 +- .../zstd/contrib/gen_html/gen_html.cpp | 2 +- .../zstd/contrib/largeNbDicts/Makefile | 2 +- .../zstd/contrib/largeNbDicts/README.md | 24 +- .../zstd/contrib/largeNbDicts/largeNbDicts.c | 187 +- .../zstd/contrib/linux-kernel/Makefile | 15 +- .../contrib/linux-kernel/decompress_sources.h | 2 +- .../zstd/contrib/linux-kernel/linux.mk | 22 +- .../zstd/contrib/linux-kernel/linux_zstd.h | 2 +- third-party/zstd/contrib/linux-kernel/mem.h | 2 +- .../zstd/contrib/linux-kernel/test/Makefile | 3 +- .../test/include/linux/compiler.h | 2 +- .../linux-kernel/test/include/linux/errno.h | 2 +- .../linux-kernel/test/include/linux/kernel.h | 2 +- .../linux-kernel/test/include/linux/limits.h | 2 +- .../linux-kernel/test/include/linux/math64.h | 2 +- .../linux-kernel/test/include/linux/module.h | 4 +- .../linux-kernel/test/include/linux/printk.h | 2 +- .../linux-kernel/test/include/linux/stddef.h | 2 +- .../linux-kernel/test/include/linux/swab.h | 2 +- .../linux-kernel/test/include/linux/types.h | 2 +- .../linux-kernel/test/include/linux/xxhash.h | 4 +- .../contrib/linux-kernel/test/static_test.c | 2 +- .../zstd/contrib/linux-kernel/test/test.c | 8 +- .../contrib/linux-kernel/zstd_common_module.c | 29 
+ .../linux-kernel/zstd_compress_module.c | 8 +- .../linux-kernel/zstd_decompress_module.c | 2 +- .../zstd/contrib/linux-kernel/zstd_deps.h | 4 +- .../zstd/contrib/match_finders/README.md | 8 +- .../zstd/contrib/match_finders/zstd_edist.c | 24 +- .../zstd/contrib/match_finders/zstd_edist.h | 10 +- third-party/zstd/contrib/pzstd/ErrorHolder.h | 2 +- third-party/zstd/contrib/pzstd/Logging.h | 2 +- third-party/zstd/contrib/pzstd/Makefile | 10 +- third-party/zstd/contrib/pzstd/Options.cpp | 2 +- third-party/zstd/contrib/pzstd/Options.h | 2 +- third-party/zstd/contrib/pzstd/Pzstd.cpp | 11 +- third-party/zstd/contrib/pzstd/Pzstd.h | 2 +- third-party/zstd/contrib/pzstd/README.md | 2 +- .../zstd/contrib/pzstd/SkippableFrame.cpp | 2 +- .../zstd/contrib/pzstd/SkippableFrame.h | 2 +- third-party/zstd/contrib/pzstd/main.cpp | 2 +- .../zstd/contrib/pzstd/test/OptionsTest.cpp | 2 +- .../zstd/contrib/pzstd/test/PzstdTest.cpp | 2 +- .../zstd/contrib/pzstd/test/RoundTrip.h | 2 +- .../zstd/contrib/pzstd/test/RoundTripTest.cpp | 2 +- third-party/zstd/contrib/pzstd/utils/Buffer.h | 2 +- .../zstd/contrib/pzstd/utils/FileSystem.h | 22 +- third-party/zstd/contrib/pzstd/utils/Likely.h | 2 +- .../zstd/contrib/pzstd/utils/Portability.h | 16 + third-party/zstd/contrib/pzstd/utils/Range.h | 4 +- .../zstd/contrib/pzstd/utils/ResourcePool.h | 2 +- .../zstd/contrib/pzstd/utils/ScopeGuard.h | 4 +- .../zstd/contrib/pzstd/utils/ThreadPool.h | 2 +- .../zstd/contrib/pzstd/utils/WorkQueue.h | 2 +- .../contrib/pzstd/utils/test/BufferTest.cpp | 2 +- .../contrib/pzstd/utils/test/RangeTest.cpp | 2 +- .../pzstd/utils/test/ResourcePoolTest.cpp | 2 +- .../pzstd/utils/test/ScopeGuardTest.cpp | 2 +- .../pzstd/utils/test/ThreadPoolTest.cpp | 2 +- .../pzstd/utils/test/WorkQueueTest.cpp | 2 +- third-party/zstd/contrib/recovery/Makefile | 2 +- .../zstd/contrib/recovery/recover_directory.c | 2 +- .../zstd/contrib/seekable_format/README.md | 42 + .../contrib/seekable_format/examples/Makefile | 2 +- 
.../examples/parallel_compression.c | 4 +- .../examples/parallel_processing.c | 4 +- .../examples/seekable_compression.c | 19 +- .../examples/seekable_decompression.c | 4 +- .../examples/seekable_decompression_mem.c | 2 +- .../contrib/seekable_format/tests/Makefile | 2 +- .../seekable_format/tests/seekable_tests.c | 169 +- .../contrib/seekable_format/zstd_seekable.h | 21 +- .../zstd_seekable_compression_format.md | 2 +- .../seekable_format/zstdseek_compress.c | 6 +- .../seekable_format/zstdseek_decompress.c | 61 +- third-party/zstd/contrib/seqBench/Makefile | 58 + third-party/zstd/contrib/seqBench/seqBench.c | 53 + third-party/zstd/doc/README.md | 5 +- third-party/zstd/doc/decompressor_errata.md | 84 + .../zstd/doc/educational_decoder/Makefile | 2 +- .../zstd/doc/educational_decoder/harness.c | 2 +- .../doc/educational_decoder/zstd_decompress.c | 4 +- .../doc/educational_decoder/zstd_decompress.h | 2 +- third-party/zstd/doc/images/zstd_logo86.png | Bin 5963 -> 13069 bytes .../zstd/doc/zstd_compression_format.md | 88 +- third-party/zstd/doc/zstd_manual.html | 411 ++- third-party/zstd/examples/Makefile | 2 +- third-party/zstd/examples/common.h | 32 +- .../zstd/examples/dictionary_compression.c | 16 +- .../zstd/examples/dictionary_decompression.c | 2 +- .../examples/multiple_simple_compression.c | 2 +- .../examples/multiple_streaming_compression.c | 2 +- .../zstd/examples/simple_compression.c | 2 +- .../zstd/examples/simple_decompression.c | 2 +- .../zstd/examples/streaming_compression.c | 2 +- .../streaming_compression_thread_pool.c | 2 +- .../zstd/examples/streaming_decompression.c | 2 +- .../zstd/examples/streaming_memory_usage.c | 2 +- third-party/zstd/lib/Makefile | 4 +- third-party/zstd/lib/README.md | 9 +- third-party/zstd/lib/common/allocations.h | 55 + third-party/zstd/lib/common/bits.h | 200 ++ third-party/zstd/lib/common/bitstream.h | 79 +- third-party/zstd/lib/common/compiler.h | 29 +- third-party/zstd/lib/common/cpu.h | 2 +- 
third-party/zstd/lib/common/debug.c | 2 +- third-party/zstd/lib/common/debug.h | 2 +- third-party/zstd/lib/common/entropy_common.c | 52 +- third-party/zstd/lib/common/error_private.c | 11 +- third-party/zstd/lib/common/error_private.h | 2 +- third-party/zstd/lib/common/fse.h | 88 +- third-party/zstd/lib/common/fse_decompress.c | 106 +- third-party/zstd/lib/common/huf.h | 221 +- third-party/zstd/lib/common/mem.h | 85 +- third-party/zstd/lib/common/pool.c | 36 +- third-party/zstd/lib/common/pool.h | 8 +- .../zstd/lib/common/portability_macros.h | 25 +- third-party/zstd/lib/common/threading.c | 82 +- third-party/zstd/lib/common/threading.h | 15 +- third-party/zstd/lib/common/xxhash.c | 4 +- third-party/zstd/lib/common/xxhash.h | 16 +- third-party/zstd/lib/common/zstd_common.c | 37 +- third-party/zstd/lib/common/zstd_deps.h | 2 +- third-party/zstd/lib/common/zstd_internal.h | 135 +- third-party/zstd/lib/common/zstd_trace.h | 6 +- third-party/zstd/lib/compress/clevels.h | 2 +- third-party/zstd/lib/compress/fse_compress.c | 131 +- third-party/zstd/lib/compress/hist.c | 2 +- third-party/zstd/lib/compress/hist.h | 2 +- third-party/zstd/lib/compress/huf_compress.c | 403 ++- third-party/zstd/lib/compress/zstd_compress.c | 1781 +++++++---- .../lib/compress/zstd_compress_internal.h | 376 ++- .../lib/compress/zstd_compress_literals.c | 154 +- .../lib/compress/zstd_compress_literals.h | 24 +- .../lib/compress/zstd_compress_sequences.c | 6 +- .../lib/compress/zstd_compress_sequences.h | 2 +- .../lib/compress/zstd_compress_superblock.c | 46 +- .../lib/compress/zstd_compress_superblock.h | 2 +- third-party/zstd/lib/compress/zstd_cwksp.h | 190 +- .../zstd/lib/compress/zstd_double_fast.c | 128 +- .../zstd/lib/compress/zstd_double_fast.h | 5 +- third-party/zstd/lib/compress/zstd_fast.c | 581 +++- third-party/zstd/lib/compress/zstd_fast.h | 5 +- third-party/zstd/lib/compress/zstd_lazy.c | 743 +++-- third-party/zstd/lib/compress/zstd_lazy.h | 6 +- 
third-party/zstd/lib/compress/zstd_ldm.c | 10 +- third-party/zstd/lib/compress/zstd_ldm.h | 2 +- .../zstd/lib/compress/zstd_ldm_geartab.h | 2 +- third-party/zstd/lib/compress/zstd_opt.c | 186 +- third-party/zstd/lib/compress/zstd_opt.h | 2 +- .../zstd/lib/compress/zstdmt_compress.c | 26 +- .../zstd/lib/compress/zstdmt_compress.h | 2 +- .../zstd/lib/decompress/huf_decompress.c | 875 +++-- .../lib/decompress/huf_decompress_amd64.S | 69 +- third-party/zstd/lib/decompress/zstd_ddict.c | 8 +- third-party/zstd/lib/decompress/zstd_ddict.h | 2 +- .../zstd/lib/decompress/zstd_decompress.c | 285 +- .../lib/decompress/zstd_decompress_block.c | 282 +- .../lib/decompress/zstd_decompress_block.h | 7 +- .../lib/decompress/zstd_decompress_internal.h | 6 +- third-party/zstd/lib/deprecated/zbuff.h | 2 +- .../zstd/lib/deprecated/zbuff_common.c | 2 +- .../zstd/lib/deprecated/zbuff_compress.c | 2 +- .../zstd/lib/deprecated/zbuff_decompress.c | 4 +- third-party/zstd/lib/dictBuilder/cover.c | 34 +- third-party/zstd/lib/dictBuilder/cover.h | 2 +- third-party/zstd/lib/dictBuilder/fastcover.c | 4 +- third-party/zstd/lib/dictBuilder/zdict.c | 100 +- third-party/zstd/lib/dll/example/Makefile | 2 +- third-party/zstd/lib/dll/example/README.md | 2 +- third-party/zstd/lib/legacy/zstd_legacy.h | 9 +- third-party/zstd/lib/legacy/zstd_v01.c | 63 +- third-party/zstd/lib/legacy/zstd_v01.h | 2 +- third-party/zstd/lib/legacy/zstd_v02.c | 85 +- third-party/zstd/lib/legacy/zstd_v02.h | 2 +- third-party/zstd/lib/legacy/zstd_v03.c | 89 +- third-party/zstd/lib/legacy/zstd_v03.h | 2 +- third-party/zstd/lib/legacy/zstd_v04.c | 84 +- third-party/zstd/lib/legacy/zstd_v04.h | 2 +- third-party/zstd/lib/legacy/zstd_v05.c | 104 +- third-party/zstd/lib/legacy/zstd_v05.h | 2 +- third-party/zstd/lib/legacy/zstd_v06.c | 113 +- third-party/zstd/lib/legacy/zstd_v06.h | 2 +- third-party/zstd/lib/legacy/zstd_v07.c | 101 +- third-party/zstd/lib/legacy/zstd_v07.h | 2 +- third-party/zstd/lib/libzstd.mk | 49 +- 
third-party/zstd/lib/libzstd.pc.in | 6 +- third-party/zstd/lib/module.modulemap | 16 +- third-party/zstd/lib/zdict.h | 84 +- third-party/zstd/lib/zstd.h | 715 ++++- third-party/zstd/lib/zstd_errors.h | 35 +- third-party/zstd/programs/Makefile | 57 +- third-party/zstd/programs/README.md | 5 +- third-party/zstd/programs/benchfn.c | 8 +- third-party/zstd/programs/benchfn.h | 4 +- third-party/zstd/programs/benchzstd.c | 82 +- third-party/zstd/programs/benchzstd.h | 53 +- third-party/zstd/programs/datagen.c | 2 +- third-party/zstd/programs/datagen.h | 2 +- third-party/zstd/programs/dibio.c | 31 +- third-party/zstd/programs/dibio.h | 2 +- third-party/zstd/programs/fileio.c | 1747 +++++----- third-party/zstd/programs/fileio.h | 23 +- third-party/zstd/programs/fileio_asyncio.c | 663 ++++ third-party/zstd/programs/fileio_asyncio.h | 203 ++ third-party/zstd/programs/fileio_common.h | 125 + third-party/zstd/programs/fileio_types.h | 86 + third-party/zstd/programs/platform.h | 14 +- third-party/zstd/programs/timefn.c | 139 +- third-party/zstd/programs/timefn.h | 59 +- third-party/zstd/programs/util.c | 367 ++- third-party/zstd/programs/util.h | 41 +- third-party/zstd/programs/windres/verrsrc.h | 2 +- third-party/zstd/programs/windres/zstd.rc | 4 +- third-party/zstd/programs/zstd.1 | 406 +-- third-party/zstd/programs/zstd.1.md | 280 +- third-party/zstd/programs/zstdcli.c | 477 +-- third-party/zstd/programs/zstdcli_trace.c | 2 +- third-party/zstd/programs/zstdcli_trace.h | 2 +- third-party/zstd/programs/zstdgrep.1 | 21 +- third-party/zstd/programs/zstdgrep.1.md | 10 +- third-party/zstd/programs/zstdless | 8 +- third-party/zstd/programs/zstdless.1 | 13 +- third-party/zstd/programs/zstdless.1.md | 6 +- third-party/zstd/tests/.gitignore | 1 + .../zstd/tests/DEPRECATED-test-zstd-speed.py | 2 +- third-party/zstd/tests/Makefile | 91 +- third-party/zstd/tests/README.md | 6 +- .../zstd/tests/automated_benchmarking.py | 4 +- third-party/zstd/tests/bigdict.c | 2 +- 
third-party/zstd/tests/checkTag.c | 2 +- third-party/zstd/tests/check_size.py | 31 + third-party/zstd/tests/cli-tests/.gitignore | 6 + third-party/zstd/tests/cli-tests/README.md | 258 ++ .../zstd/tests/cli-tests/basic/help.sh | 10 + .../tests/cli-tests/basic/help.sh.stdout.glob | 34 + .../zstd/tests/cli-tests/basic/memlimit.sh | 40 + .../cli-tests/basic/memlimit.sh.stderr.exact | 13 + .../cli-tests/basic/memlimit.sh.stdout.exact | 13 + .../zstd/tests/cli-tests/basic/output_dir.sh | 7 + .../basic/output_dir.sh.stderr.exact | 2 + .../basic/output_dir.sh.stdout.exact | 2 + .../zstd/tests/cli-tests/basic/version.sh | 6 + .../cli-tests/basic/version.sh.stdout.glob | 2 + third-party/zstd/tests/cli-tests/bin/cmp_size | 44 + third-party/zstd/tests/cli-tests/bin/datagen | 3 + third-party/zstd/tests/cli-tests/bin/die | 4 + third-party/zstd/tests/cli-tests/bin/println | 2 + third-party/zstd/tests/cli-tests/bin/unzstd | 1 + third-party/zstd/tests/cli-tests/bin/zstd | 9 + third-party/zstd/tests/cli-tests/bin/zstdcat | 1 + third-party/zstd/tests/cli-tests/bin/zstdgrep | 2 + third-party/zstd/tests/cli-tests/bin/zstdless | 2 + .../zstd/tests/cli-tests/cltools/setup | 6 + .../zstd/tests/cli-tests/cltools/zstdgrep.sh | 8 + .../tests/cli-tests/cltools/zstdgrep.sh.exit | 1 + .../cltools/zstdgrep.sh.stderr.exact | 1 + .../cli-tests/cltools/zstdgrep.sh.stdout.glob | 4 + .../zstd/tests/cli-tests/cltools/zstdless.sh | 10 + .../cltools/zstdless.sh.stderr.exact | 2 + .../cli-tests/cltools/zstdless.sh.stdout.glob | 5 + .../zstd/tests/cli-tests/common/format.sh | 19 + .../zstd/tests/cli-tests/common/mtime.sh | 13 + .../tests/cli-tests/common/permissions.sh | 18 + .../zstd/tests/cli-tests/common/platform.sh | 37 + .../zstd/tests/cli-tests/compression/adapt.sh | 14 + .../zstd/tests/cli-tests/compression/basic.sh | 36 + .../compression/compress-literals.sh | 10 + .../tests/cli-tests/compression/format.sh | 16 + .../tests/cli-tests/compression/golden.sh | 16 + 
.../cli-tests/compression/gzip-compat.sh | 17 + .../tests/cli-tests/compression/levels.sh | 62 + .../compression/levels.sh.stderr.exact | 69 + .../compression/long-distance-matcher.sh | 7 + .../cli-tests/compression/multi-threaded.sh | 15 + .../multi-threaded.sh.stderr.exact | 11 + .../cli-tests/compression/multiple-files.sh | 21 + .../multiple-files.sh.stdout.exact | 12 + .../cli-tests/compression/row-match-finder.sh | 7 + .../zstd/tests/cli-tests/compression/setup | 7 + .../cli-tests/compression/stream-size.sh | 7 + .../cli-tests/compression/verbose-wlog.sh | 11 + .../compression/verbose-wlog.sh.stderr.glob | 5 + .../compression/verbose-wlog.sh.stdout.glob | 5 + .../cli-tests/compression/window-resize.sh | 9 + .../window-resize.sh.stderr.ignore | 0 .../compression/window-resize.sh.stdout.glob | 3 + .../tests/cli-tests/decompression/golden.sh | 7 + .../cli-tests/decompression/pass-through.sh | 57 + .../pass-through.sh.stderr.exact | 11 + .../pass-through.sh.stdout.exact | 25 + .../cli-tests/dict-builder/empty-input.sh | 9 + .../dict-builder/empty-input.sh.stderr.exact | 1 + .../tests/cli-tests/dict-builder/no-inputs.sh | 3 + .../cli-tests/dict-builder/no-inputs.sh.exit | 1 + .../dict-builder/no-inputs.sh.stderr.exact | 5 + .../dictionaries/dictionary-mismatch.sh | 29 + .../dictionary-mismatch.sh.stderr.exact | 7 + .../tests/cli-tests/dictionaries/golden.sh | 9 + .../zstd/tests/cli-tests/dictionaries/setup | 6 + .../tests/cli-tests/dictionaries/setup_once | 24 + ...compress-file-to-dir-without-write-perm.sh | 12 + ...-to-dir-without-write-perm.sh.stderr.exact | 26 + .../file-stat/compress-file-to-file.sh | 9 + .../compress-file-to-file.sh.stderr.exact | 42 + .../file-stat/compress-file-to-stdout.sh | 8 + .../compress-file-to-stdout.sh.stderr.exact | 24 + .../file-stat/compress-stdin-to-file.sh | 8 + .../compress-stdin-to-file.sh.stderr.exact | 24 + .../file-stat/compress-stdin-to-stdout.sh | 8 + .../compress-stdin-to-stdout.sh.stderr.exact | 18 + 
.../file-stat/decompress-file-to-file.sh | 8 + .../decompress-file-to-file.sh.stderr.exact | 38 + .../file-stat/decompress-file-to-stdout.sh | 7 + .../decompress-file-to-stdout.sh.stderr.exact | 18 + .../file-stat/decompress-stdin-to-file.sh | 7 + .../decompress-stdin-to-file.sh.stderr.exact | 20 + .../file-stat/decompress-stdin-to-stdout.sh | 7 + ...decompress-stdin-to-stdout.sh.stderr.exact | 14 + .../tests/cli-tests/progress/no-progress.sh | 46 + .../progress/no-progress.sh.stderr.glob | 96 + .../zstd/tests/cli-tests/progress/progress.sh | 41 + .../progress/progress.sh.stderr.glob | 62 + third-party/zstd/tests/cli-tests/run.py | 731 +++++ .../zstd/tests/cli-tests/zstd-symlinks/setup | 6 + .../tests/cli-tests/zstd-symlinks/zstdcat.sh | 12 + .../zstd-symlinks/zstdcat.sh.stdout.exact | 8 + third-party/zstd/tests/datagencli.c | 4 +- third-party/zstd/tests/decodecorpus.c | 63 +- third-party/zstd/tests/external_matchfinder.c | 140 + third-party/zstd/tests/external_matchfinder.h | 39 + third-party/zstd/tests/fullbench.c | 13 +- third-party/zstd/tests/fuzz/Makefile | 34 +- third-party/zstd/tests/fuzz/README.md | 18 + .../zstd/tests/fuzz/block_decompress.c | 8 +- .../zstd/tests/fuzz/block_round_trip.c | 6 +- .../tests/fuzz/decompress_dstSize_tooSmall.c | 6 +- .../zstd/tests/fuzz/dictionary_decompress.c | 6 +- .../zstd/tests/fuzz/dictionary_loader.c | 7 +- .../zstd/tests/fuzz/dictionary_round_trip.c | 6 +- .../tests/fuzz/dictionary_stream_round_trip.c | 5 +- third-party/zstd/tests/fuzz/fse_read_ncount.c | 2 +- third-party/zstd/tests/fuzz/fuzz.h | 10 +- third-party/zstd/tests/fuzz/fuzz.py | 15 +- .../zstd/tests/fuzz/fuzz_data_producer.c | 3 +- .../zstd/tests/fuzz/fuzz_data_producer.h | 3 +- third-party/zstd/tests/fuzz/fuzz_helpers.c | 18 +- third-party/zstd/tests/fuzz/fuzz_helpers.h | 9 +- .../tests/fuzz/fuzz_third_party_seq_prod.h | 116 + third-party/zstd/tests/fuzz/huf_decompress.c | 20 +- third-party/zstd/tests/fuzz/huf_round_trip.c | 31 +- 
.../tests/fuzz/raw_dictionary_round_trip.c | 8 +- .../zstd/tests/fuzz/regression_driver.c | 2 +- .../zstd/tests/fuzz/seekable_roundtrip.c | 2 +- .../tests/fuzz/seq_prod_fuzz_example/Makefile | 16 + .../fuzz/seq_prod_fuzz_example/README.md | 12 + .../seq_prod_fuzz_example/example_seq_prod.c | 52 + .../tests/fuzz/sequence_compression_api.c | 292 +- third-party/zstd/tests/fuzz/simple_compress.c | 8 +- .../zstd/tests/fuzz/simple_decompress.c | 2 +- .../zstd/tests/fuzz/simple_round_trip.c | 50 +- .../zstd/tests/fuzz/stream_decompress.c | 10 +- .../zstd/tests/fuzz/stream_round_trip.c | 23 +- third-party/zstd/tests/fuzz/zstd_frame_info.c | 2 +- third-party/zstd/tests/fuzz/zstd_helpers.c | 64 +- third-party/zstd/tests/fuzz/zstd_helpers.h | 7 +- third-party/zstd/tests/fuzzer.c | 901 +++++- .../PR-3517-block-splitter-corruption-test | 1 + .../large-literal-and-match-lengths | Bin 0 -> 199998 bytes third-party/zstd/tests/gzip/Makefile | 2 +- third-party/zstd/tests/gzip/gzip-env.sh | 2 +- third-party/zstd/tests/gzip/helin-segv.sh | 2 +- third-party/zstd/tests/gzip/help-version.sh | 2 +- third-party/zstd/tests/gzip/hufts.sh | 2 +- third-party/zstd/tests/gzip/init.sh | 2 +- third-party/zstd/tests/gzip/keep.sh | 2 +- third-party/zstd/tests/gzip/list.sh | 2 +- third-party/zstd/tests/gzip/memcpy-abuse.sh | 2 +- third-party/zstd/tests/gzip/mixed.sh | 2 +- .../zstd/tests/gzip/null-suffix-clobber.sh | 2 +- third-party/zstd/tests/gzip/stdin.sh | 2 +- third-party/zstd/tests/gzip/test-driver.sh | 2 +- third-party/zstd/tests/gzip/trailing-nul.sh | 2 +- third-party/zstd/tests/gzip/unpack-invalid.sh | 2 +- third-party/zstd/tests/gzip/z-suffix.sh | 2 +- third-party/zstd/tests/gzip/zdiff.sh | 2 +- third-party/zstd/tests/gzip/zgrep-context.sh | 2 +- third-party/zstd/tests/gzip/zgrep-f.sh | 2 +- third-party/zstd/tests/gzip/zgrep-signal.sh | 2 +- third-party/zstd/tests/gzip/znew-k.sh | 2 +- third-party/zstd/tests/invalidDictionaries.c | 2 +- third-party/zstd/tests/legacy.c | 2 +- 
third-party/zstd/tests/longmatch.c | 35 +- third-party/zstd/tests/paramgrill.c | 42 +- third-party/zstd/tests/playTests.sh | 167 +- third-party/zstd/tests/poolTests.c | 2 +- third-party/zstd/tests/rateLimiter.py | 2 +- third-party/zstd/tests/regression/.gitignore | 1 + third-party/zstd/tests/regression/Makefile | 2 +- third-party/zstd/tests/regression/config.c | 6 +- third-party/zstd/tests/regression/config.h | 2 +- third-party/zstd/tests/regression/data.c | 2 +- third-party/zstd/tests/regression/data.h | 2 +- third-party/zstd/tests/regression/levels.h | 2 +- third-party/zstd/tests/regression/method.c | 2 +- third-party/zstd/tests/regression/method.h | 2 +- third-party/zstd/tests/regression/result.c | 2 +- third-party/zstd/tests/regression/result.h | 2 +- third-party/zstd/tests/regression/results.csv | 1362 ++++---- third-party/zstd/tests/regression/test.c | 2 +- third-party/zstd/tests/roundTripCrash.c | 2 +- third-party/zstd/tests/seqgen.c | 2 +- third-party/zstd/tests/seqgen.h | 2 +- third-party/zstd/tests/test-license.py | 6 +- third-party/zstd/tests/test-zstd-versions.py | 161 +- third-party/zstd/tests/zstreamtest.c | 704 ++++- third-party/zstd/zlibWrapper/Makefile | 7 +- third-party/zstd/zlibWrapper/README.md | 4 +- .../zstd/zlibWrapper/examples/example.c | 26 +- .../zlibWrapper/examples/example_original.c | 26 +- .../zstd/zlibWrapper/examples/minigzip.c | 36 +- .../zstd/zlibWrapper/examples/zwrapbench.c | 32 +- third-party/zstd/zlibWrapper/gzclose.c | 2 +- .../zstd/zlibWrapper/gzcompatibility.h | 16 +- third-party/zstd/zlibWrapper/gzguts.h | 20 +- third-party/zstd/zlibWrapper/gzlib.c | 6 +- third-party/zstd/zlibWrapper/gzread.c | 20 +- third-party/zstd/zlibWrapper/gzwrite.c | 10 +- .../zstd/zlibWrapper/zstd_zlibwrapper.c | 90 +- .../zstd/zlibWrapper/zstd_zlibwrapper.h | 5 +- 873 files changed, 73758 insertions(+), 16905 deletions(-) delete mode 100644 common/sha.h delete mode 100644 elf/arch-mips64.cc create mode 100755 install-cross-tools.sh delete mode 
100755 test/elf/as-needed2.sh create mode 100755 test/elf/copyrel-relro2.sh create mode 100644 test/elf/lto-nostdlib.sh create mode 100755 test/elf/relocatable-c++.sh create mode 100755 test/elf/riscv64_relax-hi20.sh create mode 100755 test/elf/tls-gd-dlopen.sh create mode 100755 test/elf/tlsdesc-dlopen.sh create mode 100755 test/elf/tlsdesc-initial-exec.sh create mode 100755 test/elf/tlsdesc-local-dynamic.sh create mode 100755 test/elf/version-script19.sh create mode 100644 third-party/blake3/.github/workflows/build_b3sum.py create mode 100644 third-party/blake3/.github/workflows/ci.yml create mode 100644 third-party/blake3/.github/workflows/tag.yml create mode 100755 third-party/blake3/.github/workflows/upload_github_release_asset.py create mode 100644 third-party/blake3/.gitignore create mode 100644 third-party/blake3/CONTRIBUTING.md create mode 100644 third-party/blake3/Cargo.toml create mode 100644 third-party/blake3/LICENSE create mode 100644 third-party/blake3/README.md create mode 100644 third-party/blake3/b3sum/.gitignore create mode 100644 third-party/blake3/b3sum/Cargo.lock create mode 100644 third-party/blake3/b3sum/Cargo.toml create mode 100644 third-party/blake3/b3sum/README.md create mode 100644 third-party/blake3/b3sum/src/main.rs create mode 100644 third-party/blake3/b3sum/src/unit_tests.rs create mode 100644 third-party/blake3/b3sum/tests/cli_tests.rs create mode 100644 third-party/blake3/b3sum/what_does_check_do.md create mode 100644 third-party/blake3/benches/bench.rs create mode 100644 third-party/blake3/build.rs create mode 100644 third-party/blake3/c/.gitignore create mode 100644 third-party/blake3/c/CMakeLists.txt create mode 100644 third-party/blake3/c/Makefile.testing create mode 100644 third-party/blake3/c/README.md create mode 100644 third-party/blake3/c/blake3-config.cmake.in create mode 100644 third-party/blake3/c/blake3.c create mode 100644 third-party/blake3/c/blake3.h create mode 100644 third-party/blake3/c/blake3_avx2.c create mode 
100644 third-party/blake3/c/blake3_avx2_x86-64_unix.S create mode 100644 third-party/blake3/c/blake3_avx2_x86-64_windows_gnu.S create mode 100644 third-party/blake3/c/blake3_avx2_x86-64_windows_msvc.asm create mode 100644 third-party/blake3/c/blake3_avx512.c create mode 100644 third-party/blake3/c/blake3_avx512_x86-64_unix.S create mode 100644 third-party/blake3/c/blake3_avx512_x86-64_windows_gnu.S create mode 100644 third-party/blake3/c/blake3_avx512_x86-64_windows_msvc.asm create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/README.md create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/benches/bench.rs create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/build.rs create mode 100755 third-party/blake3/c/blake3_c_rust_bindings/cross_test.sh create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs create mode 100644 third-party/blake3/c/blake3_c_rust_bindings/src/test.rs create mode 100644 third-party/blake3/c/blake3_dispatch.c create mode 100644 third-party/blake3/c/blake3_impl.h create mode 100644 third-party/blake3/c/blake3_neon.c create mode 100644 third-party/blake3/c/blake3_portable.c create mode 100644 third-party/blake3/c/blake3_sse2.c create mode 100644 third-party/blake3/c/blake3_sse2_x86-64_unix.S create mode 100644 third-party/blake3/c/blake3_sse2_x86-64_windows_gnu.S create mode 100644 third-party/blake3/c/blake3_sse2_x86-64_windows_msvc.asm create mode 100644 third-party/blake3/c/blake3_sse41.c create mode 100644 third-party/blake3/c/blake3_sse41_x86-64_unix.S create mode 100644 third-party/blake3/c/blake3_sse41_x86-64_windows_gnu.S create mode 100644 third-party/blake3/c/blake3_sse41_x86-64_windows_msvc.asm create mode 100644 third-party/blake3/c/example.c create mode 100644 third-party/blake3/c/libblake3.pc.in create mode 100644 third-party/blake3/c/main.c create mode 100755 third-party/blake3/c/test.py create mode 100644 
third-party/blake3/media/B3.svg create mode 100644 third-party/blake3/media/BLAKE3.svg create mode 100644 third-party/blake3/media/speed.svg create mode 100644 third-party/blake3/reference_impl/Cargo.toml create mode 100644 third-party/blake3/reference_impl/README.md create mode 100644 third-party/blake3/reference_impl/reference_impl.rs create mode 100644 third-party/blake3/src/ffi_avx2.rs create mode 100644 third-party/blake3/src/ffi_avx512.rs create mode 100644 third-party/blake3/src/ffi_neon.rs create mode 100644 third-party/blake3/src/ffi_sse2.rs create mode 100644 third-party/blake3/src/ffi_sse41.rs create mode 100644 third-party/blake3/src/guts.rs create mode 100644 third-party/blake3/src/join.rs create mode 100644 third-party/blake3/src/lib.rs create mode 100644 third-party/blake3/src/platform.rs create mode 100644 third-party/blake3/src/portable.rs create mode 100644 third-party/blake3/src/rust_avx2.rs create mode 100644 third-party/blake3/src/rust_sse2.rs create mode 100644 third-party/blake3/src/rust_sse41.rs create mode 100644 third-party/blake3/src/test.rs create mode 100644 third-party/blake3/src/traits.rs create mode 100644 third-party/blake3/test_vectors/Cargo.toml create mode 100755 third-party/blake3/test_vectors/cross_test.sh create mode 100644 third-party/blake3/test_vectors/src/bin/generate.rs create mode 100644 third-party/blake3/test_vectors/src/lib.rs create mode 100644 third-party/blake3/test_vectors/test_vectors.json create mode 100644 third-party/blake3/tools/compiler_version/Cargo.toml create mode 100644 third-party/blake3/tools/compiler_version/build.rs create mode 100644 third-party/blake3/tools/compiler_version/src/main.rs create mode 100644 third-party/blake3/tools/instruction_set_support/Cargo.toml create mode 100644 third-party/blake3/tools/instruction_set_support/src/main.rs create mode 100644 third-party/blake3/tools/release.md create mode 100644 third-party/tbb/.github/labeler.yml create mode 100644 
third-party/tbb/.github/workflows/labeler.yml create mode 100644 third-party/tbb/SECURITY.md create mode 100644 third-party/tbb/WASM_Support.md create mode 100644 third-party/tbb/cmake/toolchains/riscv64.cmake create mode 100644 third-party/tbb/doc/GSG/integrate.rst create mode 100644 third-party/tbb/doc/GSG/intro.rst create mode 100644 third-party/tbb/doc/GSG/next_steps.rst create mode 100644 third-party/tbb/doc/GSG/samples.rst create mode 100644 third-party/tbb/doc/main/tbb_userguide/std_invoke.rst create mode 100644 third-party/tbb/test/common/test_invoke.h create mode 100644 third-party/tbb/test/tbb/test_join_node_key_matching_n_args.cpp create mode 100644 third-party/zlib/.github/workflows/cmake.yml create mode 100644 third-party/zlib/.github/workflows/configure.yml create mode 100644 third-party/zlib/.github/workflows/fuzz.yml create mode 100644 third-party/zlib/.gitignore create mode 100644 third-party/zlib/LICENSE create mode 100644 third-party/zlib/zconf.h.included delete mode 100755 third-party/zlib/zlib2ansi create mode 100644 third-party/zstd/.github/dependabot.yml create mode 100644 third-party/zstd/.github/workflows/scorecards.yml create mode 100644 third-party/zstd/.github/workflows/windows-artifacts.yml create mode 100644 third-party/zstd/build/cmake/CMakeModules/JoinPaths.cmake create mode 100755 third-party/zstd/build/single_file_libs/combine.py create mode 100644 third-party/zstd/contrib/externalSequenceProducer/.gitignore create mode 100644 third-party/zstd/contrib/externalSequenceProducer/Makefile create mode 100644 third-party/zstd/contrib/externalSequenceProducer/README.md create mode 100644 third-party/zstd/contrib/externalSequenceProducer/main.c create mode 100644 third-party/zstd/contrib/externalSequenceProducer/sequence_producer.c create mode 100644 third-party/zstd/contrib/externalSequenceProducer/sequence_producer.h create mode 100644 third-party/zstd/contrib/linux-kernel/zstd_common_module.c create mode 100644 
third-party/zstd/contrib/pzstd/utils/Portability.h create mode 100644 third-party/zstd/contrib/seekable_format/README.md create mode 100644 third-party/zstd/contrib/seqBench/Makefile create mode 100644 third-party/zstd/contrib/seqBench/seqBench.c create mode 100644 third-party/zstd/doc/decompressor_errata.md create mode 100644 third-party/zstd/lib/common/allocations.h create mode 100644 third-party/zstd/lib/common/bits.h create mode 100644 third-party/zstd/programs/fileio_asyncio.c create mode 100644 third-party/zstd/programs/fileio_asyncio.h create mode 100644 third-party/zstd/programs/fileio_common.h create mode 100644 third-party/zstd/programs/fileio_types.h create mode 100755 third-party/zstd/tests/check_size.py create mode 100644 third-party/zstd/tests/cli-tests/.gitignore create mode 100644 third-party/zstd/tests/cli-tests/README.md create mode 100755 third-party/zstd/tests/cli-tests/basic/help.sh create mode 100644 third-party/zstd/tests/cli-tests/basic/help.sh.stdout.glob create mode 100755 third-party/zstd/tests/cli-tests/basic/memlimit.sh create mode 100644 third-party/zstd/tests/cli-tests/basic/memlimit.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/basic/memlimit.sh.stdout.exact create mode 100755 third-party/zstd/tests/cli-tests/basic/output_dir.sh create mode 100644 third-party/zstd/tests/cli-tests/basic/output_dir.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/basic/output_dir.sh.stdout.exact create mode 100755 third-party/zstd/tests/cli-tests/basic/version.sh create mode 100644 third-party/zstd/tests/cli-tests/basic/version.sh.stdout.glob create mode 100755 third-party/zstd/tests/cli-tests/bin/cmp_size create mode 100755 third-party/zstd/tests/cli-tests/bin/datagen create mode 100755 third-party/zstd/tests/cli-tests/bin/die create mode 100755 third-party/zstd/tests/cli-tests/bin/println create mode 120000 third-party/zstd/tests/cli-tests/bin/unzstd create mode 100755 third-party/zstd/tests/cli-tests/bin/zstd 
create mode 120000 third-party/zstd/tests/cli-tests/bin/zstdcat create mode 100755 third-party/zstd/tests/cli-tests/bin/zstdgrep create mode 100755 third-party/zstd/tests/cli-tests/bin/zstdless create mode 100755 third-party/zstd/tests/cli-tests/cltools/setup create mode 100755 third-party/zstd/tests/cli-tests/cltools/zstdgrep.sh create mode 100644 third-party/zstd/tests/cli-tests/cltools/zstdgrep.sh.exit create mode 100644 third-party/zstd/tests/cli-tests/cltools/zstdgrep.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/cltools/zstdgrep.sh.stdout.glob create mode 100755 third-party/zstd/tests/cli-tests/cltools/zstdless.sh create mode 100644 third-party/zstd/tests/cli-tests/cltools/zstdless.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/cltools/zstdless.sh.stdout.glob create mode 100644 third-party/zstd/tests/cli-tests/common/format.sh create mode 100644 third-party/zstd/tests/cli-tests/common/mtime.sh create mode 100644 third-party/zstd/tests/cli-tests/common/permissions.sh create mode 100644 third-party/zstd/tests/cli-tests/common/platform.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/adapt.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/basic.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/compress-literals.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/format.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/golden.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/gzip-compat.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/levels.sh create mode 100644 third-party/zstd/tests/cli-tests/compression/levels.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/compression/long-distance-matcher.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/multi-threaded.sh create mode 100644 third-party/zstd/tests/cli-tests/compression/multi-threaded.sh.stderr.exact create mode 
100755 third-party/zstd/tests/cli-tests/compression/multiple-files.sh create mode 100644 third-party/zstd/tests/cli-tests/compression/multiple-files.sh.stdout.exact create mode 100755 third-party/zstd/tests/cli-tests/compression/row-match-finder.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/setup create mode 100755 third-party/zstd/tests/cli-tests/compression/stream-size.sh create mode 100755 third-party/zstd/tests/cli-tests/compression/verbose-wlog.sh create mode 100644 third-party/zstd/tests/cli-tests/compression/verbose-wlog.sh.stderr.glob create mode 100644 third-party/zstd/tests/cli-tests/compression/verbose-wlog.sh.stdout.glob create mode 100755 third-party/zstd/tests/cli-tests/compression/window-resize.sh create mode 100644 third-party/zstd/tests/cli-tests/compression/window-resize.sh.stderr.ignore create mode 100644 third-party/zstd/tests/cli-tests/compression/window-resize.sh.stdout.glob create mode 100755 third-party/zstd/tests/cli-tests/decompression/golden.sh create mode 100755 third-party/zstd/tests/cli-tests/decompression/pass-through.sh create mode 100644 third-party/zstd/tests/cli-tests/decompression/pass-through.sh.stderr.exact create mode 100644 third-party/zstd/tests/cli-tests/decompression/pass-through.sh.stdout.exact create mode 100755 third-party/zstd/tests/cli-tests/dict-builder/empty-input.sh create mode 100644 third-party/zstd/tests/cli-tests/dict-builder/empty-input.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/dict-builder/no-inputs.sh create mode 100644 third-party/zstd/tests/cli-tests/dict-builder/no-inputs.sh.exit create mode 100644 third-party/zstd/tests/cli-tests/dict-builder/no-inputs.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/dictionaries/dictionary-mismatch.sh create mode 100644 third-party/zstd/tests/cli-tests/dictionaries/dictionary-mismatch.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/dictionaries/golden.sh create mode 100755 
third-party/zstd/tests/cli-tests/dictionaries/setup create mode 100755 third-party/zstd/tests/cli-tests/dictionaries/setup_once create mode 100755 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-dir-without-write-perm.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-dir-without-write-perm.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-file.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-file.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-stdout.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/compress-file-to-stdout.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/compress-stdin-to-file.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/compress-stdin-to-file.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/compress-stdin-to-stdout.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/compress-stdin-to-stdout.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/decompress-file-to-file.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/decompress-file-to-file.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/decompress-file-to-stdout.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/decompress-file-to-stdout.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/decompress-stdin-to-file.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/decompress-stdin-to-file.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/file-stat/decompress-stdin-to-stdout.sh create mode 100644 third-party/zstd/tests/cli-tests/file-stat/decompress-stdin-to-stdout.sh.stderr.exact create mode 100755 third-party/zstd/tests/cli-tests/progress/no-progress.sh create mode 100644 
third-party/zstd/tests/cli-tests/progress/no-progress.sh.stderr.glob create mode 100755 third-party/zstd/tests/cli-tests/progress/progress.sh create mode 100644 third-party/zstd/tests/cli-tests/progress/progress.sh.stderr.glob create mode 100755 third-party/zstd/tests/cli-tests/run.py create mode 100755 third-party/zstd/tests/cli-tests/zstd-symlinks/setup create mode 100755 third-party/zstd/tests/cli-tests/zstd-symlinks/zstdcat.sh create mode 100644 third-party/zstd/tests/cli-tests/zstd-symlinks/zstdcat.sh.stdout.exact create mode 100644 third-party/zstd/tests/external_matchfinder.c create mode 100644 third-party/zstd/tests/external_matchfinder.h create mode 100644 third-party/zstd/tests/fuzz/fuzz_third_party_seq_prod.h create mode 100644 third-party/zstd/tests/fuzz/seq_prod_fuzz_example/Makefile create mode 100644 third-party/zstd/tests/fuzz/seq_prod_fuzz_example/README.md create mode 100644 third-party/zstd/tests/fuzz/seq_prod_fuzz_example/example_seq_prod.c create mode 100644 third-party/zstd/tests/golden-compression/PR-3517-block-splitter-corruption-test create mode 100644 third-party/zstd/tests/golden-compression/large-literal-and-match-lengths diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8aaed04a..dbae6492 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,9 +51,9 @@ jobs: # Install cross toolchains dpkg --add-architecture i386 ./install-build-deps.sh update - apt-get install -y sudo qemu-user gdb zstd dwarfdump xz-utils {gcc,g++}-10-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha,mips,mipsel}-linux-gnu {gcc,g++}-10-arm-linux-gnueabihf {gcc,g++}-10-{mips64,mips64el}-linux-gnuabi64 + apt-get install -y sudo qemu-user gdb zstd dwarfdump xz-utils {gcc,g++}-10-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu {gcc,g++}-10-arm-linux-gnueabihf - for i in 
{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha,mips,mipsel}-linux-gnu arm-linux-gnueabihf {mips64,mips64el}-linux-gnuabi64; do + for i in {i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu arm-linux-gnueabihf; do ln -sf /usr/bin/$i-gcc-10 /usr/bin/$i-gcc ln -sf /usr/bin/$i-g++-10 /usr/bin/$i-g++ done diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c4f1f9a..0cff71fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,8 +43,8 @@ # of the mold linker of the same version will have the exactly same set of # features and behave exactly the same. -cmake_minimum_required(VERSION 3.13) -project(mold VERSION 2.1.0) +cmake_minimum_required(VERSION 3.14) +project(mold VERSION 2.2.0) include(CMakeDependentOption) include(GNUInstallDirs) @@ -66,6 +66,7 @@ if(NOT "${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC") -fno-asynchronous-unwind-tables -Wno-sign-compare -Wno-unused-function + -Wunused-variable -ggnu-pubnames) endif() @@ -97,7 +98,7 @@ if(MOLD_USE_TSAN) target_link_options(mold PRIVATE -fsanitize=thread) endif() -# Statically-link libstdc++ and libcrypto if -DMOLD_MOSTLY_STATIC=ON. +# Statically-link libstdc++ if -DMOLD_MOSTLY_STATIC=ON. # # This option is intended to be used by `./dist.sh` script to create a # mold binary that works on various Linux distros. You probably don't @@ -105,13 +106,12 @@ endif() option(MOLD_MOSTLY_STATIC "Statically link libstdc++ and libcrypto" OFF) if(MOLD_MOSTLY_STATIC) target_link_options(mold PRIVATE -static-libstdc++) - target_link_libraries(mold PRIVATE libcrypto.a) endif() # Find zlib. If libz.so is not found, we compile a bundled one and # statically-link it to mold. 
find_package(ZLIB QUIET) -if(ZLIB_FOUND) +if(ZLIB_FOUND AND NOT MOLD_MOSTLY_STATIC) target_link_libraries(mold PRIVATE ZLIB::ZLIB) else() add_subdirectory(third-party/zlib EXCLUDE_FROM_ALL) @@ -120,12 +120,23 @@ else() target_link_libraries(mold PRIVATE zlibstatic) endif() -# Find zstd compression library. Just like zlib, if libzstd.so is not -# found, we compile a bundled one and statically-link it to mold. +# Find BLAKE3 cryptographic hash library. Just like zlib, if libblkae3.so +# is not found, we compile a bundled one and statically-link it to mold. +find_package(BLAKE3 QUIET) +if(BLAKE3_FOUND AND NOT MOLD_MOSTLY_STATIC) + target_link_libraries(mold PRIVATE BLAKE3::blake3) +else() + add_subdirectory(third-party/blake3/c EXCLUDE_FROM_ALL) + target_link_libraries(mold PRIVATE blake3) + target_include_directories(mold PUBLIC third-party/blake3/c) +endif() + +# Find zstd compression library. If libzstd.so is not found, we compile a +# bundled one and statically-link it to mold. include(CheckIncludeFile) check_include_file(zstd.h HAVE_ZSTD_H) -if(HAVE_ZSTD_H) +if(HAVE_ZSTD_H AND NOT MOLD_MOSTLY_STATIC) target_link_libraries(mold PRIVATE zstd) else() add_subdirectory(third-party/zstd/build/cmake EXCLUDE_FROM_ALL) @@ -236,11 +247,6 @@ if(NOT APPLE AND NOT WIN32) target_sources(mold-wrapper PRIVATE elf/mold-wrapper.c) endif() -if(NOT APPLE AND NOT WIN32 AND NOT MOLD_MOSTLY_STATIC) - find_package(OpenSSL REQUIRED COMPONENTS Crypto) - target_link_libraries(mold PRIVATE OpenSSL::Crypto) -endif() - # If atomics doesn't work by default, add -latomic. # We need the flag on riscv, armv6 and m68k. include(CheckCXXSourceCompiles) @@ -289,7 +295,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # on a multicore machine. 
list(APPEND MOLD_ELF_TARGETS X86_64 I386 ARM64 ARM32 RV32LE RV32BE RV64LE RV64BE PPC32 PPC64V1 PPC64V2 - S390X SPARC64 M68K SH4 ALPHA MIPS64LE MIPS64BE LOONGARCH32 LOONGARCH64) + S390X SPARC64 M68K SH4 ALPHA LOONGARCH32 LOONGARCH64) list(APPEND MOLD_ELF_TEMPLATE_FILES elf/arch-loongarch.cc @@ -329,10 +335,12 @@ list(APPEND MOLD_MACHO_TEMPLATE_FILES function(mold_instantiate_templates SOURCE TARGET) set(PATH ${CMAKE_BINARY_DIR}/${SOURCE}.${TARGET}.cc) - file(WRITE ${PATH} "#define MOLD_${TARGET} 1 + if(NOT EXISTS ${PATH}) + file(WRITE ${PATH} "#define MOLD_${TARGET} 1 #define MOLD_TARGET ${TARGET} #include \"${CMAKE_SOURCE_DIR}/${SOURCE}\" ") + endif() target_sources(mold PRIVATE ${PATH}) endfunction() @@ -367,7 +375,6 @@ target_sources(mold PRIVATE elf/arch-arm64.cc elf/arch-i386.cc elf/arch-m68k.cc - elf/arch-mips64.cc elf/arch-ppc32.cc elf/arch-ppc64v1.cc elf/arch-ppc64v2.cc diff --git a/README.md b/README.md index 2034ef1f..4469201e 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,5 @@ # mold: A Modern Linker -[![CI](https://github.com/rui314/mold/actions/workflows/ci.yml/badge.svg)](https://github.com/rui314/mold/actions/workflows/ci.yml) -[![build result](https://build.opensuse.org/projects/home:marxin:mold/packages/mold/badge.svg?type=default)](https://build.opensuse.org/package/show/home:marxin:mold/mold) - This repository contains a free version of the mold linker. If you are looking for a commercial version that supports macOS please visit the @@ -74,11 +71,11 @@ necessary packages. You may need to run it as root. git clone https://github.com/rui314/mold.git mkdir mold/build cd mold/build -git checkout v2.1.0 +git checkout v2.2.0 ../install-build-deps.sh cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=c++ .. cmake --build . -j $(nproc) -sudo cmake --install . +sudo cmake --build . --target install ``` You might need to pass a C++20 compiler command name to `cmake`. 
In the diff --git a/common/Dockerfile b/common/Dockerfile index e7c67fcc..83359215 100644 --- a/common/Dockerfile +++ b/common/Dockerfile @@ -11,13 +11,7 @@ RUN apt-get update && \ apt-get update && \ apt-get install -y --no-install-recommends build-essential wget libstdc++-11-dev zlib1g-dev gcc-10 g++-10 python3 && \ \ - mkdir /openssl && cd /openssl && \ - wget -O- -q https://www.openssl.org/source/openssl-3.0.7.tar.gz | tar --strip-components=1 -xzf - && \ - ./Configure --prefix=/usr/local --libdir=lib && \ - make -j$(nproc) && \ - make -j$(nproc) install && \ - ldconfig && \ mkdir /cmake && cd /cmake && \ wget -O- -q https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2.tar.gz | tar --strip-components=1 -xzf - && \ ./bootstrap --parallel=$(nproc) && make -j$(nproc) && make -j$(nproc) install && \ - rm -rf /var/lib/apt/lists/* /cmake /openssl + rm -rf /var/lib/apt/lists/* /cmake diff --git a/common/common.h b/common/common.h index 2642c5ea..8f2b9350 100644 --- a/common/common.h +++ b/common/common.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -298,7 +299,7 @@ inline u64 bits(u64 val, u64 hi, u64 lo) { inline i64 sign_extend(u64 val, i64 size) { return (i64)(val << (63 - size)) >> (63 - size); -}; +} template > void update_minimum(std::atomic &atomic, u64 new_val, Compare cmp = {}) { @@ -478,53 +479,72 @@ class ConcurrentMap { } ~ConcurrentMap() { - if (keys) { - free((void *)keys); - free((void *)key_sizes); - free((void *)values); - } + free(entries); } + // In order to avoid unnecessary cache-line false sharing, we want + // to make this object to be aligned to a reasonably large + // power-of-two address. 
+ struct alignas(32) Entry { + std::atomic key; + T value; + u32 keylen; + }; + void resize(i64 nbuckets) { - this->~ConcurrentMap(); + this->nbuckets = std::max(MIN_NBUCKETS, bit_ceil(nbuckets)); - nbuckets = std::max(MIN_NBUCKETS, bit_ceil(nbuckets)); + i64 sz = sizeof(Entry) * this->nbuckets; + free(entries); - this->nbuckets = nbuckets; - keys = (std::atomic *)calloc(nbuckets, sizeof(char *)); - key_sizes = (u32 *)malloc(nbuckets * sizeof(u32)); - values = (T *)malloc(nbuckets * sizeof(T)); +#if _WIN32 + // Even though std::aligned_alloc is defined in C++17, MSVC doesn't + // seem to provide that function. + entries = (Entry *)_aligned_malloc(sz, alignof(Entry)); +#else + entries = (Entry *)std::aligned_alloc(alignof(Entry), sz); +#endif + + memset(entries, 0, sz); } std::pair insert(std::string_view key, u64 hash, const T &val) { - if (!keys) - return {nullptr, false}; - assert(has_single_bit(nbuckets)); + i64 idx = hash & (nbuckets - 1); i64 retry = 0; while (retry < MAX_RETRY) { - const char *ptr = keys[idx].load(std::memory_order_acquire); - if (ptr == marker) { - pause(); - continue; + Entry &ent = entries[idx]; + const char *ptr = nullptr; + bool claimed = ent.key.compare_exchange_weak(ptr, (char *)-1, + std::memory_order_acquire); + + // If we successfully claimed the ownership of an unused slot, + // copy values to it. + if (claimed) { + new (&ent.value) T(val); + ent.keylen = key.size(); + ent.key.store(key.data(), std::memory_order_release); + return {&ent.value, true}; } - if (ptr == nullptr) { - if (!keys[idx].compare_exchange_weak(ptr, marker, - std::memory_order_acquire)) - continue; - new (values + idx) T(val); - key_sizes[idx] = key.size(); - keys[idx].store(key.data(), std::memory_order_release); - return {values + idx, true}; + // Loop on a spurious failure. + if (ptr == nullptr) + continue; + + // If someone is copying values to the slot, do busy wait. 
+ while (ptr == (char *)-1) { + pause(); + ptr = ent.key.load(std::memory_order_acquire); } - if (key.size() == key_sizes[idx] && - memcmp(ptr, key.data(), key_sizes[idx]) == 0) - return {values + idx, false}; + // If the same key is already present, this is the slot we are + // looking for. + if (key == std::string_view(ptr, ent.keylen)) + return {&ent.value, false}; + // Otherwise, move on to the next slot. u64 mask = nbuckets / NUM_SHARDS - 1; idx = (idx & ~mask) | ((idx + 1) & mask); retry++; @@ -535,16 +555,20 @@ class ConcurrentMap { } const char *get_key(i64 idx) { - return keys[idx].load(std::memory_order_relaxed); + return entries[idx].key.load(std::memory_order_relaxed); + } + + i64 get_idx(T *value) const { + uintptr_t addr = (uintptr_t)value - (uintptr_t)value % sizeof(Entry); + return (Entry *)addr - entries; } static constexpr i64 MIN_NBUCKETS = 2048; static constexpr i64 NUM_SHARDS = 16; static constexpr i64 MAX_RETRY = 128; + Entry *entries = nullptr; i64 nbuckets = 0; - u32 *key_sizes = nullptr; - T *values = nullptr; private: static void pause() { @@ -554,10 +578,6 @@ class ConcurrentMap { asm volatile("yield"); #endif } - -private: - std::atomic *keys = nullptr; - static constexpr const char *marker = "marker"; }; // diff --git a/common/sha.h b/common/sha.h deleted file mode 100644 index c5c72c5b..00000000 --- a/common/sha.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include - -typedef uint8_t u8; -static constexpr int64_t SHA256_SIZE = 32; - -#ifdef _WIN32 -// On Windows, we use Microsoft CNG. 
- -#include -#include -#include - -inline static BCRYPT_ALG_HANDLE get_sha256_handle() { - static std::once_flag once; - static BCRYPT_ALG_HANDLE alg; - - std::call_once(once, [&] { - BCryptOpenAlgorithmProvider(&alg, BCRYPT_SHA256_ALGORITHM, nullptr, 0); - }); - return alg; -} - -inline void sha256_hash(u8 *in, size_t len, u8 *out) { - BCryptHash(get_sha256_handle(), nullptr, 0, in, len, out, SHA256_SIZE); -} - -class SHA256Hash { -public: - SHA256Hash() { - BCryptCreateHash(get_sha256_handle(), &handle, nullptr, 0, nullptr, 0, 0); - } - - void update(u8 *data, size_t len) { - BCryptHashData(handle, data, len, 0); - } - - void finish(u8 *out) { - BCryptFinishHash(handle, out, SHA256_SIZE, 0); - } - -private: - BCRYPT_HASH_HANDLE handle; -}; - -#else -// On Unix, we use OpenSSL or the Apple's OpenSSL-compatible API. - -#ifdef __APPLE__ -# define COMMON_DIGEST_FOR_OPENSSL -# include -# define SHA256(data, len, md) CC_SHA256(data, len, md) -#else -# define OPENSSL_SUPPRESS_DEPRECATED 1 -# include -#endif - -inline void sha256_hash(u8 *in, size_t len, u8 *out) { - SHA256(in, len, out); -} - -class SHA256Hash { -public: - SHA256Hash() { - SHA256_Init(&ctx); - } - - void update(u8 *data, size_t len) { - SHA256_Update(&ctx, data, len); - } - - void finish(u8 *out) { - SHA256_Final(out, &ctx); - } - -private: - SHA256_CTX ctx; -}; - -#endif diff --git a/elf/arch-arm32.cc b/elf/arch-arm32.cc index 845518bd..2a203d4c 100644 --- a/elf/arch-arm32.cc +++ b/elf/arch-arm32.cc @@ -195,14 +195,14 @@ void write_addend(u8 *loc, i64 val, const ElfRel &rel) { template <> void write_plt_header(Context &ctx, u8 *buf) { static const ul32 insn[] = { - 0xe52d'e004, // push {lr} - 0xe59f'e004, // ldr lr, 2f + 0xe52d'e004, // push {lr} + 0xe59f'e004, // ldr lr, 2f 0xe08f'e00e, // 1: add lr, pc, lr - 0xe5be'f008, // ldr pc, [lr, #8]! + 0xe5be'f008, // ldr pc, [lr, #8]! 
0x0000'0000, // 2: .word .got.plt - 1b - 8 - 0xe320'f000, // nop - 0xe320'f000, // nop - 0xe320'f000, // nop + 0xe320'f000, // nop + 0xe320'f000, // nop + 0xe320'f000, // nop }; memcpy(buf, insn, sizeof(insn)); @@ -211,8 +211,8 @@ void write_plt_header(Context &ctx, u8 *buf) { static const ul32 plt_entry[] = { 0xe59f'c004, // 1: ldr ip, 2f - 0xe08c'c00f, // add ip, ip, pc - 0xe59c'f000, // ldr pc, [ip] + 0xe08c'c00f, // add ip, ip, pc + 0xe59c'f000, // ldr pc, [ip] 0x0000'0000, // 2: .word sym@GOT - 1b }; @@ -336,12 +336,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } - // Just like THM_CALL, ARM_CALL relocation refers either BL or + // Just like THM_CALL, ARM_CALL relocation refers to either BL or // BLX instruction. We may need to rewrite BL → BLX or BLX → BL. bool is_bl = ((*(ul32 *)loc & 0xff00'0000) == 0xeb00'0000); bool is_blx = ((*(ul32 *)loc & 0xfe00'0000) == 0xfa00'0000); if (!is_bl && !is_blx) - Fatal(ctx) << *this << ": R_ARM_CALL refers neither BL nor BLX"; + Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX"; u64 val = S + A - P; if (is_jump_reachable(val)) { @@ -364,7 +364,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } - // These relocs refers a B (unconditional branch) instruction. + // These relocs refers to a B (unconditional branch) instruction. // Unlike BL or BLX, we can't rewrite B to BX in place when the // processor mode switch is required because BX doesn't takes an // immediate; it takes only a register. So if mode switch is @@ -467,12 +467,33 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc = S + A - ctx.tp_addr; break; case R_ARM_TLS_GOTDESC: + // ARM32 TLSDESC uses the following code sequence to materialize + // a TP-relative address in r0. + // + // ldr r0, .L2 + // .L1: bl foo + // R_ARM_TLS_CALL + // .L2: .word foo + . 
- .L1 + // R_ARM_TLS_GOTDESC + // + // We may relax the instructions to the following for non-dlopen'd DSO + // + // ldr r0, .L2 + // .L1: ldr r0, [pc, r0] + // ... + // .L2: .word foo(gottpoff) + . - .L1 + // + // or to the following for executable. + // + // ldr r0, .L2 + // .L1: nop + // ... + // .L2: .word foo(tpoff) if (sym.has_tlsdesc(ctx)) { // A is odd if the corresponding TLS_CALL is Thumb. - if (A & 1) - *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 6; - else - *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 4; + *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - ((A & 1) ? 6 : 4); + } else if (sym.has_gottp(ctx)) { + *(ul32 *)loc = sym.get_gottp_addr(ctx) - P + A - ((A & 1) ? 5 : 8); } else { *(ul32 *)loc = S - ctx.tp_addr; } @@ -481,9 +502,10 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { if (sym.has_tlsdesc(ctx)) { // BL *(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2); + } else if (sym.has_gottp(ctx)) { + *(ul32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0] } else { - // BL -> NOP - *(ul32 *)loc = 0xe320'f000; + *(ul32 *)loc = 0xe320'f000; // nop } break; case R_ARM_THM_TLS_CALL: @@ -491,9 +513,13 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 val = align_to(get_tls_trampoline_addr(P + 4), 4); write_thm_b_imm(loc, val); *(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX + } else if (sym.has_gottp(ctx)) { + // Since `ldr r0, [pc, r0]` is not representable in Thumb, + // we use two instructions instead. 
+ *(ul16 *)loc = 0x4478; // add r0, pc + *(ul16 *)(loc + 2) = 0x6800; // ldr r0, [r0] } else { - // BL -> NOP.W - *(ul32 *)loc = 0x8000'f3af; + *(ul32 *)loc = 0x8000'f3af; // nop.w } break; default: @@ -594,9 +620,9 @@ void InputSection::scan_relocations(Context &ctx) { case R_ARM_TLS_IE32: sym.flags |= NEEDS_GOTTP; break; - case R_ARM_TLS_GOTDESC: - if (!relax_tlsdesc(ctx, sym)) - sym.flags |= NEEDS_TLSDESC; + case R_ARM_TLS_CALL: + case R_ARM_THM_TLS_CALL: + scan_tlsdesc(ctx, sym); break; case R_ARM_TLS_LE32: check_tlsle(ctx, sym, rel); @@ -611,9 +637,8 @@ void InputSection::scan_relocations(Context &ctx) { case R_ARM_THM_MOVW_PREL_NC: case R_ARM_THM_MOVW_ABS_NC: case R_ARM_TLS_LDO32: - case R_ARM_TLS_CALL: - case R_ARM_THM_TLS_CALL: case R_ARM_V4BX: + case R_ARM_TLS_GOTDESC: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; @@ -623,8 +648,6 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - // TLS trampoline code. ARM32's TLSDESC is designed so that this // common piece of code is factored out from object files to reduce // output size. Since no one provide, the linker has to synthesize it. @@ -632,6 +655,7 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { 0xe08e'0000, // add r0, lr, r0 0xe590'1004, // ldr r1, [r0, #4] 0xe12f'ff11, // bx r1 + 0xe320'f000, // nop }; // This is a range extension and mode switch thunk. 
@@ -641,27 +665,34 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { 0x78, 0x47, // bx pc # jumps to 1f 0xc0, 0x46, // nop // .arm - 0x04, 0xc0, 0x9f, 0xe5, // 1: ldr ip, 3f - 0x0f, 0xc0, 0x8c, 0xe0, // 2: add ip, ip, pc - 0x1c, 0xff, 0x2f, 0xe1, // bx ip + 0x00, 0xc0, 0x9f, 0xe5, // 1: ldr ip, 3f + 0x0f, 0xf0, 0x8c, 0xe0, // 2: add pc, ip, pc 0x00, 0x00, 0x00, 0x00, // 3: .word sym - 2b }; static_assert(E::thunk_hdr_size == sizeof(hdr)); static_assert(E::thunk_size == sizeof(entry)); + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; memcpy(buf, hdr, sizeof(hdr)); + buf += sizeof(hdr); - for (i64 i = 0; i < symbols.size(); i++) { - u8 *loc = buf + sizeof(hdr) + i * sizeof(entry); - memcpy(loc, entry, sizeof(entry)); + u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr); - u64 S = symbols[i]->get_addr(ctx); - u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr) + i * sizeof(entry); - *(ul32 *)(loc + 16) = S - P - 16; + for (Symbol *sym : symbols) { + memcpy(buf, entry, sizeof(entry)); + *(ul32 *)(buf + 12) = sym->get_addr(ctx) - P - 16; + + buf += sizeof(entry); + P += sizeof(entry); } } +template <> +u64 get_eflags(Context &ctx) { + return EF_ARM_EABI_VER5; +} + // ARM executables use an .ARM.exidx section to look up an exception // handling record for the current instruction pointer. The table needs // to be sorted by their addresses. diff --git a/elf/arch-arm64.cc b/elf/arch-arm64.cc index f2c0ba5a..c0b5ff67 100644 --- a/elf/arch-arm64.cc +++ b/elf/arch-arm64.cc @@ -238,10 +238,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // :lo12: foo` instruction pair to materialize a PC-relative address // in a register can be relaxed to `NOP` followed by `ADR x0, foo` // if foo is in PC ± 1 MiB. 
- if (ctx.arg.relax && i + 1 < rels.size() && - sign_extend(S + A - P - 4, 20) == S + A - P - 4) { + if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && + i + 1 < rels.size()) { + i64 val = S + A - P - 4; const ElfRel &rel2 = rels[i + 1]; - if (rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC && + if (sign_extend(val, 20) == val && + rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC && rel2.r_sym == rel.r_sym && rel2.r_offset == rel.r_offset + 4 && rel2.r_addend == rel.r_addend && @@ -252,7 +254,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { if (reg1 == reg2) { *(ul32 *)loc = 0xd503'201f; // nop *(ul32 *)(loc + 4) = 0x1000'0000 | reg1; // adr - write_adr(loc + 4, S + A - P - 4); + write_adr(loc + 4, val); i++; break; } @@ -369,38 +371,65 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A, 11, 0) << 10; break; case R_AARCH64_TLSDESC_ADR_PAGE21: + // ARM64 TLSDESC uses the following code sequence to materialize + // a TP-relative address in x0. + // + // adrp x0, 0 + // R_AARCH64_TLSDESC_ADR_PAGE21 foo + // ldr x1, [x0] + // R_AARCH64_TLSDESC_LD64_LO12 foo + // add x0, x0, #0 + // R_AARCH64_TLSDESC_ADD_LO12 foo + // blr x1 + // R_AARCH64_TLSDESC_CALL foo + // + // We may relax the instructions to the following for non-dlopen'd DSO + // + // nop + // nop + // adrp x0, :gottprel:foo + // ldr x0, [x0, :gottprel_lo12:foo] + // + // or to the following for executable. 
+ // + // nop + // nop + // movz x0, :tls_offset_hi:foo, lsl #16 + // movk x0, :tls_offset_lo:foo if (sym.has_tlsdesc(ctx)) { i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P); check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); } else { - // adrp x0, 0 -> movz x0, #tls_ofset_hi, lsl #16 - i64 val = (S + A - ctx.tp_addr); - check(val, -(1LL << 32), 1LL << 32); - *(ul32 *)loc = 0xd2a0'0000 | (bits(val, 32, 16) << 5); + *(ul32 *)loc = 0xd503'201f; // nop } break; case R_AARCH64_TLSDESC_LD64_LO12: - if (sym.has_tlsdesc(ctx)) { + if (sym.has_tlsdesc(ctx)) *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10; - } else { - // ldr x2, [x0] -> movk x0, #tls_ofset_lo - u32 offset_lo = (S + A - ctx.tp_addr) & 0xffff; - *(ul32 *)loc = 0xf280'0000 | (offset_lo << 5); - } + else + *(ul32 *)loc = 0xd503'201f; // nop break; case R_AARCH64_TLSDESC_ADD_LO12: if (sym.has_tlsdesc(ctx)) { *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 0) << 10; + } else if (sym.has_gottp(ctx)) { + *(ul32 *)loc = 0x9000'0000; // adrp x0, 0 + write_adrp(loc, page(sym.get_gottp_addr(ctx) + A) - page(P)); } else { - // add x0, x0, #0 -> nop - *(ul32 *)loc = 0xd503'201f; + *(ul32 *)loc = 0xd2a0'0000; // movz x0, 0, lsl #16 + *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 32, 16) << 5; } break; case R_AARCH64_TLSDESC_CALL: - if (!sym.has_tlsdesc(ctx)) { - // blr x2 -> nop - *(ul32 *)loc = 0xd503'201f; + if (sym.has_tlsdesc(ctx)) { + // Do nothing + } else if (sym.has_gottp(ctx)) { + *(ul32 *)loc = 0xf940'0000; // ldr x0, [x0, 0] + *(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10; + } else { + *(ul32 *)loc = 0xf280'0000; // movk x0, 0 + *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5; } break; default: @@ -487,8 +516,8 @@ void InputSection::scan_relocations(Context &ctx) { // symbol's address from GOT. If the GOT value is a link-time // constant, we may be able to rewrite the ADRP+LDR instruction pair // with an ADRP+ADD, eliminating a GOT memory load. 
- if (ctx.arg.relax && sym.is_relative() && !sym.is_imported && - !sym.is_ifunc() && i + 1 < rels.size()) { + if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && + i + 1 < rels.size()) { // ADRP+LDR must be consecutive and use the same register to relax. const ElfRel &rel2 = rels[i + 1]; if (rel2.r_type == R_AARCH64_LD64_GOT_LO12_NC && @@ -531,8 +560,7 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_TLSGD; break; case R_AARCH64_TLSDESC_CALL: - if (!relax_tlsdesc(ctx, sym)) - sym.flags |= NEEDS_TLSDESC; + scan_tlsdesc(ctx, sym); break; case R_AARCH64_TLSLE_MOVW_TPREL_G2: case R_AARCH64_TLSLE_ADD_TPREL_LO12: @@ -575,24 +603,26 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - - static const ul32 data[] = { + static const ul32 insn[] = { 0x9000'0010, // adrp x16, 0 # R_AARCH64_ADR_PREL_PG_HI21 0x9100'0210, // add x16, x16 # R_AARCH64_ADD_ABS_LO12_NC 0xd61f'0200, // br x16 + 0xd503'201f, // nop }; - static_assert(E::thunk_size == sizeof(data)); + static_assert(E::thunk_size == sizeof(insn)); + + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; + u64 P = output_section.shdr.sh_addr + offset; - for (i64 i = 0; i < symbols.size(); i++) { - u64 S = symbols[i]->get_addr(ctx); - u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size; + for (Symbol *sym : symbols) { + u64 S = sym->get_addr(ctx); + memcpy(buf, insn, E::thunk_size); + write_adrp(buf, page(S) - page(P)); + *(ul32 *)(buf + 4) |= bits(S, 11, 0) << 10; - u8 *loc = buf + i * E::thunk_size; - memcpy(loc, data, sizeof(data)); - write_adrp(loc, page(S) - page(P)); - *(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10; + buf += E::thunk_size; + P += E::thunk_size; } } diff --git a/elf/arch-i386.cc b/elf/arch-i386.cc index 793b2ad0..95184e60 100644 --- a/elf/arch-i386.cc +++ b/elf/arch-i386.cc @@ -338,20 +338,16 @@ void 
InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc = sym.get_gottp_addr(ctx) + A; break; case R_386_TLS_GD: - if (sym.has_tlsgd(ctx)) { + if (sym.has_tlsgd(ctx)) *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; - } else { - relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr); - i++; - } + else + relax_gd_to_le(loc, rels[++i], S - ctx.tp_addr); break; case R_386_TLS_LDM: - if (ctx.got->has_tlsld(ctx)) { + if (ctx.got->has_tlsld(ctx)) *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; - } else { - relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin); - i++; - } + else + relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin); break; case R_386_TLS_LDO_32: *(ul32 *)loc = S + A - ctx.dtp_addr; @@ -360,13 +356,32 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul32 *)loc = sym.esym().st_size + A; break; case R_386_TLS_GOTDESC: + // i386 TLSDESC uses the following code sequence to materialize + // a TP-relative address in %eax. + // + // lea 0(%ebx), %eax + // R_386_TLS_GOTDESC foo + // call *(%eax) + // R_386_TLS_DESC_CALL foo + // + // We may relax the instructions to the following for non-dlopen'd DSO + // + // mov foo@GOTTPOFF(%ebx), %eax + // nop + // + // or to the following for executable. 
+ // + // mov $foo@TPOFF, %eax + // nop if (sym.has_tlsdesc(ctx)) { *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT; + } else if (sym.has_gottp(ctx)) { + loc[-2] = 0x8b; + loc[-1] = 0x83; + *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; } else { - static const u8 insn[] = { - 0x8d, 0x05, 0, 0, 0, 0, // lea 0, %eax - }; - memcpy(loc - 2, insn, sizeof(insn)); + loc[-2] = 0x90; + loc[-1] = 0xb8; *(ul32 *)loc = S + A - ctx.tp_addr; } break; @@ -476,6 +491,16 @@ void InputSection::scan_relocations(Context &ctx) { if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; + if (rel.r_type == R_386_TLS_GD || rel.r_type == R_386_TLS_LDM) { + if (i + 1 == rels.size()) + Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOT32"; + + if (u32 ty = rels[i + 1].r_type; + ty != R_386_PLT32 && ty != R_386_PC32 && + ty != R_386_GOT32 && ty != R_386_GOT32X) + Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOT32"; + } + switch (rel.r_type) { case R_386_8: case R_386_16: @@ -493,15 +518,15 @@ void InputSection::scan_relocations(Context &ctx) { case R_386_GOTPC: sym.flags |= NEEDS_GOT; break; - case R_386_GOT32X: { - // We always want to relax GOT32X because static PIE doesn't - // work without it. - bool do_relax = !sym.is_imported && sym.is_relative() && - relax_got32x(loc - 2); - if (!do_relax) + case R_386_GOT32X: + // We always want to relax GOT32X even if --no-relax is given + // because static PIE doesn't work without it. 
+ if (sym.is_pcrel_linktime_const(ctx) && relax_got32x(loc - 2)) { + // Do nothing + } else { sym.flags |= NEEDS_GOT; + } break; - } case R_386_PLT32: if (sym.is_imported) sym.flags |= NEEDS_PLT; @@ -511,31 +536,15 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_GOTTP; break; case R_386_TLS_GD: - if (i + 1 == rels.size()) - Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32"; - - if (u32 ty = rels[i + 1].r_type; - ty != R_386_PLT32 && ty != R_386_PC32 && - ty != R_386_GOT32 && ty != R_386_GOT32X) - Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32"; - // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). - if (ctx.arg.is_static || - (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported)) + if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || + ctx.arg.is_static) i++; else sym.flags |= NEEDS_TLSGD; break; case R_386_TLS_LDM: - if (i + 1 == rels.size()) - Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32"; - - if (u32 ty = rels[i + 1].r_type; - ty != R_386_PLT32 && ty != R_386_PC32 && - ty != R_386_GOT32 && ty != R_386_GOT32X) - Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32"; - // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) @@ -544,8 +553,7 @@ void InputSection::scan_relocations(Context &ctx) { ctx.needs_tlsld = true; break; case R_386_TLS_GOTDESC: - if (!relax_tlsdesc(ctx, sym)) - sym.flags |= NEEDS_TLSDESC; + scan_tlsdesc(ctx, sym); break; case R_386_TLS_LE: check_tlsle(ctx, sym, rel); diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc index b914dbd9..b5bd0878 100644 --- a/elf/arch-loongarch.cc +++ b/elf/arch-loongarch.cc @@ -47,36 +47,40 @@ static u64 hi20(u64 val, u64 pc) { // // This is similar but different from RISC-V because RISC-V's AUIPC // doesn't zero-clear [11:0]. 
- return page(val + 0x800) - page(pc); + return bits(page(val + 0x800) - page(pc), 31, 12); } static u64 hi64(u64 val, u64 pc) { // A PC-relative 64-bit address is materialized with the following // instructions for the large code model: // - // pcalau12i $rX, %pc_hi20(sym) - // addi.d $rY, $zero, %lo12(sym) - // lu32i.d $rY, %pc64_lo20(sym) - // lu52i.d $rY, $r12, %pc64_hi12(sym) - // add.d $rX, $rX, $rY + // pcalau12i $rN, %pc_hi20(sym) + // addi.d $rM, $zero, %lo12(sym) + // lu32i.d $rM, %pc64_lo20(sym) + // lu52i.d $rM, $r12, %pc64_hi12(sym) + // add.d $rN, $rN, $rM // // PCALAU12I computes (pc + imm << 12) to materialize a 64-bit value. // ADDI.D adds a sign-extended 12 bit value to a register. LU32I.D and // LU52I.D simply set bits to [51:31] and to [63:53], respectively. // // Compensating all the sign-extensions is a bit complicated. - u64 x = hi20(val, pc); - if ((val & 0x800) && !(x & 0x8000'0000)) - return x - 0x1'0000'0000; - if (!(val & 0x800) && (x & 0x8000'0000)) - return x + 0x1'0000'0000; - return x; + bool x = val & 0x800; + bool y = (page(val + 0x800) - page(pc)) & 0x8000'0000; + + if (x && !y) + return val - 0x1'0000'0000; + if (!x && y) + return val + 0x1'0000'0000; + return val; } -static void write_j20(u8 *loc, u32 val) { - // opcode, [19:0], rd - *(ul32 *)loc &= 0b1111111'00000000000000000000'11111; - *(ul32 *)loc |= bits(val, 19, 0) << 5; +static u64 higher20(u64 val, u64 pc) { + return bits(hi64(val, pc), 51, 32); +} + +static u64 highest12(u64 val, u64 pc) { + return bits(hi64(val, pc), 63, 52); } static void write_k12(u8 *loc, u32 val) { @@ -85,6 +89,18 @@ static void write_k12(u8 *loc, u32 val) { *(ul32 *)loc |= bits(val, 11, 0) << 10; } +static void write_k16(u8 *loc, u32 val) { + // opcode, [15:0], rj, rd + *(ul32 *)loc &= 0b111111'0000000000000000'11111'11111; + *(ul32 *)loc |= bits(val, 15, 0) << 10; +} + +static void write_j20(u8 *loc, u32 val) { + // opcode, [19:0], rd + *(ul32 *)loc &= 0b1111111'00000000000000000000'11111; + 
*(ul32 *)loc |= bits(val, 19, 0) << 5; +} + static void write_d5k16(u8 *loc, u32 val) { // opcode, [15:0], rj, [20:16] *(ul32 *)loc &= 0b111111'0000000000000000'11111'00000; @@ -99,12 +115,6 @@ static void write_d10k16(u8 *loc, u32 val) { *(ul32 *)loc |= bits(val, 25, 16); } -static void write_k16(u8 *loc, u32 val) { - // opcode, [15:0], rj, rd - *(ul32 *)loc &= 0b111111'0000000000000000'11111'11111; - *(ul32 *)loc |= bits(val, 15, 0) << 10; -} - template <> void write_plt_header(Context &ctx, u8 *buf) { static const ul32 insn_64[] = { @@ -133,7 +143,7 @@ void write_plt_header(Context &ctx, u8 *buf) { u64 plt = ctx.plt->shdr.sh_addr; memcpy(buf, E::is_64 ? insn_64 : insn_32, E::plt_hdr_size); - write_j20(buf, hi20(gotplt, plt) >> 12); + write_j20(buf, hi20(gotplt, plt)); write_k12(buf + 8, gotplt); write_k12(buf + 16, gotplt); } @@ -158,7 +168,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? plt_entry_64 : plt_entry_32, E::plt_size); - write_j20(buf, hi20(gotplt, plt) >> 12); + write_j20(buf, hi20(gotplt, plt)); write_k12(buf + 4, gotplt); } @@ -168,7 +178,7 @@ void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? plt_entry_64 : plt_entry_32, E::plt_size); - write_j20(buf, hi20(got, plt) >> 12); + write_j20(buf, hi20(got, plt)); write_k12(buf + 4, got); } @@ -264,7 +274,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // as if they were TLSGD relocs for LoongArch, which is a clear bug. // We need to handle TLSLD relocs as synonyms for TLSGD relocs for the // sake of bug compatibility. 
- auto get_tls_idx = [&] { + auto get_got_idx = [&] { if (sym.has_tlsgd(ctx)) return sym.get_tlsgd_idx(ctx); return sym.get_got_idx(ctx); @@ -273,7 +283,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; - u64 G = get_tls_idx() * sizeof(Word); + u64 G = get_got_idx() * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { @@ -302,93 +312,87 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_d10k16(loc, val >> 2); break; } - case R_LARCH_ABS_HI20: - write_j20(loc, (S + A) >> 12); - break; case R_LARCH_ABS_LO12: write_k12(loc, S + A); break; + case R_LARCH_ABS_HI20: + write_j20(loc, (S + A) >> 12); + break; case R_LARCH_ABS64_LO20: write_j20(loc, (S + A) >> 32); break; case R_LARCH_ABS64_HI12: write_k12(loc, (S + A) >> 52); break; - case R_LARCH_PCALA_HI20: { - i64 val = hi20(S + A, P); - check(val, -(1LL << 31), 1LL << 31); - write_j20(loc, val >> 12); - break; - } case R_LARCH_PCALA_LO12: write_k12(loc, S + A); break; + case R_LARCH_PCALA_HI20: + check(S + A - P, -(1LL << 31), 1LL << 31); + write_j20(loc, hi20(S + A, P)); + break; case R_LARCH_PCALA64_LO20: - write_j20(loc, hi64(S + A, P) >> 32); + write_j20(loc, higher20(S + A, P)); break; case R_LARCH_PCALA64_HI12: - write_k12(loc, hi64(S + A, P) >> 52); - break; - case R_LARCH_GOT_PC_HI20: { - i64 val = hi20(GOT + G + A, P); - check(val, -(1LL << 31), 1LL << 31); - write_j20(loc, val >> 12); + write_k12(loc, highest12(S + A, P)); break; - } case R_LARCH_GOT_PC_LO12: write_k12(loc, GOT + G + A); break; + case R_LARCH_GOT_PC_HI20: + check(GOT + G + A - P, -(1LL << 31), 1LL << 31); + write_j20(loc, hi20(GOT + G + A, P)); + break; case R_LARCH_GOT64_PC_LO20: - write_j20(loc, hi64(GOT + G + A, P) >> 32); + write_j20(loc, higher20(GOT + G + A, P)); break; case R_LARCH_GOT64_PC_HI12: - write_k12(loc, hi64(GOT + G + A, P) >> 52); - break; - case R_LARCH_GOT_HI20: - write_j20(loc, (GOT 
+ G + A) >> 12); + write_k12(loc, highest12(GOT + G + A, P)); break; case R_LARCH_GOT_LO12: write_k12(loc, GOT + G + A); break; + case R_LARCH_GOT_HI20: + write_j20(loc, (GOT + G + A) >> 12); + break; case R_LARCH_GOT64_LO20: write_j20(loc, (GOT + G + A) >> 32); break; case R_LARCH_GOT64_HI12: write_k12(loc, (GOT + G + A) >> 52); break; - case R_LARCH_TLS_LE_HI20: - write_j20(loc, (S + A - ctx.tp_addr) >> 12); - break; case R_LARCH_TLS_LE_LO12: write_k12(loc, S + A - ctx.tp_addr); break; + case R_LARCH_TLS_LE_HI20: + write_j20(loc, (S + A - ctx.tp_addr) >> 12); + break; case R_LARCH_TLS_LE64_LO20: write_j20(loc, (S + A - ctx.tp_addr) >> 32); break; case R_LARCH_TLS_LE64_HI12: write_k12(loc, (S + A - ctx.tp_addr) >> 52); break; - case R_LARCH_TLS_IE_PC_HI20: { - i64 val = hi20(sym.get_gottp_addr(ctx) + A, P); - check(val, -(1LL << 31), 1LL << 31); - write_j20(loc, val >> 12); - break; - } case R_LARCH_TLS_IE_PC_LO12: write_k12(loc, sym.get_gottp_addr(ctx) + A); break; + case R_LARCH_TLS_IE_PC_HI20: + check(sym.get_gottp_addr(ctx) + A - P, -(1LL << 31), 1LL << 31); + write_j20(loc, hi20(sym.get_gottp_addr(ctx) + A, P)); + break; case R_LARCH_TLS_IE64_PC_LO20: - write_j20(loc, hi64(sym.get_gottp_addr(ctx) + A, P) >> 32); + write_j20(loc, higher20(sym.get_gottp_addr(ctx) + A, P)); break; case R_LARCH_TLS_IE64_PC_HI12: - write_k12(loc, hi64(sym.get_gottp_addr(ctx) + A, P) >> 52); - break; - case R_LARCH_TLS_IE_HI20: - write_j20(loc, (sym.get_gottp_addr(ctx) + A) >> 12); + write_k12(loc, highest12(sym.get_gottp_addr(ctx) + A, P)); break; case R_LARCH_TLS_IE_LO12: write_k12(loc, sym.get_gottp_addr(ctx) + A); break; + case R_LARCH_TLS_IE_HI20: + write_j20(loc, (sym.get_gottp_addr(ctx) + A) >> 12); + break; case R_LARCH_TLS_IE64_LO20: write_j20(loc, (sym.get_gottp_addr(ctx) + A) >> 32); break; @@ -396,12 +400,10 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_k12(loc, (sym.get_gottp_addr(ctx) + A) >> 52); break; case R_LARCH_TLS_LD_PC_HI20: - case 
R_LARCH_TLS_GD_PC_HI20: { - i64 val = hi20(sym.get_tlsgd_addr(ctx) + A, P); - check(val, -(1LL << 31), 1LL << 31); - write_j20(loc, val >> 12); + case R_LARCH_TLS_GD_PC_HI20: + check(sym.get_tlsgd_addr(ctx) + A - P, -(1LL << 31), 1LL << 31); + write_j20(loc, hi20(sym.get_tlsgd_addr(ctx) + A, P)); break; - } case R_LARCH_TLS_LD_HI20: case R_LARCH_TLS_GD_HI20: write_j20(loc, (sym.get_tlsgd_addr(ctx) + A) >> 12); @@ -655,8 +657,6 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - static const ul32 insn[] = { 0x1a00'000c, // pcalau12i $t0, 0 0x02c0'018c, // addi.d $t0, $t0, 0 @@ -666,14 +666,18 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { static_assert(E::thunk_size == sizeof(insn)); - for (i64 i = 0; i < symbols.size(); i++) { - u64 S = symbols[i]->get_addr(ctx); - u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size; + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; + u64 P = output_section.shdr.sh_addr + offset; + + for (Symbol *sym : symbols) { + u64 S = sym->get_addr(ctx); + + memcpy(buf, insn, sizeof(insn)); + write_j20(buf, hi20(S, P)); + write_k12(buf + 4, S); - u8 *loc = buf + i * E::thunk_size; - memcpy(loc, insn, sizeof(insn)); - write_j20(loc, hi20(S, P) >> 12); - write_k12(loc + 4, S); + buf += sizeof(insn); + P += sizeof(insn); } } diff --git a/elf/arch-mips64.cc b/elf/arch-mips64.cc deleted file mode 100644 index 079bc5ba..00000000 --- a/elf/arch-mips64.cc +++ /dev/null @@ -1,673 +0,0 @@ -// MIPS is a RISC ISA developed in the '80s. The processor was once fairly -// popular; for examples, Silicon Graphics workstations and Nintendo 64 -// game consoles are based on the processor. Even though it's no longer a -// popular choice when creating a new system, there are still many uses of -// the ISA especially in the network router segment. 
-// -// The MIPS psABIs are in a sad state due to the lack of ownership of the -// ABI. The last major Unix vendor in the MIPS market was Silicon -// Graphics, which effectively ceased its MIPS-based Unix workstation -// business in the '90s. Even at the time the MIPS ABIs looked peculiar. -// After that, various small vendors used MIPS to create appliances and -// notably routers, but no one tried to modernize or improve the ABIs. As -// a result, the MIPS ABIs left as probably the most diverged ABI compared -// to the other psABIs. -// -// Specifically, the MIPS ABIs has the following issues: -// -// 1. Since the ISA does not support PC-relative addressing, each function -// first materializes the address of GOT + 0x7ff0 in the GP register -// and access GOT entries relative to the GP's value. This GP-relative -// access is usually done with a single load instruction with a 16-bit -// offset. That means only GP ± 32 KiB is addressable. If GOT is larger -// than that, the linker is expected to create a GOT section for each -// input file and associate a different GP value for each GOT. This -// method is called "multi-GOT". Multi-GOT is not necessary for other -// ABIs because other processors either simply support PC-relative -// addressing or use two instructions to access GOT entries. -// -// 2. The MIPS ABIs require .dynsym entries to be sorted in a very -// specific manner to represent some dynamic relocations implicitly -// rather than explicitly in the .rela.dyn section. This feature is -// called "Quickstart" in the MIPS documentation. -// -// 3. Unlike other psABIs, a MIPS relocation record can have up to three -// types -- that is, each record has not only r_type but also r_type2 -// and r_type3. A relocated value is computed as the combination of all -// the relocation types. -// -// In our MIPS support, we prioritize simplicity of implementation over -// marginal runtime efficiency. 
Specifically, we made the following -// decisions for simplification: -// -// 1. We do not sort .dynsym entries. Quickstart still kicks in at the -// load-time (there's no way to tell the loader to disable Quickstart), -// and the loader writes resolved addresses to our placeholder section -// `.mips_quickstart`. We just ignore these relocated values. -// -// 2. Instead of supporting arbitrary combinations of relocation types, we -// support only a limited set of them. This works because, in practice, -// the compiler emits only a limted set of relocation types. - -#include "mold.h" - -namespace mold::elf { - -static constexpr i64 BIAS = 0x8000; - -// We don't support lazy symbol resolution for MIPS. All dynamic symbols -// are resolved eagerly on process startup. -template -void write_plt_header(Context &ctx, u8 *buf) {} - -template -void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) {} - -template -void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) {} - -template -void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, - u64 offset, u64 val) { - u8 *loc = ctx.buf + this->shdr.sh_offset + offset; - - switch (rel.r_type) { - case R_NONE: - break; - case R_MIPS_64: - // We relocate R_MIPS_64 in .eh_frame as a relative relocation. - // See the comment for mips_rewrite_cie() below. - *(U64 *)loc = val - this->shdr.sh_addr - offset; - break; - default: - Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; - } -} - -template -void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { - std::span> rels = get_rels(ctx); - - ElfRel *dynrel = nullptr; - if (ctx.reldyn) - dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - file.reldyn_offset + this->reldyn_offset); - - // 0x7ff0 is added to maximize the GP-relative addressable range - // for load/store instructions with a signed 16-bit displacement. 
- u64 GP = file.extra.got->shdr.sh_addr + 0x7ff0; - - u64 GP0 = file.extra.gp0; - MipsGotSection *got = file.extra.got; - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - u8 *loc = base + rel.r_offset; - - auto check = [&](i64 val, i64 lo, i64 hi) { - if (val < lo || hi <= val) - Error(ctx) << *this << ": relocation " << rel << " against " - << sym << " out of range: " << val << " is not in [" - << lo << ", " << hi << ")"; - }; - - auto write_hi16 = [&](u64 val) { - check(val, -(1LL << 31), 1LL << 31); - *(U32 *)loc |= ((val + BIAS) >> 16) & 0xffff; - }; - - auto write_lo16 = [&](u64 val) { - check(val, -(1 << 15), 1 << 15); - *(U32 *)loc |= val & 0xffff; - }; - - auto write_lo16_nc = [&](u64 val) { - *(U32 *)loc |= val & 0xffff; - }; - - u64 S = sym.get_addr(ctx); - u64 A = rel.r_addend; - u64 P = get_addr() + rel.r_offset; - - switch (rel.r_type) { - case R_MIPS_64: - apply_toc_rel(ctx, sym, rel, loc, S, A, P, &dynrel); - break; - case R_MIPS_GPREL16 | (R_MIPS_SUB << 8) | (R_MIPS_HI16 << 16): { - u64 val = sym.is_local(ctx) ? (S + A + GP0 - GP) : (S + A - GP); - write_hi16(-val); - break; - } - case R_MIPS_GPREL16 | (R_MIPS_SUB << 8) | (R_MIPS_LO16 << 16): { - u64 val = sym.is_local(ctx) ? 
(S + A + GP0 - GP) : (S + A - GP); - write_lo16_nc(-val); - break; - } - case R_MIPS_GPREL32 | (R_MIPS_64 << 8): - *(U64 *)loc = S + A + GP0 - GP; - break; - case R_MIPS_GOT_DISP: - case R_MIPS_CALL16: - case R_MIPS_CALL_LO16: - case R_MIPS_GOT_LO16: - write_lo16(got->get_got_addr(ctx, sym, A) - GP); - break; - case R_MIPS_CALL_HI16: - case R_MIPS_GOT_HI16: - write_hi16(got->get_got_addr(ctx, sym, A) - GP); - break; - case R_MIPS_GOT_PAGE: - write_lo16(got->get_gotpage_addr(ctx, sym, A) - GP); - break; - case R_MIPS_GOT_OFST: - write_lo16(0); - break; - case R_MIPS_JALR: - break; - case R_MIPS_TLS_TPREL_HI16: - write_hi16(S + A - ctx.tp_addr); - break; - case R_MIPS_TLS_TPREL_LO16: - write_lo16_nc(S + A - ctx.tp_addr); - break; - case R_MIPS_TLS_GOTTPREL: - write_lo16(got->get_gottp_addr(ctx, sym) - GP); - break; - case R_MIPS_TLS_DTPREL_HI16: - write_hi16(S + A - ctx.dtp_addr); - break; - case R_MIPS_TLS_DTPREL_LO16: - write_lo16_nc(S + A - ctx.dtp_addr); - break; - case R_MIPS_TLS_GD: - write_lo16(got->get_tlsgd_addr(ctx, sym) - GP); - break; - case R_MIPS_TLS_LDM: - write_lo16(got->get_tlsld_addr(ctx) - GP); - break; - default: - unreachable(); - } - } -} - -template -void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { - std::span> rels = get_rels(ctx); - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - u8 *loc = base + rel.r_offset; - - SectionFragment *frag; - i64 frag_addend; - std::tie(frag, frag_addend) = get_fragment(ctx, rel); - - u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); - u64 A = frag ? 
frag_addend : get_addend(loc, rel); - - switch (rel.r_type) { - case R_MIPS_64: - if (std::optional val = get_tombstone(sym, frag)) - *(U64 *)loc = *val; - else - *(U64 *)loc = S + A; - break; - case R_MIPS_32: - *(U32 *)loc = S + A; - break; - default: - Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " - << rel; - } - } -} - -template -void InputSection::scan_relocations(Context &ctx) { - assert(shdr().sh_flags & SHF_ALLOC); - - this->reldyn_offset = file.num_dynrel * sizeof(ElfRel); - std::span> rels = get_rels(ctx); - MipsGotSection *got = file.extra.got; - - for (i64 i = 0; i < rels.size(); i++) { - const ElfRel &rel = rels[i]; - if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) - continue; - - Symbol &sym = *file.symbols[rel.r_sym]; - - switch (rel.r_type) { - case R_MIPS_64: - scan_toc_rel(ctx, sym, rel); - break; - case R_MIPS_GOT_DISP: - case R_MIPS_CALL16: - case R_MIPS_CALL_HI16: - case R_MIPS_CALL_LO16: - case R_MIPS_GOT_HI16: - case R_MIPS_GOT_LO16: - got->got_syms.push_back({&sym, rel.r_addend}); - break; - case R_MIPS_GOT_PAGE: - case R_MIPS_GOT_OFST: - got->gotpage_syms.push_back({&sym, rel.r_addend}); - break; - case R_MIPS_TLS_GOTTPREL: - assert(rel.r_addend == 0); - got->gottp_syms.push_back(&sym); - break; - case R_MIPS_TLS_TPREL_HI16: - case R_MIPS_TLS_TPREL_LO16: - check_tlsle(ctx, sym, rel); - break; - case R_MIPS_TLS_GD: - assert(rel.r_addend == 0); - got->tlsgd_syms.push_back(&sym); - break; - case R_MIPS_TLS_LDM: - assert(rel.r_addend == 0); - got->has_tlsld = true; - break; - case R_MIPS_GPREL16 | (R_MIPS_SUB << 8) | (R_MIPS_HI16 << 16): - case R_MIPS_GPREL16 | (R_MIPS_SUB << 8) | (R_MIPS_LO16 << 16): - case R_MIPS_GPREL32 | (R_MIPS_64 << 8): - case R_MIPS_JALR: - case R_MIPS_TLS_DTPREL_HI16: - case R_MIPS_TLS_DTPREL_LO16: - break; - default: - Error(ctx) << *this << ": unknown relocation: " << rel; - } - } -} - -template -bool MipsGotSection::SymbolAddend::operator<(const SymbolAddend &other) const { - 
return std::tuple(sym->file->priority, sym->sym_idx, addend) < - std::tuple(other.sym->file->priority, other.sym->sym_idx, other.addend); -}; - -template -static bool compare(const Symbol *a, const Symbol *b) { - return std::tuple(a->file->priority, a->sym_idx) < - std::tuple(b->file->priority, b->sym_idx); -}; - -template -u64 MipsGotSection::SymbolAddend::get_addr(Context &ctx, i64 flags) const { - return sym->get_addr(ctx, flags) + addend; -} - -template -static inline i64 get_gotpage_offset(const MipsGotSection &got) { - return got.got_syms.size(); -} - -template -static inline i64 get_tlsgd_offset(const MipsGotSection &got) { - return get_gotpage_offset(got) + got.gotpage_syms.size(); -} - -template -static inline i64 get_gottp_offset(const MipsGotSection &got) { - return get_tlsgd_offset(got) + got.tlsgd_syms.size() * 2; -} - -template -static inline i64 get_tlsld_offset(const MipsGotSection &got) { - return get_gottp_offset(got) + got.gottp_syms.size(); -} - -template -static inline i64 get_num_got_entries(const MipsGotSection &got) { - return get_tlsld_offset(got) + got.has_tlsld * 2; -} - -template -u64 MipsGotSection::get_got_addr(Context &ctx, Symbol &sym, - i64 addend) const { - auto it = std::lower_bound(got_syms.begin(), got_syms.end(), - SymbolAddend{&sym, addend}); - i64 idx = it - got_syms.begin(); - return this->shdr.sh_addr + idx * sizeof(Word); -} - -template -u64 MipsGotSection::get_gotpage_addr(Context &ctx, Symbol &sym, - i64 addend) const { - auto it = std::lower_bound(gotpage_syms.begin(), gotpage_syms.end(), - SymbolAddend{&sym, addend}); - i64 idx = get_gotpage_offset(*this) + (it - gotpage_syms.begin()); - return this->shdr.sh_addr + idx * sizeof(Word); -} - -template -u64 MipsGotSection::get_tlsgd_addr(Context &ctx, Symbol &sym) const { - auto it = std::lower_bound(tlsgd_syms.begin(), tlsgd_syms.end(), - &sym, compare); - i64 idx = get_tlsgd_offset(*this) + (it - tlsgd_syms.begin()) * 2; - return this->shdr.sh_addr + idx * sizeof(Word); 
-} - -template -u64 MipsGotSection::get_gottp_addr(Context &ctx, Symbol &sym) const { - auto it = std::lower_bound(gottp_syms.begin(), gottp_syms.end(), - &sym, compare); - i64 idx = get_gottp_offset(*this) + (it - gottp_syms.begin()); - return this->shdr.sh_addr + idx * sizeof(Word); -} - -template -u64 MipsGotSection::get_tlsld_addr(Context &ctx) const { - assert(has_tlsld); - return this->shdr.sh_addr + get_tlsld_offset(*this) * sizeof(Word); -} - -namespace { -template -struct GotEntry { - u64 val = 0; - i64 r_type = R_NONE; - Symbol *sym = nullptr; -}; -} - -template -std::vector> -get_got_entries(Context &ctx, const MipsGotSection &got) { - using SymbolAddend = typename MipsGotSection::SymbolAddend; - - std::vector> entries; - auto add = [&](GotEntry ent) { entries.push_back(ent); }; - - // Create GOT entries for ordinary symbols - for (const SymbolAddend &ent : got.got_syms) { - if (ent.sym->is_imported) { - // If a symbol is imported, let the dynamic linker to resolve it. - add({0, E::R_DYNAMIC, ent.sym}); - } else if (ctx.arg.pic && ent.sym->is_relative()) { - // If we know an address at link-time, fill that GOT entry now. - // It may need a base relocation, though. - add({ent.get_addr(ctx, NO_PLT), E::R_RELATIVE}); - } else { - add({ent.get_addr(ctx, NO_PLT)}); - } - } - - // Create GOT entries for GOT_PAGE and GOT_OFST relocs - for (const SymbolAddend &ent : got.gotpage_syms) { - if (ctx.arg.pic && ent.sym->is_relative()) - add({ent.get_addr(ctx), E::R_RELATIVE}); - else - add({ent.get_addr(ctx)}); - } - - // Create GOT entries for TLVs. - for (Symbol *sym : got.tlsgd_syms) { - if (sym->is_imported) { - // If a symbol is imported, let the dynamic linker to resolve it. - add({0, E::R_DTPMOD, sym}); - add({0, E::R_DTPOFF, sym}); - } else if (ctx.arg.shared) { - // If we are creating a shared library, we know the TLV's offset - // within the current TLS block. We don't know the module ID though. 
- add({0, E::R_DTPMOD}); - add({sym->get_addr(ctx) - ctx.dtp_addr}); - } else { - // If we are creating an executable, we know both the module ID and - // the offset. Module ID 1 indicates the main executable. - add({1}); - add({sym->get_addr(ctx) - ctx.dtp_addr}); - } - } - - for (Symbol *sym : got.gottp_syms) { - if (sym->is_imported) { - // If we know nothing about the symbol, let the dynamic linker - // to fill the GOT entry. - add({0, E::R_TPOFF, sym}); - } else if (ctx.arg.shared) { - // If we know the offset within the current thread vector, - // let the dynamic linker to adjust it. - add({sym->get_addr(ctx) - ctx.tls_begin, E::R_TPOFF}); - } else { - // Otherwise, we know the offset from the thread pointer (TP) at - // link-time, so we can fill the GOT entry directly. - add({sym->get_addr(ctx) - ctx.tp_addr}); - } - } - - if (got.has_tlsld) { - if (ctx.arg.shared) - add({0, E::R_DTPMOD}); - else - add({1}); // 1 means the main executable - add({0}); - } - - return entries; -} - -template -void MipsGotSection::update_shdr(Context &ctx) { - // Finalize got_syms - sort(got_syms); - remove_duplicates(got_syms); - - // Finalize gotpage_syms - sort(gotpage_syms); - remove_duplicates(gotpage_syms); - - // Finalize tlsgd_syms - sort(tlsgd_syms, compare); - remove_duplicates(tlsgd_syms); - - // Finalize gottp_syms - sort(gottp_syms, compare); - remove_duplicates(gottp_syms); - - this->shdr.sh_size = get_num_got_entries(*this) * sizeof(Word); -} - -template -i64 MipsGotSection::get_reldyn_size(Context &ctx) const { - i64 n = 0; - for (GotEntry &ent : get_got_entries(ctx, *this)) - if (ent.r_type != R_NONE) - n++; - return n; -} - -template -void MipsGotSection::copy_buf(Context &ctx) { - U64 *buf = (U64 *)(ctx.buf + this->shdr.sh_offset); - memset(buf, 0, this->shdr.sh_size); - - ElfRel *dynrel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + - this->reldyn_offset); - - for (i64 i = 0; GotEntry &ent : get_got_entries(ctx, *this)) { - if (ent.r_type != R_NONE) - 
*dynrel++ = ElfRel(this->shdr.sh_addr + i * sizeof(Word), - ent.r_type, - ent.sym ? ent.sym->get_dynsym_idx(ctx) : 0, - ent.val); - buf[i++] = ent.val; - } -} - -template -void MipsQuickstartSection::update_shdr(Context &ctx) { - this->shdr.sh_size = (NUM_RESERVED + ctx.dynsym->symbols.size()) * - sizeof(Word); -} - -template -void MipsQuickstartSection::copy_buf(Context &ctx) { - U64 *buf = (U64 *)(ctx.buf + this->shdr.sh_offset); - memset(buf, 0, this->shdr.sh_size); - - // It is not clear how the runtime uses it, but all MIPS binaries - // have this value in GOT[1]. - buf[1] = E::is_64 ? 0x8000'0000'0000'0000 : 0x8000'0000; - - for (i64 i = 0; i < ctx.dynsym->symbols.size(); i++) - if (Symbol *sym = ctx.dynsym->symbols[i]) - if (!sym->file->is_dso && !sym->esym().is_undef()) - buf[i + NUM_RESERVED] = sym->get_addr(ctx, NO_PLT); -} - -template -void MipsABIFlagsSection::update_shdr(Context &ctx) { - for (ObjectFile *file : ctx.objs) { - if (file->extra.abi_flags) { - contents = file->extra.abi_flags->contents; - break; - } - } - - this->shdr.sh_size = contents.size(); -} - -// .MIPS.abiflags section contains ABI info such as ISA level. -// We need to merge input .MIPS.abiflags sections into a single -// .MIPS.abiflags section. But for now, we just pick the first one. -template -void MipsABIFlagsSection::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + this->shdr.sh_offset; - memcpy(buf, contents.data(), contents.size()); -} - -// We merge consective .mips_got sections to reduce the total number of -// .mips_got entries. Note that each .mips_got should be equal or smaller -// than 64 KiB so that all of its entries are within its GP ± 32 KiB. 
-template -void mips_merge_got_sections(Context &ctx) { - for (ObjectFile *file : ctx.objs) - file->extra.got->update_shdr(ctx); - - for (i64 i = 0; i < ctx.objs.size(); i++) { - MipsGotSection &got = *ctx.objs[i]->extra.got; - - for (i++; i < ctx.objs.size(); i++) { - MipsGotSection &got2 = *ctx.objs[i]->extra.got; - if (got.shdr.sh_size + got2.shdr.sh_size >= 65536) - break; - - append(got.got_syms, got2.got_syms); - append(got.gotpage_syms, got2.gotpage_syms); - append(got.tlsgd_syms, got2.tlsgd_syms); - append(got.gottp_syms, got2.gottp_syms); - got.has_tlsld = got.has_tlsld || got2.has_tlsld; - - got2.got_syms.clear(); - got2.gotpage_syms.clear(); - got2.tlsgd_syms.clear(); - got2.gottp_syms.clear(); - got2.has_tlsld = false; - - got.update_shdr(ctx); - ctx.objs[i]->extra.got = &got; - } - } -} - -// MIPS .eh_frame contains absolute addresses (i.e. R_MIPS_64 relocations) -// even if compiled with -fPIC. Instead of emitting base relocations, we -// rewrite CIEs so that we can write relative addresse instead of absolute -// ones to .eh_frame. 
-template -void mips_rewrite_cie(Context &ctx, u8 *buf, CieRecord &cie) { - u8 *aug = buf + 9; // Skip Length, CIE ID and Version fields - if (*aug != 'z') - return; - aug++; - - // Skip Augmentation String - u8 *p = aug + strlen((char *)aug) + 1; - - read_uleb(&p); // Skip Code Alignment Factor - read_uleb(&p); // Skip Data Alignment Factor - p++; // Skip Return Address Register - read_uleb(&p); // Skip Augmentation Data Length - - auto rewrite = [&](u8 *ptr) { - i64 sz; - - switch (*ptr & 0xf) { - case DW_EH_PE_absptr: - sz = sizeof(Word); - break; - case DW_EH_PE_udata4: - case DW_EH_PE_sdata4: - sz = 4; - break; - case DW_EH_PE_udata8: - case DW_EH_PE_sdata8: - sz = 8; - break; - default: - Fatal(ctx) << cie.input_section << ": unknown pointer size"; - } - - if ((*ptr & 0x70) == DW_EH_PE_absptr) { - if (sz == 4) - *ptr = (*ptr & 0x80) | DW_EH_PE_pcrel | DW_EH_PE_sdata4; - else - *ptr = (*ptr & 0x80) | DW_EH_PE_pcrel | DW_EH_PE_sdata8; - } - return sz; - }; - - // Now p points to the beginning of Augmentation Data - for (; *aug; aug++) { - switch (*aug) { - case 'L': - case 'R': - rewrite(p); - p++; - break; - case 'P': - p += rewrite(p) + 1; - break; - case 'S': - case 'B': - break; - default: - Error(ctx) << cie.input_section - << ": unknown argumentation string in CIE: '" - << (char)*aug << "'"; - } - } -} - -#define INSTANTIATE(E) \ - template void write_plt_header(Context &, u8 *); \ - template void write_plt_entry(Context &, u8 *, Symbol &); \ - template void write_pltgot_entry(Context &, u8 *, Symbol &); \ - template void EhFrameSection:: \ - apply_eh_reloc(Context &, const ElfRel &, u64, u64); \ - template void InputSection::apply_reloc_alloc(Context &, u8 *); \ - template void InputSection::apply_reloc_nonalloc(Context &, u8 *); \ - template void InputSection::scan_relocations(Context &); \ - template class MipsGotSection; \ - template class MipsQuickstartSection; \ - template class MipsABIFlagsSection; \ - template void mips_merge_got_sections(Context 
&); \ - template void mips_rewrite_cie(Context &, u8 *, CieRecord &); - - -INSTANTIATE(MIPS64LE); -INSTANTIATE(MIPS64BE); - -} // namespace mold::elf diff --git a/elf/arch-ppc32.cc b/elf/arch-ppc32.cc index ee266e0b..d75656ad 100644 --- a/elf/arch-ppc32.cc +++ b/elf/arch-ppc32.cc @@ -409,8 +409,6 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - static const ub32 local_thunk[] = { // Get this thunk's address 0x7c08'02a6, // mflr r0 @@ -429,22 +427,26 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { static_assert(E::thunk_size == sizeof(plt_entry)); static_assert(E::thunk_size == sizeof(local_thunk)); - for (i64 i = 0; i < symbols.size(); i++) { - ub32 *loc = (ub32 *)(buf + i * E::thunk_size); - Symbol &sym = *symbols[i]; - - if (sym.has_plt(ctx)) { - memcpy(loc, plt_entry, sizeof(plt_entry)); - u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx); - i64 val = got - get_addr(i) - 8; - loc[4] |= higha(val); - loc[5] |= lo(val); + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; + u64 P = output_section.shdr.sh_addr + offset; + + for (Symbol *sym : symbols) { + if (sym->has_plt(ctx)) { + u64 got = + sym->has_got(ctx) ? 
sym->get_got_addr(ctx) : sym->get_gotplt_addr(ctx); + i64 val = got - P - 8; + memcpy(buf, plt_entry, sizeof(plt_entry)); + *(ub32 *)(buf + 16) |= higha(val); + *(ub32 *)(buf + 20) |= lo(val); } else { - memcpy(loc, local_thunk, sizeof(local_thunk)); - i64 val = sym.get_addr(ctx) - get_addr(i) - 8; - loc[4] |= higha(val); - loc[5] |= lo(val); + i64 val = sym->get_addr(ctx) - P - 8; + memcpy(buf, local_thunk, sizeof(local_thunk)); + *(ub32 *)(buf + 16) |= higha(val); + *(ub32 *)(buf + 20) |= lo(val); } + + buf += E::thunk_size; + P += E::thunk_size; } } diff --git a/elf/arch-ppc64v1.cc b/elf/arch-ppc64v1.cc index 44d572c7..dc7a37a7 100644 --- a/elf/arch-ppc64v1.cc +++ b/elf/arch-ppc64v1.cc @@ -72,22 +72,22 @@ void write_plt_header(Context &ctx, u8 *buf) { 0x7d88'02a6, // mflr r12 0x429f'0005, // bcl 20, 31, 4 // obtain PC 0x7d68'02a6, // mflr r11 - 0xe84b'0024, // ld r2,36(r11) 0x7d88'03a6, // mtlr r12 - 0x7d62'5a14, // add r11,r2,r11 + 0x3d6b'0000, // addis r11, r11, GOTPLT_OFFSET@ha + 0x396b'0000, // addi r11, r11, GOTPLT_OFFSET@lo 0xe98b'0000, // ld r12,0(r11) 0xe84b'0008, // ld r2,8(r11) 0x7d89'03a6, // mtctr r12 0xe96b'0010, // ld r11,16(r11) 0x4e80'0420, // bctr - // .quad .got.plt - .plt - 8 - 0x0000'0000, - 0x0000'0000, }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); - *(ub64 *)(buf + 44) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; + + i64 val = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; + *(ub32 *)(buf + 16) |= higha(val); + *(ub32 *)(buf + 20) |= lo(val); } template <> @@ -410,8 +410,6 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - // If the destination is .plt.got, we save the current r2, read an // address of a function descriptor from .got, restore %r2 and jump // to the function. 
@@ -466,26 +464,29 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { static_assert(E::thunk_size == sizeof(plt_thunk)); static_assert(E::thunk_size == sizeof(local_thunk)); - for (i64 i = 0; i < symbols.size(); i++) { - Symbol &sym = *symbols[i]; - ub32 *loc = (ub32 *)(buf + i * E::thunk_size); - - if (sym.has_got(ctx)) { - memcpy(loc, pltgot_thunk, sizeof(pltgot_thunk)); - i64 val = sym.get_got_addr(ctx) - ctx.extra.TOC->value; - loc[1] |= higha(val); - loc[2] |= lo(val); - } else if(sym.has_plt(ctx)) { - memcpy(loc, plt_thunk, sizeof(plt_thunk)); - i64 val = sym.get_gotplt_addr(ctx) - ctx.extra.TOC->value; - loc[1] |= higha(val); - loc[2] |= lo(val); + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; + u64 P = output_section.shdr.sh_addr + offset; + + for (Symbol *sym : symbols) { + if (sym->has_got(ctx)) { + i64 val = sym->get_got_addr(ctx) - ctx.extra.TOC->value; + memcpy(buf, pltgot_thunk, sizeof(pltgot_thunk)); + *(ub32 *)(buf + 4) |= higha(val); + *(ub32 *)(buf + 8) |= lo(val); + } else if(sym->has_plt(ctx)) { + i64 val = sym->get_gotplt_addr(ctx) - ctx.extra.TOC->value; + memcpy(buf, plt_thunk, sizeof(plt_thunk)); + *(ub32 *)(buf + 4) |= higha(val); + *(ub32 *)(buf + 8) |= lo(val); } else { - memcpy(loc, local_thunk, sizeof(local_thunk)); - i64 val = sym.get_addr(ctx, NO_OPD) - ctx.extra.TOC->value; - loc[0] |= higha(val); - loc[1] |= lo(val); + i64 val = sym->get_addr(ctx, NO_OPD) - ctx.extra.TOC->value; + memcpy(buf, local_thunk, sizeof(local_thunk)); + *(ub32 *)buf |= higha(val); + *(ub32 *)(buf + 4) |= lo(val); } + + buf += E::thunk_size; + P += E::thunk_size; } } diff --git a/elf/arch-ppc64v2.cc b/elf/arch-ppc64v2.cc index 91c1fa91..97dbae07 100644 --- a/elf/arch-ppc64v2.cc +++ b/elf/arch-ppc64v2.cc @@ -112,25 +112,26 @@ void write_plt_header(Context &ctx, u8 *buf) { 0x7c08'03a6, // mtlr r0 // Compute the PLT entry index - 0xe80b'002c, // ld r0, 44(r11) - 0x7d8b'6050, // subf r12, r11, r12 - 0x7d60'5a14, // add r11, r0, r11 - 
0x380c'ffcc, // addi r0, r12, -52 + 0x398c'ffd4, // addi r12, r12, -44 + 0x7c0b'6050, // subf r0, r11, r12 0x7800'f082, // rldicl r0, r0, 62, 2 + // Compute the address of .got.plt + 0x3d6b'0000, // addis r11, r11, GOTPLT_OFFSET@ha + 0x396b'0000, // addi r11, r11, GOTPLT_OFFSET@lo + // Load .got.plt[0] and .got.plt[1] and branch to .got.plt[0] 0xe98b'0000, // ld r12, 0(r11) 0x7d89'03a6, // mtctr r12 0xe96b'0008, // ld r11, 8(r11) 0x4e80'0420, // bctr - - // .quad .got.plt - .plt - 8 - 0x0000'0000, - 0x0000'0000, }; memcpy(buf, insn, sizeof(insn)); - *(ul64 *)(buf + 52) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; + + i64 val = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; + *(ul32 *)(buf + 28) |= higha(val); + *(ul32 *)(buf + 32) |= lo(val); } template <> @@ -478,8 +479,6 @@ void InputSection::scan_relocations(Context &ctx) { template <> void RangeExtensionThunk::copy_buf(Context &ctx) { - u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; - // If the destination is PLT, we read an address from .got.plt or .got // and jump there. static const ul32 plt_thunk[] = { @@ -521,34 +520,43 @@ void RangeExtensionThunk::copy_buf(Context &ctx) { static_assert(E::thunk_size == sizeof(local_thunk)); static_assert(E::thunk_size == sizeof(local_thunk_power10)); - for (i64 i = 0; i < symbols.size(); i++) { - Symbol &sym = *symbols[i]; - ul32 *loc = (ul32 *)(buf + i * E::thunk_size); + u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset; + u64 P = output_section.shdr.sh_addr + offset; - if (sym.has_plt(ctx)) { - u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx); + for (Symbol *sym : symbols) { + if (sym->has_plt(ctx)) { + u64 got = + sym->has_got(ctx) ? 
sym->get_got_addr(ctx) : sym->get_gotplt_addr(ctx); if (ctx.extra.is_power10) { - memcpy(loc, plt_thunk_power10, E::thunk_size); - *(ul64 *)(loc + 1) |= prefix34(got - get_addr(i) - 4); + memcpy(buf, plt_thunk_power10, E::thunk_size); + *(ul64 *)(buf + 1) |= prefix34(got - P - 4); } else { i64 val = got - ctx.extra.TOC->value; - memcpy(loc, plt_thunk, E::thunk_size); - loc[1] |= higha(val); - loc[2] |= lo(val); + memcpy(buf, plt_thunk, E::thunk_size); + *(ul32 *)(buf + 4) |= higha(val); + *(ul32 *)(buf + 8) |= lo(val); } } else { if (ctx.extra.is_power10) { - memcpy(loc, local_thunk_power10, E::thunk_size); - *(ul64 *)(loc + 1) |= prefix34(sym.get_addr(ctx) - get_addr(i) - 4); + memcpy(buf, local_thunk_power10, E::thunk_size); + *(ul64 *)(buf + 1) |= prefix34(sym->get_addr(ctx) - P - 4); } else { - i64 val = sym.get_addr(ctx) - ctx.extra.TOC->value; - memcpy(loc, local_thunk, E::thunk_size); - loc[1] |= higha(val); - loc[2] |= lo(val); + i64 val = sym->get_addr(ctx) - ctx.extra.TOC->value; + memcpy(buf, local_thunk, E::thunk_size); + *(ul32 *)(buf + 4) |= higha(val); + *(ul32 *)(buf + 8) |= lo(val); } } + + buf += E::thunk_size; + P += E::thunk_size; } } +template <> +u64 get_eflags(Context &ctx) { + return 2; +} + } // namespace mold::elf diff --git a/elf/arch-riscv.cc b/elf/arch-riscv.cc index 58e28f0e..c6fb8b36 100644 --- a/elf/arch-riscv.cc +++ b/elf/arch-riscv.cc @@ -69,8 +69,10 @@ #if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE +#include "elf.h" #include "mold.h" +#include #include #include @@ -111,6 +113,11 @@ static void write_jtype(u8 *loc, u32 val) { bit(val, 11) << 20 | bits(val, 19, 12) << 12; } +static void write_citype(u8 *loc, u32 val) { + *(ul16 *)loc &= 0b111'0'11111'00000'11; + *(ul16 *)loc |= bit(val, 5) << 12 | bits(val, 4, 0) << 2; +} + static void write_cbtype(u8 *loc, u32 val) { *(ul16 *)loc &= 0b111'000'111'00000'11; *(ul16 *)loc |= bit(val, 8) << 12 | bit(val, 4) << 11 | bit(val, 3) << 10 | @@ -126,11 +133,6 @@ static void 
write_cjtype(u8 *loc, u32 val) { bit(val, 1) << 3 | bit(val, 5) << 2; } -// Returns the rd register of an R/I/U/J-type instruction. -static u32 get_rd(u32 val) { - return bits(val, 11, 7); -} - static void set_rs1(u8 *loc, u32 rs1) { assert(rs1 < 32); *(ul32 *)loc &= 0b111111'11111'00000'111'11111'1111111; @@ -247,6 +249,13 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, } } +static inline bool is_hi20(const ElfRel &rel) { + u32 ty = rel.r_type; + return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 || + ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20 || + ty == R_RISCV_TLSDESC_HI20; +} + template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); @@ -277,16 +286,8 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { << lo << ", " << hi << ")"; }; - auto is_hi20 = [](const ElfRel &r) { - u32 ty = r.r_type; - return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 || - ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20; - }; - auto find_paired_reloc = [&] { - assert(sym.get_input_section() == this); - - if (sym.value < r_offset) { + if (sym.value <= rels[i].r_offset - get_r_delta(i)) { for (i64 j = i - 1; j >= 0; j--) if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j)) return j; @@ -299,6 +300,11 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { Fatal(ctx) << *this << ": paired relocation is missing: " << i; }; + auto get_rd = [&](i64 offset) { + // Returns the rd register of an R/I/U/J-type instruction. 
+ return bits(*(ul32 *)(contents.data() + offset), 11, 7); + }; + u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + r_offset; @@ -326,27 +332,30 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; case R_RISCV_CALL: case R_RISCV_CALL_PLT: { - u32 rd = get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4)); + i64 val = S + A - P; + i64 rd = get_rd(rel.r_offset + 4); + + // Calling an undefined weak symbol does not make sense. + // We make such call into an infinite loop. This should + // help debugging of a faulty program. + if (sym.esym().is_undef_weak()) + val = 0; if (removed_bytes == 4) { // auipc + jalr -> jal *(ul32 *)loc = (rd << 7) | 0b1101111; - write_jtype(loc, S + A - P); + write_jtype(loc, val); } else if (removed_bytes == 6 && rd == 0) { // auipc + jalr -> c.j *(ul16 *)loc = 0b101'00000000000'01; - write_cjtype(loc, S + A - P); + write_cjtype(loc, val); } else if (removed_bytes == 6 && rd == 1) { // auipc + jalr -> c.jal assert(!E::is_64); *(ul16 *)loc = 0b001'00000000000'01; - write_cjtype(loc, S + A - P); + write_cjtype(loc, val); } else { assert(removed_bytes == 0); - // Calling an undefined weak symbol does not make sense. - // We make such call into an infinite loop. This should - // help debugging of a faulty program. - u64 val = sym.esym().is_undef_weak() ? 
0 : S + A - P; check(val, -(1LL << 31), 1LL << 31); write_utype(loc, val); write_itype(loc + 4, val); @@ -401,8 +410,12 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { break; } case R_RISCV_HI20: - assert(removed_bytes == 0 || removed_bytes == 4); - if (removed_bytes == 0) { + if (removed_bytes == 2) { + // Rewrite LUI with C.LUI + i64 rd = get_rd(rel.r_offset); + *(ul16 *)loc = 0b011'0'00000'00000'01 | (rd << 7); + write_citype(loc, (S + A + 0x800) >> 12); + } else if (removed_bytes == 0) { check(S + A, -(1LL << 31), 1LL << 31); write_utype(loc, S + A); } @@ -415,9 +428,9 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write_stype(loc, S + A); // Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is - // accessible relative to the zero register. If the upper 20 bits - // are all zero, the corresponding LUI might have been removed. - if (bits(S + A, 31, 12) == 0) + // accessible relative to the zero register because if that's the + // case, corresponding LUI might have been removed by relaxation. 
+ if (sign_extend(S + A, 11) == S + A) set_rs1(loc, 0); break; case R_RISCV_TPREL_HI20: @@ -445,6 +458,58 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { set_rs1(loc, 4); break; } + case R_RISCV_TLSDESC_HI20: + if (removed_bytes == 0) + write_utype(loc, sym.get_tlsdesc_addr(ctx) + A - P); + break; + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: + case R_RISCV_TLSDESC_CALL: { + i64 idx2 = find_paired_reloc(); + const ElfRel &rel2 = rels[idx2]; + Symbol &sym2 = *file.symbols[rel2.r_sym]; + + u64 S = sym2.get_addr(ctx); + u64 A = rel2.r_addend; + u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2); + + switch (rel.r_type) { + case R_RISCV_TLSDESC_LOAD_LO12: + if (sym2.has_tlsdesc(ctx)) + write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); + break; + case R_RISCV_TLSDESC_ADD_LO12: + if (sym2.has_tlsdesc(ctx)) { + write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); + } else if (sym2.has_gottp(ctx)) { + *(ul32 *)loc = 0x517; // auipc a0, + write_utype(loc, sym2.get_gottp_addr(ctx) + A - P); + } else { + if (removed_bytes == 0) { + *(ul32 *)loc = 0x537; // lui a0, + write_utype(loc, S + A - ctx.tp_addr); + } + } + break; + case R_RISCV_TLSDESC_CALL: + if (sym2.has_tlsdesc(ctx)) { + // Do nothing + } else if (sym2.has_gottp(ctx)) { + // {ld,lw} a0, (a0) + *(ul32 *)loc = E::is_64 ? 
0x53503 : 0x52503; + write_itype(loc, sym2.get_gottp_addr(ctx) + A - P); + } else { + i64 val = S + A - ctx.tp_addr; + if (sign_extend(val, 11) == val) + *(ul32 *)loc = 0x513; // addi a0,zero, + else + *(ul32 *)loc = 0x50513; // addi a0,a0, + write_itype(loc, val); + } + break; + } + break; + } case R_RISCV_ADD8: loc += S + A; break; @@ -683,6 +748,9 @@ void InputSection::scan_relocations(Context &ctx) { case R_RISCV_TLS_GD_HI20: sym.flags |= NEEDS_TLSGD; break; + case R_RISCV_TLSDESC_HI20: + scan_tlsdesc(ctx, sym); + break; case R_RISCV_32_PCREL: case R_RISCV_PCREL_HI20: scan_pcrel(ctx, sym, rel); @@ -699,6 +767,9 @@ void InputSection::scan_relocations(Context &ctx) { case R_RISCV_PCREL_LO12_S: case R_RISCV_LO12_I: case R_RISCV_LO12_S: + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: + case R_RISCV_TLSDESC_CALL: case R_RISCV_ADD8: case R_RISCV_ADD16: case R_RISCV_ADD32: @@ -725,14 +796,37 @@ void InputSection::scan_relocations(Context &ctx) { } } -template +template <> +u64 get_eflags(Context &ctx) { + std::vector *> objs = ctx.objs; + std::erase(objs, ctx.internal_obj); + + if (objs.empty()) + return 0; + + u32 ret = objs[0]->get_ehdr().e_flags; + for (i64 i = 1; i < objs.size(); i++) { + u32 flags = objs[i]->get_ehdr().e_flags; + if (flags & EF_RISCV_RVC) + ret |= EF_RISCV_RVC; + + if ((flags & EF_RISCV_FLOAT_ABI) != (ret & EF_RISCV_FLOAT_ABI)) + Error(ctx) << *objs[i] << ": cannot link object files with different" + << " floating-point ABI from " << *objs[0]; + + if ((flags & EF_RISCV_RVE) != (ret & EF_RISCV_RVE)) + Error(ctx) << *objs[i] << ": cannot link object files with different" + << " EF_RISCV_RVE from " << *objs[0]; + } + return ret; +} + static bool is_resizable(Context &ctx, InputSection *isec) { return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) && (isec->shdr().sh_flags & SHF_EXECINSTR); } // Returns the distance between a relocated place and a symbol. 
-template static i64 compute_distance(Context &ctx, Symbol &sym, InputSection &isec, const ElfRel &rel) { // We handle absolute symbols as if they were infinitely far away @@ -754,11 +848,14 @@ static i64 compute_distance(Context &ctx, Symbol &sym, } // Scan relocations to shrink sections. -template static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { std::span> rels = isec.get_rels(ctx); isec.extra.r_deltas.resize(rels.size() + 1); + auto get_rd = [&](i64 offset) { + return bits(*(ul32 *)(isec.contents.data() + offset), 11, 7); + }; + i64 delta = 0; for (i64 i = 0; i < rels.size(); i++) { @@ -795,6 +892,20 @@ static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) if (sym.file == ctx.internal_obj) continue; + auto find_paired_reloc = [&] { + if (sym.value <= rels[i].r_offset) { + for (i64 j = i - 1; j >= 0; j--) + if (is_hi20(rels[j]) && sym.value == rels[j].r_offset) + return j; + } else { + for (i64 j = i + 1; j < rels.size(); j++) + if (is_hi20(rels[j]) && sym.value == rels[j].r_offset) + return j; + } + + Fatal(ctx) << isec << ": paired relocation is missing: " << i; + }; + switch (r.r_type) { case R_RISCV_CALL: case R_RISCV_CALL_PLT: { @@ -805,13 +916,13 @@ static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) if (dist & 1) break; - i64 rd = get_rd(*(ul32 *)(isec.contents.data() + r.r_offset + 4)); + i64 rd = get_rd(r.r_offset + 4); - if (rd == 0 && sign_extend(dist, 11) == dist && use_rvc) { + if (use_rvc && rd == 0 && sign_extend(dist, 11) == dist) { // If rd is x0 and the jump target is within ±2 KiB, we can use // C.J, saving 6 bytes. delta += 6; - } else if (rd == 1 && sign_extend(dist, 11) == dist && use_rvc && !E::is_64) { + } else if (use_rvc && !E::is_64 && rd == 1 && sign_extend(dist, 11) == dist) { // If rd is x1 and the jump target is within ±2 KiB, we can use // C.JAL. This is RV32 only because C.JAL is RV32-only instruction. 
delta += 6; @@ -821,38 +932,68 @@ static void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) } break; } - case R_RISCV_HI20: - // If the upper 20 bits are all zero, we can remove LUI. - // The corresponding instructions referred to by LO12_I/LO12_S - // relocations will use the zero register instead. - if (bits(sym.get_addr(ctx), 31, 12) == 0) + case R_RISCV_HI20: { + u64 val = sym.get_addr(ctx) + r.r_addend; + i64 rd = get_rd(r.r_offset); + + if (sign_extend(val, 11) == val) { + // We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)` + // instruction pair with `add t0, x0, %lo(foo)` if foo's bits + // [32:11] are all one or all zero. delta += 4; + } else if (use_rvc && rd != 0 && rd != 2 && sign_extend(val, 17) == val) { + // If the upper 20 bits can actually be represented in 6 bits, + // we can use C.LUI instead of LUI. + delta += 2; + } break; + } case R_RISCV_TPREL_HI20: case R_RISCV_TPREL_ADD: // These relocations are used to add a high 20-bit value to the // thread pointer. The following two instructions materializes - // TP + HI20(foo) in %r5, for example. + // TP + %tprel_hi20(foo) in %t0, for example. 
// - // lui a5,%tprel_hi(foo) # R_RISCV_TPREL_HI20 (symbol) - // add a5,a5,tp,%tprel_add(foo) # R_RISCV_TPREL_ADD (symbol) + // lui t0, %tprel_hi(foo) # R_RISCV_TPREL_HI20 + // add t0, t0, tp # R_RISCV_TPREL_ADD // - // Then thread-local variable `foo` is accessed with a low 12-bit - // offset like this: + // Then thread-local variable `foo` is accessed with the low + // 12-bit offset like this: // - // sw t0,%tprel_lo(foo)(a5) # R_RISCV_TPREL_LO12_S (symbol) + // sw t0, %tprel_lo(foo)(t0) # R_RISCV_TPREL_LO12_S // - // However, if the variable is at TP ±2 KiB, TP + HI20(foo) is the - // same as TP, so we can instead access the thread-local variable - // directly using TP like this: + // However, if the variable is at TP ± 2 KiB, TP + %tprel_hi20(foo) + // is the same as TP, so we can instead access the thread-local + // variable directly using TP like this: // - // sw t0,%tprel_lo(foo)(tp) + // sw t0, %tprel_lo(foo)(tp) // // Here, we remove `lui` and `add` if the offset is within ±2 KiB. if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; sign_extend(val, 11) == val) delta += 4; break; + case R_RISCV_TLSDESC_HI20: + if (!sym.has_tlsdesc(ctx)) + delta += 4; + break; + case R_RISCV_TLSDESC_LOAD_LO12: + case R_RISCV_TLSDESC_ADD_LO12: { + const ElfRel &rel2 = rels[find_paired_reloc()]; + Symbol &sym2 = *isec.file.symbols[rel2.r_sym]; + + if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) { + if (!sym2.has_tlsdesc(ctx)) + delta += 4; + } else { + assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12); + if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx)) + if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr; + sign_extend(val, 11) == val) + delta += 4; + } + break; + } } } @@ -936,11 +1077,13 @@ i64 riscv_resize_sections(Context &ctx) { // // The following functions takes care of ISA strings. 
+namespace { struct Extn { std::string name; i64 major; i64 minor; }; +} // As per the RISC-V spec, the extension names must be sorted in a very // specific way, and unfortunately that's not just an alphabetical order. @@ -981,26 +1124,14 @@ static bool extn_version_less(const Extn &e1, const Extn &e2) { } static std::optional read_extn_string(std::string_view &str) { - Extn extn; - - size_t pos = str.find_first_of("0123456789"); - if (pos == str.npos) - return {}; - - extn.name = str.substr(0, pos); - str = str.substr(pos); + auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; + static std::regex re(R"(^([a-z]+)(\d+)p(\d+))", flags); - size_t nread; - extn.major = std::stoul(std::string(str), &nread, 10); - str = str.substr(nread); - if (str.size() < 2 || str[0] != 'p') - return {}; - str = str.substr(1); - - extn.minor = std::stoul(std::string(str), &nread, 10); - str = str.substr(nread); - if (str.empty() || str[0] == '_') - return extn; + std::cmatch m; + if (std::regex_search(str.data(), str.data() + str.size(), m, re)) { + str = str.substr(m.length()); + return Extn{m[1], (i64)std::stoul(m[2]), (i64)std::stoul(m[3])}; + } return {}; } diff --git a/elf/arch-s390x.cc b/elf/arch-s390x.cc index a536550b..87ec3e8c 100644 --- a/elf/arch-s390x.cc +++ b/elf/arch-s390x.cc @@ -48,16 +48,23 @@ static void write_mid20(u8 *loc, u64 val) { template <> void write_plt_header(Context &ctx, u8 *buf) { static u8 insn[] = { + // Compute PLT_INDEX + 0xb9, 0x09, 0x00, 0x01, // sgr %r0, %r1 + 0xa7, 0x0b, 0xff, 0xc2, // aghi %r0, -62 + 0xeb, 0x10, 0x00, 0x01, 0x00, 0x0c, // srlg %r1, %r0, 1 + 0xb9, 0x08, 0x00, 0x01, // agr %r0, %r1 0xe3, 0x00, 0xf0, 0x38, 0x00, 0x24, // stg %r0, 56(%r15) + // Branch to _dl_runtime_resolve 0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_OFFSET 0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8, %r15), 8(%r1) 0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1, 16(%r1) 0x07, 0xf1, // br %r1 - 0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; 
nopr; nopr + 0x07, 0x00, 0x07, 0x00, // nopr; nopr }; memcpy(buf, insn, sizeof(insn)); - *(ub32 *)(buf + 8) = (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 6) >> 1; + *(ub32 *)(buf + 26) = + (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 24) >> 1; } template <> @@ -65,15 +72,12 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { static u8 insn[] = { 0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_ENTRY_OFFSET 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1) - 0xc0, 0x01, 0, 0, 0, 0, // lgfi %r0, PLT_INDEX - 0x07, 0xf1, // br %r1 - 0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr - 0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr + 0x0d, 0x01, // basr %r0, %r1 + 0x07, 0x00, // nopr }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 2) = (sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx)) >> 1; - *(ub32 *)(buf + 14) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } template <> @@ -356,12 +360,10 @@ void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { u64 A = frag ? frag_addend : (i64)rel.r_addend; switch (rel.r_type) { - case R_390_32: { - i64 val = S + A; - check(val, 0, 1LL << 32); - *(ub32 *)loc = val; + case R_390_32: + check(S + A, 0, 1LL << 32); + *(ub32 *)loc = S + A; break; - } case R_390_64: if (std::optional val = get_tombstone(sym, frag)) *(ub64 *)loc = *val; @@ -455,23 +457,23 @@ void InputSection::scan_relocations(Context &ctx) { // We always want to relax calls to __tls_get_offset() in statically- // linked executables because __tls_get_offset() in libc.a just calls // abort(). 
- if (ctx.arg.is_static || - (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) { - // do nothing - } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared && - !ctx.arg.z_dlopen) { + if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || + ctx.arg.is_static) { + // Do nothing + } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; } else { sym.flags |= NEEDS_TLSGD; } break; case R_390_TLS_LDM32: - case R_390_TLS_LDM64: { - bool do_relax = ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared); - if (!do_relax) + case R_390_TLS_LDM64: + if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) { + // Do nothing + } else { ctx.needs_tlsld = true; + } break; - } case R_390_TLS_LE32: case R_390_TLS_LE64: check_tlsle(ctx, sym, rel); diff --git a/elf/arch-sparc64.cc b/elf/arch-sparc64.cc index 4341f3ca..c6c455b3 100644 --- a/elf/arch-sparc64.cc +++ b/elf/arch-sparc64.cc @@ -288,36 +288,32 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // We always have to relax a GOT load to a load immediate if a // symbol is local, because R_SPARC_GOTDATA_OP cannot represent // an addend for a local symbol. - if (sym.is_imported || sym.is_ifunc()) { - *(ub32 *)loc |= bits(G, 31, 10); - } else if (sym.is_absolute()) { + if (sym.is_absolute()) { i64 val = S + A; *(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10); - } else { + } else if (sym.is_pcrel_linktime_const(ctx)) { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10); + } else { + *(ub32 *)loc |= bits(G, 31, 10); } break; - case R_SPARC_GOTDATA_OP_LOX10: { - if (sym.is_imported || sym.is_ifunc()) { - *(ub32 *)loc |= bits(G, 9, 0); - } else if (sym.is_absolute()) { + case R_SPARC_GOTDATA_OP_LOX10: + if (sym.is_absolute()) { i64 val = S + A; *(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0); - } else { + } else if (sym.is_pcrel_linktime_const(ctx)) { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 
0b1'1100'0000'0000 : 0); + } else { + *(ub32 *)loc |= bits(G, 9, 0); } break; - } case R_SPARC_GOTDATA_OP: - if (sym.is_imported || sym.is_ifunc()) - break; - if (sym.is_absolute()) { // ldx [ %g2 + %g1 ], %g1 → nop *(ub32 *)loc = 0x0100'0000; - } else { + } else if (sym.is_pcrel_linktime_const(ctx)) { // ldx [ %g2 + %g1 ], %g1 → add %g2, %g1, %g1 *(ub32 *)loc &= 0b00'11111'000000'11111'1'11111111'11111; *(ub32 *)loc |= 0b10'00000'000000'00000'0'00000000'00000; @@ -568,8 +564,9 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_SPARC_TLS_GD_CALL: case R_SPARC_TLS_LDM_CALL: - if (!ctx.arg.is_static && ctx.extra.tls_get_addr_sym->is_imported) - ctx.extra.tls_get_addr_sym->flags |= NEEDS_PLT; + if (!ctx.arg.is_static) + if (Symbol &sym = *ctx.extra.tls_get_addr_sym; sym.is_imported) + sym.flags |= NEEDS_PLT; break; case R_SPARC_TLS_LE_HIX22: case R_SPARC_TLS_LE_LOX10: diff --git a/elf/arch-x86-64.cc b/elf/arch-x86-64.cc index 329b8aba..15cf6a28 100644 --- a/elf/arch-x86-64.cc +++ b/elf/arch-x86-64.cc @@ -165,28 +165,6 @@ static u32 relax_gottpoff(u8 *loc) { return 0; } -static u32 relax_gotpc32_tlsdesc(u8 *loc) { - switch ((loc[0] << 16) | (loc[1] << 8) | loc[2]) { - case 0x488d05: return 0x48c7c0; // lea 0(%rip), %rax -> mov $0, %rax - case 0x488d0d: return 0x48c7c1; // lea 0(%rip), %rcx -> mov $0, %rcx - case 0x488d15: return 0x48c7c2; // lea 0(%rip), %rdx -> mov $0, %rdx - case 0x488d1d: return 0x48c7c3; // lea 0(%rip), %rbx -> mov $0, %rbx - case 0x488d25: return 0x48c7c4; // lea 0(%rip), %rsp -> mov $0, %rsp - case 0x488d2d: return 0x48c7c5; // lea 0(%rip), %rbp -> mov $0, %rbp - case 0x488d35: return 0x48c7c6; // lea 0(%rip), %rsi -> mov $0, %rsi - case 0x488d3d: return 0x48c7c7; // lea 0(%rip), %rdi -> mov $0, %rdi - case 0x4c8d05: return 0x49c7c0; // lea 0(%rip), %r8 -> mov $0, %r8 - case 0x4c8d0d: return 0x49c7c1; // lea 0(%rip), %r9 -> mov $0, %r9 - case 0x4c8d15: return 0x49c7c2; // lea 0(%rip), %r10 -> mov $0, %r10 - case 0x4c8d1d: return 
0x49c7c3; // lea 0(%rip), %r11 -> mov $0, %r11 - case 0x4c8d25: return 0x49c7c4; // lea 0(%rip), %r12 -> mov $0, %r12 - case 0x4c8d2d: return 0x49c7c5; // lea 0(%rip), %r13 -> mov $0, %r13 - case 0x4c8d35: return 0x49c7c6; // lea 0(%rip), %r14 -> mov $0, %r14 - case 0x4c8d3d: return 0x49c7c7; // lea 0(%rip), %r15 -> mov $0, %r15 - } - return 0; -} - // Rewrite a function call to __tls_get_addr to a cheaper instruction // sequence. We can do this when we know the thread-local variable's TP- // relative address at link-time. @@ -267,7 +245,7 @@ static void relax_gd_to_ie(u8 *loc, ElfRel rel, u64 val) { // sequence. The difference from relax_gd_to_le is that we are // materializing a Dynamic Thread Pointer for the current ELF module // instead of an address for a particular thread-local variable. -static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { +static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { switch (rel.r_type) { case R_X86_64_PLT32: case R_X86_64_PC32: { @@ -275,13 +253,18 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { // // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // e8 00 00 00 00 call __tls_get_addr + // + // The instructions are so short that we cannot rewrite them with + // "mov %fs:0, %rax" which is 9 bytes long. We use a shorter code + // sequence instead. Since "xor %eax, %eax" zero-clears %rax, the + // meaning is equivalent. 
static const u8 insn[] = { 0x31, 0xc0, // xor %eax, %eax 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax }; memcpy(loc - 3, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = val; + *(ul32 *)(loc + 5) = tls_size; break; } case R_X86_64_GOTPCREL: @@ -297,7 +280,7 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { 0x90, // nop }; memcpy(loc - 3, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = val; + *(ul32 *)(loc + 5) = tls_size; break; } case R_X86_64_PLTOFF64: { @@ -308,14 +291,12 @@ static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { // 48 01 d8 add %rbx, %rax // ff d0 call *%rax static const u8 insn[] = { - 0x31, 0xc0, // xor %eax, %eax - 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax - 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax - 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop - 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop + 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax + 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax + 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, // nop }; memcpy(loc - 3, insn, sizeof(insn)); - *(ul32 *)(loc + 5) = val; + *(ul32 *)(loc + 8) = tls_size; break; } default: @@ -400,7 +381,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { *(ul64 *)loc = S + A - P; break; case R_X86_64_GOT32: - write32s(G + A); + write32(G + A); break; case R_X86_64_GOT64: *(ul64 *)loc = G + A; @@ -425,7 +406,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { // We always want to relax GOTPCRELX relocs even if --no-relax // was given because some static PIE runtime code depends on these // relaxations. 
- if (!sym.is_imported && !sym.is_ifunc() && sym.is_relative()) { + if (sym.is_pcrel_linktime_const(ctx)) { u32 insn = relax_gotpcrelx(loc - 2); i64 val = S + A - P; if (insn && (i32)val == val) { @@ -438,7 +419,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write32s(G + GOTPLT + A - P); break; case R_X86_64_REX_GOTPCRELX: - if (!sym.is_imported && !sym.is_ifunc() && sym.is_relative()) { + if (sym.is_pcrel_linktime_const(ctx)) { u32 insn = relax_rex_gotpcrelx(loc - 3); i64 val = S + A - P; if (insn && (i32)val == val) { @@ -452,23 +433,18 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { write32s(G + GOTPLT + A - P); break; case R_X86_64_TLSGD: - if (sym.has_tlsgd(ctx)) { + if (sym.has_tlsgd(ctx)) write32s(sym.get_tlsgd_addr(ctx) + A - P); - } else if (sym.has_gottp(ctx)) { - relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P); - i++; - } else { - relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr); - i++; - } + else if (sym.has_gottp(ctx)) + relax_gd_to_ie(loc, rels[++i], sym.get_gottp_addr(ctx) - P); + else + relax_gd_to_le(loc, rels[++i], S - ctx.tp_addr); break; case R_X86_64_TLSLD: - if (ctx.got->has_tlsld(ctx)) { + if (ctx.got->has_tlsld(ctx)) write32s(ctx.got->get_tlsld_addr(ctx) + A - P); - } else { - relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin); - i++; - } + else + relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin); break; case R_X86_64_DTPOFF32: write32s(S + A - ctx.dtp_addr); @@ -495,30 +471,54 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { } break; case R_X86_64_GOTPC32_TLSDESC: + // x86-64 TLSDESC uses the following code sequence to materialize + // a TP-relative address in %rax. + // + // lea 0(%rip), %rax + // R_X86_64_GOTPC32_TLSDESC foo + // call *(%rax) + // R_X86_64_TLSDESC_CALL foo + // + // We may relax the instructions to the following for non-dlopen'd DSO + // + // mov foo@GOTTPOFF(%rip), %rax + // nop + // + // or to the following for executable. 
+ // + // mov $foo@TPOFF, %rax + // nop if (sym.has_tlsdesc(ctx)) { write32s(sym.get_tlsdesc_addr(ctx) + A - P); + } else if (sym.has_gottp(ctx)) { + // mov foo@gottpoff(%rip), %rax + loc[-3] = 0x48; + loc[-2] = 0x8b; + loc[-1] = 0x05; + write32s(sym.get_gottp_addr(ctx) + A - P); } else { - u32 insn = relax_gotpc32_tlsdesc(loc - 3); - loc[-3] = insn >> 16; - loc[-2] = insn >> 8; - loc[-1] = insn; + // mov $foo@tpoff, %rax + loc[-3] = 0x48; + loc[-2] = 0xc7; + loc[-1] = 0xc0; write32s(S - ctx.tp_addr); - assert(A == -4); } break; - case R_X86_64_SIZE32: - write32(sym.esym().st_size + A); - break; - case R_X86_64_SIZE64: - *(ul64 *)loc = sym.esym().st_size + A; - break; case R_X86_64_TLSDESC_CALL: - if (!sym.has_tlsdesc(ctx)) { + if (sym.has_tlsdesc(ctx)) { + // Do nothing + } else { // call *(%rax) -> nop loc[0] = 0x66; loc[1] = 0x90; } break; + case R_X86_64_SIZE32: + write32(sym.esym().st_size + A); + break; + case R_X86_64_SIZE64: + *(ul64 *)loc = sym.esym().st_size + A; + break; default: unreachable(); } @@ -653,6 +653,19 @@ void InputSection::scan_relocations(Context &ctx) { if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; + if (rel.r_type == R_X86_64_TLSGD || rel.r_type == R_X86_64_TLSLD) { + if (i + 1 == rels.size()) + Fatal(ctx) << *this << ": " << rel + << " must be followed by PLT or GOTPCREL"; + + if (u32 ty = rels[i + 1].r_type; + ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 && + ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL && + ty != R_X86_64_GOTPCRELX) + Fatal(ctx) << *this << ": " << rel + << " must be followed by PLT or GOTPCREL"; + } + switch (rel.r_type) { case R_X86_64_8: case R_X86_64_16: @@ -685,25 +698,12 @@ void InputSection::scan_relocations(Context &ctx) { sym.flags |= NEEDS_PLT; break; case R_X86_64_TLSGD: - if (rel.r_addend != -4) - Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD"; - - if (i + 1 == rels.size()) - Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL"; - - if (u32 ty = rels[i + 
1].r_type; - ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 && - ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL && - ty != R_X86_64_GOTPCRELX) - Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL"; - - if (ctx.arg.is_static || - (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) { + if ((ctx.arg.relax && sym.is_tprel_linktime_const(ctx)) || + ctx.arg.is_static) { // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). i++; - } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared && - !ctx.arg.z_dlopen) { + } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; i++; } else { @@ -711,18 +711,6 @@ void InputSection::scan_relocations(Context &ctx) { } break; case R_X86_64_TLSLD: - if (rel.r_addend != -4) - Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD"; - - if (i + 1 == rels.size()) - Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL"; - - if (u32 ty = rels[i + 1].r_type; - ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 && - ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL && - ty != R_X86_64_GOTPCRELX) - Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL"; - // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). 
if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared)) @@ -730,28 +718,17 @@ void InputSection::scan_relocations(Context &ctx) { else ctx.needs_tlsld = true; break; - case R_X86_64_GOTTPOFF: { - if (rel.r_addend != -4) - Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF"; - - bool do_relax = ctx.arg.relax && !ctx.arg.shared && - !sym.is_imported && relax_gottpoff(loc - 3); - if (!do_relax) + case R_X86_64_GOTTPOFF: + if (ctx.arg.relax && relax_gottpoff(loc - 3) && + sym.is_tprel_linktime_const(ctx)) { + // do nothing + } else { sym.flags |= NEEDS_GOTTP; + } break; - } - case R_X86_64_GOTPC32_TLSDESC: { - if (rel.r_addend != -4) - Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTPC32_TLSDESC"; - - if (relax_gotpc32_tlsdesc(loc - 3) == 0) - Fatal(ctx) << *this << ": GOTPC32_TLSDESC relocation is used" - << " against an invalid code sequence"; - - if (!relax_tlsdesc(ctx, sym)) - sym.flags |= NEEDS_TLSDESC; + case R_X86_64_TLSDESC_CALL: + scan_tlsdesc(ctx, sym); break; - } case R_X86_64_TPOFF32: case R_X86_64_TPOFF64: check_tlsle(ctx, sym, rel); @@ -761,7 +738,7 @@ void InputSection::scan_relocations(Context &ctx) { case R_X86_64_DTPOFF64: case R_X86_64_SIZE32: case R_X86_64_SIZE64: - case R_X86_64_TLSDESC_CALL: + case R_X86_64_GOTPC32_TLSDESC: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; diff --git a/elf/cmdline.cc b/elf/cmdline.cc index 9c3b5ec2..e2ee842a 100644 --- a/elf/cmdline.cc +++ b/elf/cmdline.cc @@ -402,7 +402,6 @@ std::vector parse_nonpositional_args(Context &ctx) { std::vector remaining; std::string_view arg; - ctx.page_size = E::page_size; ctx.arg.color_diagnostics = isatty(STDERR_FILENO); bool version_shown = false; @@ -526,16 +525,8 @@ std::vector parse_nonpositional_args(Context &ctx) { << " elf64briscv\n elf32lriscv\n elf32briscv\n" << " elf32ppc\n elf64ppc\n elf64lppc\n elf64_s390\n" << " elf64_sparc\n m68kelf\n shlelf_linux\n" - << " elf64alpha\n elf64ltsmip\n elf64btsmip\n" - << " elf64loongarch\n 
elf32loongarch"; + << " elf64alpha\n elf64loongarch\n elf32loongarch"; version_shown = true; - } else if (read_flag("mips32") || read_flag("mips32r2") || - read_flag("mips32r3") || read_flag("mips32r4") || - read_flag("mips32r5") || read_flag("mips32r6") || - read_flag("mips64") || read_flag("mips64r2") || - read_flag("mips64r3") || read_flag("mips64r4") || - read_flag("mips64r5") || read_flag("mips64r6")) { - // Ignore useless MIPS-specific flags } else if (read_arg("m")) { if (arg == "elf_x86_64") { ctx.arg.emulation = X86_64::target_name; @@ -569,10 +560,6 @@ std::vector parse_nonpositional_args(Context &ctx) { ctx.arg.emulation = SH4::target_name; } else if (arg == "elf64alpha") { ctx.arg.emulation = ALPHA::target_name; - } else if (arg == "elf64ltsmip") { - ctx.arg.emulation = MIPS64LE::target_name; - } else if (arg == "elf64btsmip") { - ctx.arg.emulation = MIPS64BE::target_name; } else if (arg == "elf64loongarch") { ctx.arg.emulation = LOONGARCH64::target_name; } else if (arg == "elf32loongarch") { @@ -1218,7 +1205,7 @@ std::vector parse_nonpositional_args(Context &ctx) { Fatal(ctx) << "-auxiliary may not be used without -shared"; } - if constexpr (!E::is_rela || is_mips) + if constexpr (!E::is_rela) if (!ctx.arg.apply_dynamic_relocs) Fatal(ctx) << "--no-apply-dynamic-relocs may not be used on " << E::target_name; diff --git a/elf/elf.cc b/elf/elf.cc index 030d6845..2ce2ec47 100644 --- a/elf/elf.cc +++ b/elf/elf.cc @@ -8,49 +8,51 @@ static std::string unknown_type(u32 r_type) { return buf; } +#define CASE(x) case x: return #x + template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_X86_64_NONE: return "R_X86_64_NONE"; - case R_X86_64_64: return "R_X86_64_64"; - case R_X86_64_PC32: return "R_X86_64_PC32"; - case R_X86_64_GOT32: return "R_X86_64_GOT32"; - case R_X86_64_PLT32: return "R_X86_64_PLT32"; - case R_X86_64_COPY: return "R_X86_64_COPY"; - case R_X86_64_GLOB_DAT: return "R_X86_64_GLOB_DAT"; - case R_X86_64_JUMP_SLOT: return 
"R_X86_64_JUMP_SLOT"; - case R_X86_64_RELATIVE: return "R_X86_64_RELATIVE"; - case R_X86_64_GOTPCREL: return "R_X86_64_GOTPCREL"; - case R_X86_64_32: return "R_X86_64_32"; - case R_X86_64_32S: return "R_X86_64_32S"; - case R_X86_64_16: return "R_X86_64_16"; - case R_X86_64_PC16: return "R_X86_64_PC16"; - case R_X86_64_8: return "R_X86_64_8"; - case R_X86_64_PC8: return "R_X86_64_PC8"; - case R_X86_64_DTPMOD64: return "R_X86_64_DTPMOD64"; - case R_X86_64_DTPOFF64: return "R_X86_64_DTPOFF64"; - case R_X86_64_TPOFF64: return "R_X86_64_TPOFF64"; - case R_X86_64_TLSGD: return "R_X86_64_TLSGD"; - case R_X86_64_TLSLD: return "R_X86_64_TLSLD"; - case R_X86_64_DTPOFF32: return "R_X86_64_DTPOFF32"; - case R_X86_64_GOTTPOFF: return "R_X86_64_GOTTPOFF"; - case R_X86_64_TPOFF32: return "R_X86_64_TPOFF32"; - case R_X86_64_PC64: return "R_X86_64_PC64"; - case R_X86_64_GOTOFF64: return "R_X86_64_GOTOFF64"; - case R_X86_64_GOTPC32: return "R_X86_64_GOTPC32"; - case R_X86_64_GOT64: return "R_X86_64_GOT64"; - case R_X86_64_GOTPCREL64: return "R_X86_64_GOTPCREL64"; - case R_X86_64_GOTPC64: return "R_X86_64_GOTPC64"; - case R_X86_64_GOTPLT64: return "R_X86_64_GOTPLT64"; - case R_X86_64_PLTOFF64: return "R_X86_64_PLTOFF64"; - case R_X86_64_SIZE32: return "R_X86_64_SIZE32"; - case R_X86_64_SIZE64: return "R_X86_64_SIZE64"; - case R_X86_64_GOTPC32_TLSDESC: return "R_X86_64_GOTPC32_TLSDESC"; - case R_X86_64_TLSDESC_CALL: return "R_X86_64_TLSDESC_CALL"; - case R_X86_64_TLSDESC: return "R_X86_64_TLSDESC"; - case R_X86_64_IRELATIVE: return "R_X86_64_IRELATIVE"; - case R_X86_64_GOTPCRELX: return "R_X86_64_GOTPCRELX"; - case R_X86_64_REX_GOTPCRELX: return "R_X86_64_REX_GOTPCRELX"; + CASE(R_X86_64_NONE); + CASE(R_X86_64_64); + CASE(R_X86_64_PC32); + CASE(R_X86_64_GOT32); + CASE(R_X86_64_PLT32); + CASE(R_X86_64_COPY); + CASE(R_X86_64_GLOB_DAT); + CASE(R_X86_64_JUMP_SLOT); + CASE(R_X86_64_RELATIVE); + CASE(R_X86_64_GOTPCREL); + CASE(R_X86_64_32); + CASE(R_X86_64_32S); + CASE(R_X86_64_16); + 
CASE(R_X86_64_PC16); + CASE(R_X86_64_8); + CASE(R_X86_64_PC8); + CASE(R_X86_64_DTPMOD64); + CASE(R_X86_64_DTPOFF64); + CASE(R_X86_64_TPOFF64); + CASE(R_X86_64_TLSGD); + CASE(R_X86_64_TLSLD); + CASE(R_X86_64_DTPOFF32); + CASE(R_X86_64_GOTTPOFF); + CASE(R_X86_64_TPOFF32); + CASE(R_X86_64_PC64); + CASE(R_X86_64_GOTOFF64); + CASE(R_X86_64_GOTPC32); + CASE(R_X86_64_GOT64); + CASE(R_X86_64_GOTPCREL64); + CASE(R_X86_64_GOTPC64); + CASE(R_X86_64_GOTPLT64); + CASE(R_X86_64_PLTOFF64); + CASE(R_X86_64_SIZE32); + CASE(R_X86_64_SIZE64); + CASE(R_X86_64_GOTPC32_TLSDESC); + CASE(R_X86_64_TLSDESC_CALL); + CASE(R_X86_64_TLSDESC); + CASE(R_X86_64_IRELATIVE); + CASE(R_X86_64_GOTPCRELX); + CASE(R_X86_64_REX_GOTPCRELX); } return unknown_type(r_type); } @@ -58,48 +60,48 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_386_NONE: return "R_386_NONE"; - case R_386_32: return "R_386_32"; - case R_386_PC32: return "R_386_PC32"; - case R_386_GOT32: return "R_386_GOT32"; - case R_386_PLT32: return "R_386_PLT32"; - case R_386_COPY: return "R_386_COPY"; - case R_386_GLOB_DAT: return "R_386_GLOB_DAT"; - case R_386_JUMP_SLOT: return "R_386_JUMP_SLOT"; - case R_386_RELATIVE: return "R_386_RELATIVE"; - case R_386_GOTOFF: return "R_386_GOTOFF"; - case R_386_GOTPC: return "R_386_GOTPC"; - case R_386_32PLT: return "R_386_32PLT"; - case R_386_TLS_TPOFF: return "R_386_TLS_TPOFF"; - case R_386_TLS_IE: return "R_386_TLS_IE"; - case R_386_TLS_GOTIE: return "R_386_TLS_GOTIE"; - case R_386_TLS_LE: return "R_386_TLS_LE"; - case R_386_TLS_GD: return "R_386_TLS_GD"; - case R_386_TLS_LDM: return "R_386_TLS_LDM"; - case R_386_16: return "R_386_16"; - case R_386_PC16: return "R_386_PC16"; - case R_386_8: return "R_386_8"; - case R_386_PC8: return "R_386_PC8"; - case R_386_TLS_GD_32: return "R_386_TLS_GD_32"; - case R_386_TLS_GD_PUSH: return "R_386_TLS_GD_PUSH"; - case R_386_TLS_GD_CALL: return "R_386_TLS_GD_CALL"; - case R_386_TLS_GD_POP: 
return "R_386_TLS_GD_POP"; - case R_386_TLS_LDM_32: return "R_386_TLS_LDM_32"; - case R_386_TLS_LDM_PUSH: return "R_386_TLS_LDM_PUSH"; - case R_386_TLS_LDM_CALL: return "R_386_TLS_LDM_CALL"; - case R_386_TLS_LDM_POP: return "R_386_TLS_LDM_POP"; - case R_386_TLS_LDO_32: return "R_386_TLS_LDO_32"; - case R_386_TLS_IE_32: return "R_386_TLS_IE_32"; - case R_386_TLS_LE_32: return "R_386_TLS_LE_32"; - case R_386_TLS_DTPMOD32: return "R_386_TLS_DTPMOD32"; - case R_386_TLS_DTPOFF32: return "R_386_TLS_DTPOFF32"; - case R_386_TLS_TPOFF32: return "R_386_TLS_TPOFF32"; - case R_386_SIZE32: return "R_386_SIZE32"; - case R_386_TLS_GOTDESC: return "R_386_TLS_GOTDESC"; - case R_386_TLS_DESC_CALL: return "R_386_TLS_DESC_CALL"; - case R_386_TLS_DESC: return "R_386_TLS_DESC"; - case R_386_IRELATIVE: return "R_386_IRELATIVE"; - case R_386_GOT32X: return "R_386_GOT32X"; + CASE(R_386_NONE); + CASE(R_386_32); + CASE(R_386_PC32); + CASE(R_386_GOT32); + CASE(R_386_PLT32); + CASE(R_386_COPY); + CASE(R_386_GLOB_DAT); + CASE(R_386_JUMP_SLOT); + CASE(R_386_RELATIVE); + CASE(R_386_GOTOFF); + CASE(R_386_GOTPC); + CASE(R_386_32PLT); + CASE(R_386_TLS_TPOFF); + CASE(R_386_TLS_IE); + CASE(R_386_TLS_GOTIE); + CASE(R_386_TLS_LE); + CASE(R_386_TLS_GD); + CASE(R_386_TLS_LDM); + CASE(R_386_16); + CASE(R_386_PC16); + CASE(R_386_8); + CASE(R_386_PC8); + CASE(R_386_TLS_GD_32); + CASE(R_386_TLS_GD_PUSH); + CASE(R_386_TLS_GD_CALL); + CASE(R_386_TLS_GD_POP); + CASE(R_386_TLS_LDM_32); + CASE(R_386_TLS_LDM_PUSH); + CASE(R_386_TLS_LDM_CALL); + CASE(R_386_TLS_LDM_POP); + CASE(R_386_TLS_LDO_32); + CASE(R_386_TLS_IE_32); + CASE(R_386_TLS_LE_32); + CASE(R_386_TLS_DTPMOD32); + CASE(R_386_TLS_DTPOFF32); + CASE(R_386_TLS_TPOFF32); + CASE(R_386_SIZE32); + CASE(R_386_TLS_GOTDESC); + CASE(R_386_TLS_DESC_CALL); + CASE(R_386_TLS_DESC); + CASE(R_386_IRELATIVE); + CASE(R_386_GOT32X); } return unknown_type(r_type); } @@ -107,110 +109,110 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) 
{ switch (r_type) { - case R_AARCH64_NONE: return "R_AARCH64_NONE"; - case R_AARCH64_ABS64: return "R_AARCH64_ABS64"; - case R_AARCH64_ABS32: return "R_AARCH64_ABS32"; - case R_AARCH64_ABS16: return "R_AARCH64_ABS16"; - case R_AARCH64_PREL64: return "R_AARCH64_PREL64"; - case R_AARCH64_PREL32: return "R_AARCH64_PREL32"; - case R_AARCH64_PREL16: return "R_AARCH64_PREL16"; - case R_AARCH64_MOVW_UABS_G0: return "R_AARCH64_MOVW_UABS_G0"; - case R_AARCH64_MOVW_UABS_G0_NC: return "R_AARCH64_MOVW_UABS_G0_NC"; - case R_AARCH64_MOVW_UABS_G1: return "R_AARCH64_MOVW_UABS_G1"; - case R_AARCH64_MOVW_UABS_G1_NC: return "R_AARCH64_MOVW_UABS_G1_NC"; - case R_AARCH64_MOVW_UABS_G2: return "R_AARCH64_MOVW_UABS_G2"; - case R_AARCH64_MOVW_UABS_G2_NC: return "R_AARCH64_MOVW_UABS_G2_NC"; - case R_AARCH64_MOVW_UABS_G3: return "R_AARCH64_MOVW_UABS_G3"; - case R_AARCH64_MOVW_SABS_G0: return "R_AARCH64_MOVW_SABS_G0"; - case R_AARCH64_MOVW_SABS_G1: return "R_AARCH64_MOVW_SABS_G1"; - case R_AARCH64_MOVW_SABS_G2: return "R_AARCH64_MOVW_SABS_G2"; - case R_AARCH64_LD_PREL_LO19: return "R_AARCH64_LD_PREL_LO19"; - case R_AARCH64_ADR_PREL_LO21: return "R_AARCH64_ADR_PREL_LO21"; - case R_AARCH64_ADR_PREL_PG_HI21: return "R_AARCH64_ADR_PREL_PG_HI21"; - case R_AARCH64_ADR_PREL_PG_HI21_NC: return "R_AARCH64_ADR_PREL_PG_HI21_NC"; - case R_AARCH64_ADD_ABS_LO12_NC: return "R_AARCH64_ADD_ABS_LO12_NC"; - case R_AARCH64_LDST8_ABS_LO12_NC: return "R_AARCH64_LDST8_ABS_LO12_NC"; - case R_AARCH64_TSTBR14: return "R_AARCH64_TSTBR14"; - case R_AARCH64_CONDBR19: return "R_AARCH64_CONDBR19"; - case R_AARCH64_JUMP26: return "R_AARCH64_JUMP26"; - case R_AARCH64_CALL26: return "R_AARCH64_CALL26"; - case R_AARCH64_LDST16_ABS_LO12_NC: return "R_AARCH64_LDST16_ABS_LO12_NC"; - case R_AARCH64_LDST32_ABS_LO12_NC: return "R_AARCH64_LDST32_ABS_LO12_NC"; - case R_AARCH64_LDST64_ABS_LO12_NC: return "R_AARCH64_LDST64_ABS_LO12_NC"; - case R_AARCH64_MOVW_PREL_G0: return "R_AARCH64_MOVW_PREL_G0"; - case R_AARCH64_MOVW_PREL_G0_NC: 
return "R_AARCH64_MOVW_PREL_G0_NC"; - case R_AARCH64_MOVW_PREL_G1: return "R_AARCH64_MOVW_PREL_G1"; - case R_AARCH64_MOVW_PREL_G1_NC: return "R_AARCH64_MOVW_PREL_G1_NC"; - case R_AARCH64_MOVW_PREL_G2: return "R_AARCH64_MOVW_PREL_G2"; - case R_AARCH64_MOVW_PREL_G2_NC: return "R_AARCH64_MOVW_PREL_G2_NC"; - case R_AARCH64_MOVW_PREL_G3: return "R_AARCH64_MOVW_PREL_G3"; - case R_AARCH64_LDST128_ABS_LO12_NC: return "R_AARCH64_LDST128_ABS_LO12_NC"; - case R_AARCH64_ADR_GOT_PAGE: return "R_AARCH64_ADR_GOT_PAGE"; - case R_AARCH64_LD64_GOT_LO12_NC: return "R_AARCH64_LD64_GOT_LO12_NC"; - case R_AARCH64_LD64_GOTPAGE_LO15: return "R_AARCH64_LD64_GOTPAGE_LO15"; - case R_AARCH64_PLT32: return "R_AARCH64_PLT32"; - case R_AARCH64_TLSGD_ADR_PREL21: return "R_AARCH64_TLSGD_ADR_PREL21"; - case R_AARCH64_TLSGD_ADR_PAGE21: return "R_AARCH64_TLSGD_ADR_PAGE21"; - case R_AARCH64_TLSGD_ADD_LO12_NC: return "R_AARCH64_TLSGD_ADD_LO12_NC"; - case R_AARCH64_TLSGD_MOVW_G1: return "R_AARCH64_TLSGD_MOVW_G1"; - case R_AARCH64_TLSGD_MOVW_G0_NC: return "R_AARCH64_TLSGD_MOVW_G0_NC"; - case R_AARCH64_TLSLD_ADR_PREL21: return "R_AARCH64_TLSLD_ADR_PREL21"; - case R_AARCH64_TLSLD_ADR_PAGE21: return "R_AARCH64_TLSLD_ADR_PAGE21"; - case R_AARCH64_TLSLD_ADD_LO12_NC: return "R_AARCH64_TLSLD_ADD_LO12_NC"; - case R_AARCH64_TLSLD_MOVW_G1: return "R_AARCH64_TLSLD_MOVW_G1"; - case R_AARCH64_TLSLD_MOVW_G0_NC: return "R_AARCH64_TLSLD_MOVW_G0_NC"; - case R_AARCH64_TLSLD_LD_PREL19: return "R_AARCH64_TLSLD_LD_PREL19"; - case R_AARCH64_TLSLD_MOVW_DTPREL_G2: return "R_AARCH64_TLSLD_MOVW_DTPREL_G2"; - case R_AARCH64_TLSLD_MOVW_DTPREL_G1: return "R_AARCH64_TLSLD_MOVW_DTPREL_G1"; - case R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: return "R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC"; - case R_AARCH64_TLSLD_MOVW_DTPREL_G0: return "R_AARCH64_TLSLD_MOVW_DTPREL_G0"; - case R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: return "R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC"; - case R_AARCH64_TLSLD_ADD_DTPREL_HI12: return "R_AARCH64_TLSLD_ADD_DTPREL_HI12"; - case 
R_AARCH64_TLSLD_ADD_DTPREL_LO12: return "R_AARCH64_TLSLD_ADD_DTPREL_LO12"; - case R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: return "R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC"; - case R_AARCH64_TLSLD_LDST8_DTPREL_LO12: return "R_AARCH64_TLSLD_LDST8_DTPREL_LO12"; - case R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC: return "R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC"; - case R_AARCH64_TLSLD_LDST16_DTPREL_LO12: return "R_AARCH64_TLSLD_LDST16_DTPREL_LO12"; - case R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC: return "R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC"; - case R_AARCH64_TLSLD_LDST32_DTPREL_LO12: return "R_AARCH64_TLSLD_LDST32_DTPREL_LO12"; - case R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC: return "R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC"; - case R_AARCH64_TLSLD_LDST64_DTPREL_LO12: return "R_AARCH64_TLSLD_LDST64_DTPREL_LO12"; - case R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC: return "R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC"; - case R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: return "R_AARCH64_TLSIE_MOVW_GOTTPREL_G1"; - case R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: return "R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC"; - case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: return "R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21"; - case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: return "R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC"; - case R_AARCH64_TLSIE_LD_GOTTPREL_PREL19: return "R_AARCH64_TLSIE_LD_GOTTPREL_PREL19"; - case R_AARCH64_TLSLE_MOVW_TPREL_G2: return "R_AARCH64_TLSLE_MOVW_TPREL_G2"; - case R_AARCH64_TLSLE_MOVW_TPREL_G1: return "R_AARCH64_TLSLE_MOVW_TPREL_G1"; - case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC: return "R_AARCH64_TLSLE_MOVW_TPREL_G1_NC"; - case R_AARCH64_TLSLE_MOVW_TPREL_G0: return "R_AARCH64_TLSLE_MOVW_TPREL_G0"; - case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: return "R_AARCH64_TLSLE_MOVW_TPREL_G0_NC"; - case R_AARCH64_TLSLE_ADD_TPREL_HI12: return "R_AARCH64_TLSLE_ADD_TPREL_HI12"; - case R_AARCH64_TLSLE_ADD_TPREL_LO12: return "R_AARCH64_TLSLE_ADD_TPREL_LO12"; - case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: return "R_AARCH64_TLSLE_ADD_TPREL_LO12_NC"; - case 
R_AARCH64_TLSLE_LDST8_TPREL_LO12: return "R_AARCH64_TLSLE_LDST8_TPREL_LO12"; - case R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC: return "R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC"; - case R_AARCH64_TLSLE_LDST16_TPREL_LO12: return "R_AARCH64_TLSLE_LDST16_TPREL_LO12"; - case R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC: return "R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC"; - case R_AARCH64_TLSLE_LDST32_TPREL_LO12: return "R_AARCH64_TLSLE_LDST32_TPREL_LO12"; - case R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC: return "R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC"; - case R_AARCH64_TLSLE_LDST64_TPREL_LO12: return "R_AARCH64_TLSLE_LDST64_TPREL_LO12"; - case R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC: return "R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC"; - case R_AARCH64_TLSDESC_ADR_PAGE21: return "R_AARCH64_TLSDESC_ADR_PAGE21"; - case R_AARCH64_TLSDESC_LD64_LO12: return "R_AARCH64_TLSDESC_LD64_LO12"; - case R_AARCH64_TLSDESC_ADD_LO12: return "R_AARCH64_TLSDESC_ADD_LO12"; - case R_AARCH64_TLSDESC_CALL: return "R_AARCH64_TLSDESC_CALL"; - case R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC: return "R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC"; - case R_AARCH64_COPY: return "R_AARCH64_COPY"; - case R_AARCH64_GLOB_DAT: return "R_AARCH64_GLOB_DAT"; - case R_AARCH64_JUMP_SLOT: return "R_AARCH64_JUMP_SLOT"; - case R_AARCH64_RELATIVE: return "R_AARCH64_RELATIVE"; - case R_AARCH64_TLS_DTPMOD64: return "R_AARCH64_TLS_DTPMOD64"; - case R_AARCH64_TLS_DTPREL64: return "R_AARCH64_TLS_DTPREL64"; - case R_AARCH64_TLS_TPREL64: return "R_AARCH64_TLS_TPREL64"; - case R_AARCH64_TLSDESC: return "R_AARCH64_TLSDESC"; - case R_AARCH64_IRELATIVE: return "R_AARCH64_IRELATIVE"; + CASE(R_AARCH64_NONE); + CASE(R_AARCH64_ABS64); + CASE(R_AARCH64_ABS32); + CASE(R_AARCH64_ABS16); + CASE(R_AARCH64_PREL64); + CASE(R_AARCH64_PREL32); + CASE(R_AARCH64_PREL16); + CASE(R_AARCH64_MOVW_UABS_G0); + CASE(R_AARCH64_MOVW_UABS_G0_NC); + CASE(R_AARCH64_MOVW_UABS_G1); + CASE(R_AARCH64_MOVW_UABS_G1_NC); + CASE(R_AARCH64_MOVW_UABS_G2); + CASE(R_AARCH64_MOVW_UABS_G2_NC); + 
CASE(R_AARCH64_MOVW_UABS_G3); + CASE(R_AARCH64_MOVW_SABS_G0); + CASE(R_AARCH64_MOVW_SABS_G1); + CASE(R_AARCH64_MOVW_SABS_G2); + CASE(R_AARCH64_LD_PREL_LO19); + CASE(R_AARCH64_ADR_PREL_LO21); + CASE(R_AARCH64_ADR_PREL_PG_HI21); + CASE(R_AARCH64_ADR_PREL_PG_HI21_NC); + CASE(R_AARCH64_ADD_ABS_LO12_NC); + CASE(R_AARCH64_LDST8_ABS_LO12_NC); + CASE(R_AARCH64_TSTBR14); + CASE(R_AARCH64_CONDBR19); + CASE(R_AARCH64_JUMP26); + CASE(R_AARCH64_CALL26); + CASE(R_AARCH64_LDST16_ABS_LO12_NC); + CASE(R_AARCH64_LDST32_ABS_LO12_NC); + CASE(R_AARCH64_LDST64_ABS_LO12_NC); + CASE(R_AARCH64_MOVW_PREL_G0); + CASE(R_AARCH64_MOVW_PREL_G0_NC); + CASE(R_AARCH64_MOVW_PREL_G1); + CASE(R_AARCH64_MOVW_PREL_G1_NC); + CASE(R_AARCH64_MOVW_PREL_G2); + CASE(R_AARCH64_MOVW_PREL_G2_NC); + CASE(R_AARCH64_MOVW_PREL_G3); + CASE(R_AARCH64_LDST128_ABS_LO12_NC); + CASE(R_AARCH64_ADR_GOT_PAGE); + CASE(R_AARCH64_LD64_GOT_LO12_NC); + CASE(R_AARCH64_LD64_GOTPAGE_LO15); + CASE(R_AARCH64_PLT32); + CASE(R_AARCH64_TLSGD_ADR_PREL21); + CASE(R_AARCH64_TLSGD_ADR_PAGE21); + CASE(R_AARCH64_TLSGD_ADD_LO12_NC); + CASE(R_AARCH64_TLSGD_MOVW_G1); + CASE(R_AARCH64_TLSGD_MOVW_G0_NC); + CASE(R_AARCH64_TLSLD_ADR_PREL21); + CASE(R_AARCH64_TLSLD_ADR_PAGE21); + CASE(R_AARCH64_TLSLD_ADD_LO12_NC); + CASE(R_AARCH64_TLSLD_MOVW_G1); + CASE(R_AARCH64_TLSLD_MOVW_G0_NC); + CASE(R_AARCH64_TLSLD_LD_PREL19); + CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G2); + CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1); + CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC); + CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0); + CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC); + CASE(R_AARCH64_TLSLD_ADD_DTPREL_HI12); + CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12); + CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC); + CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12); + CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC); + CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12); + CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC); + CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12); + CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC); + CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12); 
+ CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC); + CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1); + CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC); + CASE(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21); + CASE(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC); + CASE(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19); + CASE(R_AARCH64_TLSLE_MOVW_TPREL_G2); + CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1); + CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC); + CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0); + CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC); + CASE(R_AARCH64_TLSLE_ADD_TPREL_HI12); + CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12); + CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC); + CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12); + CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC); + CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12); + CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC); + CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12); + CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC); + CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12); + CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC); + CASE(R_AARCH64_TLSDESC_ADR_PAGE21); + CASE(R_AARCH64_TLSDESC_LD64_LO12); + CASE(R_AARCH64_TLSDESC_ADD_LO12); + CASE(R_AARCH64_TLSDESC_CALL); + CASE(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC); + CASE(R_AARCH64_COPY); + CASE(R_AARCH64_GLOB_DAT); + CASE(R_AARCH64_JUMP_SLOT); + CASE(R_AARCH64_RELATIVE); + CASE(R_AARCH64_TLS_DTPMOD64); + CASE(R_AARCH64_TLS_DTPREL64); + CASE(R_AARCH64_TLS_TPREL64); + CASE(R_AARCH64_TLSDESC); + CASE(R_AARCH64_IRELATIVE); } return unknown_type(r_type); } @@ -218,141 +220,141 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_ARM_NONE: return "R_ARM_NONE"; - case R_ARM_PC24: return "R_ARM_PC24"; - case R_ARM_ABS32: return "R_ARM_ABS32"; - case R_ARM_REL32: return "R_ARM_REL32"; - case R_ARM_LDR_PC_G0: return "R_ARM_LDR_PC_G0"; - case R_ARM_ABS16: return "R_ARM_ABS16"; - case R_ARM_ABS12: return "R_ARM_ABS12"; - case R_ARM_THM_ABS5: return "R_ARM_THM_ABS5"; - case R_ARM_ABS8: return "R_ARM_ABS8"; - case R_ARM_SBREL32: return "R_ARM_SBREL32"; - 
case R_ARM_THM_CALL: return "R_ARM_THM_CALL"; - case R_ARM_THM_PC8: return "R_ARM_THM_PC8"; - case R_ARM_BREL_ADJ: return "R_ARM_BREL_ADJ"; - case R_ARM_TLS_DESC: return "R_ARM_TLS_DESC"; - case R_ARM_THM_SWI8: return "R_ARM_THM_SWI8"; - case R_ARM_XPC25: return "R_ARM_XPC25"; - case R_ARM_THM_XPC22: return "R_ARM_THM_XPC22"; - case R_ARM_TLS_DTPMOD32: return "R_ARM_TLS_DTPMOD32"; - case R_ARM_TLS_DTPOFF32: return "R_ARM_TLS_DTPOFF32"; - case R_ARM_TLS_TPOFF32: return "R_ARM_TLS_TPOFF32"; - case R_ARM_COPY: return "R_ARM_COPY"; - case R_ARM_GLOB_DAT: return "R_ARM_GLOB_DAT"; - case R_ARM_JUMP_SLOT: return "R_ARM_JUMP_SLOT"; - case R_ARM_RELATIVE: return "R_ARM_RELATIVE"; - case R_ARM_GOTOFF32: return "R_ARM_GOTOFF32"; - case R_ARM_BASE_PREL: return "R_ARM_BASE_PREL"; - case R_ARM_GOT_BREL: return "R_ARM_GOT_BREL"; - case R_ARM_PLT32: return "R_ARM_PLT32"; - case R_ARM_CALL: return "R_ARM_CALL"; - case R_ARM_JUMP24: return "R_ARM_JUMP24"; - case R_ARM_THM_JUMP24: return "R_ARM_THM_JUMP24"; - case R_ARM_BASE_ABS: return "R_ARM_BASE_ABS"; - case R_ARM_ALU_PCREL_7_0: return "R_ARM_ALU_PCREL_7_0"; - case R_ARM_ALU_PCREL_15_8: return "R_ARM_ALU_PCREL_15_8"; - case R_ARM_ALU_PCREL_23_15: return "R_ARM_ALU_PCREL_23_15"; - case R_ARM_LDR_SBREL_11_0_NC: return "R_ARM_LDR_SBREL_11_0_NC"; - case R_ARM_ALU_SBREL_19_12_NC: return "R_ARM_ALU_SBREL_19_12_NC"; - case R_ARM_ALU_SBREL_27_20_CK: return "R_ARM_ALU_SBREL_27_20_CK"; - case R_ARM_TARGET1: return "R_ARM_TARGET1"; - case R_ARM_SBREL31: return "R_ARM_SBREL31"; - case R_ARM_V4BX: return "R_ARM_V4BX"; - case R_ARM_TARGET2: return "R_ARM_TARGET2"; - case R_ARM_PREL31: return "R_ARM_PREL31"; - case R_ARM_MOVW_ABS_NC: return "R_ARM_MOVW_ABS_NC"; - case R_ARM_MOVT_ABS: return "R_ARM_MOVT_ABS"; - case R_ARM_MOVW_PREL_NC: return "R_ARM_MOVW_PREL_NC"; - case R_ARM_MOVT_PREL: return "R_ARM_MOVT_PREL"; - case R_ARM_THM_MOVW_ABS_NC: return "R_ARM_THM_MOVW_ABS_NC"; - case R_ARM_THM_MOVT_ABS: return "R_ARM_THM_MOVT_ABS"; - case 
R_ARM_THM_MOVW_PREL_NC: return "R_ARM_THM_MOVW_PREL_NC"; - case R_ARM_THM_MOVT_PREL: return "R_ARM_THM_MOVT_PREL"; - case R_ARM_THM_JUMP19: return "R_ARM_THM_JUMP19"; - case R_ARM_THM_JUMP6: return "R_ARM_THM_JUMP6"; - case R_ARM_THM_ALU_PREL_11_0: return "R_ARM_THM_ALU_PREL_11_0"; - case R_ARM_THM_PC12: return "R_ARM_THM_PC12"; - case R_ARM_ABS32_NOI: return "R_ARM_ABS32_NOI"; - case R_ARM_REL32_NOI: return "R_ARM_REL32_NOI"; - case R_ARM_ALU_PC_G0_NC: return "R_ARM_ALU_PC_G0_NC"; - case R_ARM_ALU_PC_G0: return "R_ARM_ALU_PC_G0"; - case R_ARM_ALU_PC_G1_NC: return "R_ARM_ALU_PC_G1_NC"; - case R_ARM_ALU_PC_G1: return "R_ARM_ALU_PC_G1"; - case R_ARM_ALU_PC_G2: return "R_ARM_ALU_PC_G2"; - case R_ARM_LDR_PC_G1: return "R_ARM_LDR_PC_G1"; - case R_ARM_LDR_PC_G2: return "R_ARM_LDR_PC_G2"; - case R_ARM_LDRS_PC_G0: return "R_ARM_LDRS_PC_G0"; - case R_ARM_LDRS_PC_G1: return "R_ARM_LDRS_PC_G1"; - case R_ARM_LDRS_PC_G2: return "R_ARM_LDRS_PC_G2"; - case R_ARM_LDC_PC_G0: return "R_ARM_LDC_PC_G0"; - case R_ARM_LDC_PC_G1: return "R_ARM_LDC_PC_G1"; - case R_ARM_LDC_PC_G2: return "R_ARM_LDC_PC_G2"; - case R_ARM_ALU_SB_G0_NC: return "R_ARM_ALU_SB_G0_NC"; - case R_ARM_ALU_SB_G0: return "R_ARM_ALU_SB_G0"; - case R_ARM_ALU_SB_G1_NC: return "R_ARM_ALU_SB_G1_NC"; - case R_ARM_ALU_SB_G1: return "R_ARM_ALU_SB_G1"; - case R_ARM_ALU_SB_G2: return "R_ARM_ALU_SB_G2"; - case R_ARM_LDR_SB_G0: return "R_ARM_LDR_SB_G0"; - case R_ARM_LDR_SB_G1: return "R_ARM_LDR_SB_G1"; - case R_ARM_LDR_SB_G2: return "R_ARM_LDR_SB_G2"; - case R_ARM_LDRS_SB_G0: return "R_ARM_LDRS_SB_G0"; - case R_ARM_LDRS_SB_G1: return "R_ARM_LDRS_SB_G1"; - case R_ARM_LDRS_SB_G2: return "R_ARM_LDRS_SB_G2"; - case R_ARM_LDC_SB_G0: return "R_ARM_LDC_SB_G0"; - case R_ARM_LDC_SB_G1: return "R_ARM_LDC_SB_G1"; - case R_ARM_LDC_SB_G2: return "R_ARM_LDC_SB_G2"; - case R_ARM_MOVW_BREL_NC: return "R_ARM_MOVW_BREL_NC"; - case R_ARM_MOVT_BREL: return "R_ARM_MOVT_BREL"; - case R_ARM_MOVW_BREL: return "R_ARM_MOVW_BREL"; - case 
R_ARM_THM_MOVW_BREL_NC: return "R_ARM_THM_MOVW_BREL_NC"; - case R_ARM_THM_MOVT_BREL: return "R_ARM_THM_MOVT_BREL"; - case R_ARM_THM_MOVW_BREL: return "R_ARM_THM_MOVW_BREL"; - case R_ARM_TLS_GOTDESC: return "R_ARM_TLS_GOTDESC"; - case R_ARM_TLS_CALL: return "R_ARM_TLS_CALL"; - case R_ARM_TLS_DESCSEQ: return "R_ARM_TLS_DESCSEQ"; - case R_ARM_THM_TLS_CALL: return "R_ARM_THM_TLS_CALL"; - case R_ARM_PLT32_ABS: return "R_ARM_PLT32_ABS"; - case R_ARM_GOT_ABS: return "R_ARM_GOT_ABS"; - case R_ARM_GOT_PREL: return "R_ARM_GOT_PREL"; - case R_ARM_GOT_BREL12: return "R_ARM_GOT_BREL12"; - case R_ARM_GOTOFF12: return "R_ARM_GOTOFF12"; - case R_ARM_GOTRELAX: return "R_ARM_GOTRELAX"; - case R_ARM_GNU_VTENTRY: return "R_ARM_GNU_VTENTRY"; - case R_ARM_GNU_VTINHERIT: return "R_ARM_GNU_VTINHERIT"; - case R_ARM_THM_JUMP11: return "R_ARM_THM_JUMP11"; - case R_ARM_THM_JUMP8: return "R_ARM_THM_JUMP8"; - case R_ARM_TLS_GD32: return "R_ARM_TLS_GD32"; - case R_ARM_TLS_LDM32: return "R_ARM_TLS_LDM32"; - case R_ARM_TLS_LDO32: return "R_ARM_TLS_LDO32"; - case R_ARM_TLS_IE32: return "R_ARM_TLS_IE32"; - case R_ARM_TLS_LE32: return "R_ARM_TLS_LE32"; - case R_ARM_TLS_LDO12: return "R_ARM_TLS_LDO12"; - case R_ARM_TLS_LE12: return "R_ARM_TLS_LE12"; - case R_ARM_TLS_IE12GP: return "R_ARM_TLS_IE12GP"; - case R_ARM_PRIVATE_0: return "R_ARM_PRIVATE_0"; - case R_ARM_PRIVATE_1: return "R_ARM_PRIVATE_1"; - case R_ARM_PRIVATE_2: return "R_ARM_PRIVATE_2"; - case R_ARM_PRIVATE_3: return "R_ARM_PRIVATE_3"; - case R_ARM_PRIVATE_4: return "R_ARM_PRIVATE_4"; - case R_ARM_PRIVATE_5: return "R_ARM_PRIVATE_5"; - case R_ARM_PRIVATE_6: return "R_ARM_PRIVATE_6"; - case R_ARM_PRIVATE_7: return "R_ARM_PRIVATE_7"; - case R_ARM_PRIVATE_8: return "R_ARM_PRIVATE_8"; - case R_ARM_PRIVATE_9: return "R_ARM_PRIVATE_9"; - case R_ARM_PRIVATE_10: return "R_ARM_PRIVATE_10"; - case R_ARM_PRIVATE_11: return "R_ARM_PRIVATE_11"; - case R_ARM_PRIVATE_12: return "R_ARM_PRIVATE_12"; - case R_ARM_PRIVATE_13: return "R_ARM_PRIVATE_13"; - case 
R_ARM_PRIVATE_14: return "R_ARM_PRIVATE_14"; - case R_ARM_PRIVATE_15: return "R_ARM_PRIVATE_15"; - case R_ARM_ME_TOO: return "R_ARM_ME_TOO"; - case R_ARM_THM_TLS_DESCSEQ16: return "R_ARM_THM_TLS_DESCSEQ16"; - case R_ARM_THM_TLS_DESCSEQ32: return "R_ARM_THM_TLS_DESCSEQ32"; - case R_ARM_THM_BF16: return "R_ARM_THM_BF16"; - case R_ARM_THM_BF12: return "R_ARM_THM_BF12"; - case R_ARM_THM_BF18: return "R_ARM_THM_BF18"; - case R_ARM_IRELATIVE: return "R_ARM_IRELATIVE"; + CASE(R_ARM_NONE); + CASE(R_ARM_PC24); + CASE(R_ARM_ABS32); + CASE(R_ARM_REL32); + CASE(R_ARM_LDR_PC_G0); + CASE(R_ARM_ABS16); + CASE(R_ARM_ABS12); + CASE(R_ARM_THM_ABS5); + CASE(R_ARM_ABS8); + CASE(R_ARM_SBREL32); + CASE(R_ARM_THM_CALL); + CASE(R_ARM_THM_PC8); + CASE(R_ARM_BREL_ADJ); + CASE(R_ARM_TLS_DESC); + CASE(R_ARM_THM_SWI8); + CASE(R_ARM_XPC25); + CASE(R_ARM_THM_XPC22); + CASE(R_ARM_TLS_DTPMOD32); + CASE(R_ARM_TLS_DTPOFF32); + CASE(R_ARM_TLS_TPOFF32); + CASE(R_ARM_COPY); + CASE(R_ARM_GLOB_DAT); + CASE(R_ARM_JUMP_SLOT); + CASE(R_ARM_RELATIVE); + CASE(R_ARM_GOTOFF32); + CASE(R_ARM_BASE_PREL); + CASE(R_ARM_GOT_BREL); + CASE(R_ARM_PLT32); + CASE(R_ARM_CALL); + CASE(R_ARM_JUMP24); + CASE(R_ARM_THM_JUMP24); + CASE(R_ARM_BASE_ABS); + CASE(R_ARM_ALU_PCREL_7_0); + CASE(R_ARM_ALU_PCREL_15_8); + CASE(R_ARM_ALU_PCREL_23_15); + CASE(R_ARM_LDR_SBREL_11_0_NC); + CASE(R_ARM_ALU_SBREL_19_12_NC); + CASE(R_ARM_ALU_SBREL_27_20_CK); + CASE(R_ARM_TARGET1); + CASE(R_ARM_SBREL31); + CASE(R_ARM_V4BX); + CASE(R_ARM_TARGET2); + CASE(R_ARM_PREL31); + CASE(R_ARM_MOVW_ABS_NC); + CASE(R_ARM_MOVT_ABS); + CASE(R_ARM_MOVW_PREL_NC); + CASE(R_ARM_MOVT_PREL); + CASE(R_ARM_THM_MOVW_ABS_NC); + CASE(R_ARM_THM_MOVT_ABS); + CASE(R_ARM_THM_MOVW_PREL_NC); + CASE(R_ARM_THM_MOVT_PREL); + CASE(R_ARM_THM_JUMP19); + CASE(R_ARM_THM_JUMP6); + CASE(R_ARM_THM_ALU_PREL_11_0); + CASE(R_ARM_THM_PC12); + CASE(R_ARM_ABS32_NOI); + CASE(R_ARM_REL32_NOI); + CASE(R_ARM_ALU_PC_G0_NC); + CASE(R_ARM_ALU_PC_G0); + CASE(R_ARM_ALU_PC_G1_NC); + CASE(R_ARM_ALU_PC_G1); 
+ CASE(R_ARM_ALU_PC_G2); + CASE(R_ARM_LDR_PC_G1); + CASE(R_ARM_LDR_PC_G2); + CASE(R_ARM_LDRS_PC_G0); + CASE(R_ARM_LDRS_PC_G1); + CASE(R_ARM_LDRS_PC_G2); + CASE(R_ARM_LDC_PC_G0); + CASE(R_ARM_LDC_PC_G1); + CASE(R_ARM_LDC_PC_G2); + CASE(R_ARM_ALU_SB_G0_NC); + CASE(R_ARM_ALU_SB_G0); + CASE(R_ARM_ALU_SB_G1_NC); + CASE(R_ARM_ALU_SB_G1); + CASE(R_ARM_ALU_SB_G2); + CASE(R_ARM_LDR_SB_G0); + CASE(R_ARM_LDR_SB_G1); + CASE(R_ARM_LDR_SB_G2); + CASE(R_ARM_LDRS_SB_G0); + CASE(R_ARM_LDRS_SB_G1); + CASE(R_ARM_LDRS_SB_G2); + CASE(R_ARM_LDC_SB_G0); + CASE(R_ARM_LDC_SB_G1); + CASE(R_ARM_LDC_SB_G2); + CASE(R_ARM_MOVW_BREL_NC); + CASE(R_ARM_MOVT_BREL); + CASE(R_ARM_MOVW_BREL); + CASE(R_ARM_THM_MOVW_BREL_NC); + CASE(R_ARM_THM_MOVT_BREL); + CASE(R_ARM_THM_MOVW_BREL); + CASE(R_ARM_TLS_GOTDESC); + CASE(R_ARM_TLS_CALL); + CASE(R_ARM_TLS_DESCSEQ); + CASE(R_ARM_THM_TLS_CALL); + CASE(R_ARM_PLT32_ABS); + CASE(R_ARM_GOT_ABS); + CASE(R_ARM_GOT_PREL); + CASE(R_ARM_GOT_BREL12); + CASE(R_ARM_GOTOFF12); + CASE(R_ARM_GOTRELAX); + CASE(R_ARM_GNU_VTENTRY); + CASE(R_ARM_GNU_VTINHERIT); + CASE(R_ARM_THM_JUMP11); + CASE(R_ARM_THM_JUMP8); + CASE(R_ARM_TLS_GD32); + CASE(R_ARM_TLS_LDM32); + CASE(R_ARM_TLS_LDO32); + CASE(R_ARM_TLS_IE32); + CASE(R_ARM_TLS_LE32); + CASE(R_ARM_TLS_LDO12); + CASE(R_ARM_TLS_LE12); + CASE(R_ARM_TLS_IE12GP); + CASE(R_ARM_PRIVATE_0); + CASE(R_ARM_PRIVATE_1); + CASE(R_ARM_PRIVATE_2); + CASE(R_ARM_PRIVATE_3); + CASE(R_ARM_PRIVATE_4); + CASE(R_ARM_PRIVATE_5); + CASE(R_ARM_PRIVATE_6); + CASE(R_ARM_PRIVATE_7); + CASE(R_ARM_PRIVATE_8); + CASE(R_ARM_PRIVATE_9); + CASE(R_ARM_PRIVATE_10); + CASE(R_ARM_PRIVATE_11); + CASE(R_ARM_PRIVATE_12); + CASE(R_ARM_PRIVATE_13); + CASE(R_ARM_PRIVATE_14); + CASE(R_ARM_PRIVATE_15); + CASE(R_ARM_ME_TOO); + CASE(R_ARM_THM_TLS_DESCSEQ16); + CASE(R_ARM_THM_TLS_DESCSEQ32); + CASE(R_ARM_THM_BF16); + CASE(R_ARM_THM_BF12); + CASE(R_ARM_THM_BF18); + CASE(R_ARM_IRELATIVE); } return unknown_type(r_type); } @@ -360,58 +362,62 @@ std::string rel_to_string(u32 r_type) { 
template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_RISCV_NONE: return "R_RISCV_NONE"; - case R_RISCV_32: return "R_RISCV_32"; - case R_RISCV_64: return "R_RISCV_64"; - case R_RISCV_RELATIVE: return "R_RISCV_RELATIVE"; - case R_RISCV_COPY: return "R_RISCV_COPY"; - case R_RISCV_JUMP_SLOT: return "R_RISCV_JUMP_SLOT"; - case R_RISCV_TLS_DTPMOD32: return "R_RISCV_TLS_DTPMOD32"; - case R_RISCV_TLS_DTPMOD64: return "R_RISCV_TLS_DTPMOD64"; - case R_RISCV_TLS_DTPREL32: return "R_RISCV_TLS_DTPREL32"; - case R_RISCV_TLS_DTPREL64: return "R_RISCV_TLS_DTPREL64"; - case R_RISCV_TLS_TPREL32: return "R_RISCV_TLS_TPREL32"; - case R_RISCV_TLS_TPREL64: return "R_RISCV_TLS_TPREL64"; - case R_RISCV_BRANCH: return "R_RISCV_BRANCH"; - case R_RISCV_JAL: return "R_RISCV_JAL"; - case R_RISCV_CALL: return "R_RISCV_CALL"; - case R_RISCV_CALL_PLT: return "R_RISCV_CALL_PLT"; - case R_RISCV_GOT_HI20: return "R_RISCV_GOT_HI20"; - case R_RISCV_TLS_GOT_HI20: return "R_RISCV_TLS_GOT_HI20"; - case R_RISCV_TLS_GD_HI20: return "R_RISCV_TLS_GD_HI20"; - case R_RISCV_PCREL_HI20: return "R_RISCV_PCREL_HI20"; - case R_RISCV_PCREL_LO12_I: return "R_RISCV_PCREL_LO12_I"; - case R_RISCV_PCREL_LO12_S: return "R_RISCV_PCREL_LO12_S"; - case R_RISCV_HI20: return "R_RISCV_HI20"; - case R_RISCV_LO12_I: return "R_RISCV_LO12_I"; - case R_RISCV_LO12_S: return "R_RISCV_LO12_S"; - case R_RISCV_TPREL_HI20: return "R_RISCV_TPREL_HI20"; - case R_RISCV_TPREL_LO12_I: return "R_RISCV_TPREL_LO12_I"; - case R_RISCV_TPREL_LO12_S: return "R_RISCV_TPREL_LO12_S"; - case R_RISCV_TPREL_ADD: return "R_RISCV_TPREL_ADD"; - case R_RISCV_ADD8: return "R_RISCV_ADD8"; - case R_RISCV_ADD16: return "R_RISCV_ADD16"; - case R_RISCV_ADD32: return "R_RISCV_ADD32"; - case R_RISCV_ADD64: return "R_RISCV_ADD64"; - case R_RISCV_SUB8: return "R_RISCV_SUB8"; - case R_RISCV_SUB16: return "R_RISCV_SUB16"; - case R_RISCV_SUB32: return "R_RISCV_SUB32"; - case R_RISCV_SUB64: return "R_RISCV_SUB64"; - case R_RISCV_ALIGN: return 
"R_RISCV_ALIGN"; - case R_RISCV_RVC_BRANCH: return "R_RISCV_RVC_BRANCH"; - case R_RISCV_RVC_JUMP: return "R_RISCV_RVC_JUMP"; - case R_RISCV_RVC_LUI: return "R_RISCV_RVC_LUI"; - case R_RISCV_RELAX: return "R_RISCV_RELAX"; - case R_RISCV_SUB6: return "R_RISCV_SUB6"; - case R_RISCV_SET6: return "R_RISCV_SET6"; - case R_RISCV_SET8: return "R_RISCV_SET8"; - case R_RISCV_SET16: return "R_RISCV_SET16"; - case R_RISCV_SET32: return "R_RISCV_SET32"; - case R_RISCV_32_PCREL: return "R_RISCV_32_PCREL"; - case R_RISCV_IRELATIVE: return "R_RISCV_IRELATIVE"; - case R_RISCV_PLT32: return "R_RISCV_PLT32"; - case R_RISCV_SET_ULEB128: return "R_RISCV_SET_ULEB128"; - case R_RISCV_SUB_ULEB128: return "R_RISCV_SUB_ULEB128"; + CASE(R_RISCV_NONE); + CASE(R_RISCV_32); + CASE(R_RISCV_64); + CASE(R_RISCV_RELATIVE); + CASE(R_RISCV_COPY); + CASE(R_RISCV_JUMP_SLOT); + CASE(R_RISCV_TLS_DTPMOD32); + CASE(R_RISCV_TLS_DTPMOD64); + CASE(R_RISCV_TLS_DTPREL32); + CASE(R_RISCV_TLS_DTPREL64); + CASE(R_RISCV_TLS_TPREL32); + CASE(R_RISCV_TLS_TPREL64); + CASE(R_RISCV_BRANCH); + CASE(R_RISCV_JAL); + CASE(R_RISCV_CALL); + CASE(R_RISCV_CALL_PLT); + CASE(R_RISCV_GOT_HI20); + CASE(R_RISCV_TLS_GOT_HI20); + CASE(R_RISCV_TLS_GD_HI20); + CASE(R_RISCV_PCREL_HI20); + CASE(R_RISCV_PCREL_LO12_I); + CASE(R_RISCV_PCREL_LO12_S); + CASE(R_RISCV_HI20); + CASE(R_RISCV_LO12_I); + CASE(R_RISCV_LO12_S); + CASE(R_RISCV_TPREL_HI20); + CASE(R_RISCV_TPREL_LO12_I); + CASE(R_RISCV_TPREL_LO12_S); + CASE(R_RISCV_TPREL_ADD); + CASE(R_RISCV_ADD8); + CASE(R_RISCV_ADD16); + CASE(R_RISCV_ADD32); + CASE(R_RISCV_ADD64); + CASE(R_RISCV_SUB8); + CASE(R_RISCV_SUB16); + CASE(R_RISCV_SUB32); + CASE(R_RISCV_SUB64); + CASE(R_RISCV_ALIGN); + CASE(R_RISCV_RVC_BRANCH); + CASE(R_RISCV_RVC_JUMP); + CASE(R_RISCV_RVC_LUI); + CASE(R_RISCV_RELAX); + CASE(R_RISCV_SUB6); + CASE(R_RISCV_SET6); + CASE(R_RISCV_SET8); + CASE(R_RISCV_SET16); + CASE(R_RISCV_SET32); + CASE(R_RISCV_32_PCREL); + CASE(R_RISCV_IRELATIVE); + CASE(R_RISCV_PLT32); + 
CASE(R_RISCV_SET_ULEB128); + CASE(R_RISCV_SUB_ULEB128); + CASE(R_RISCV_TLSDESC_HI20); + CASE(R_RISCV_TLSDESC_LOAD_LO12); + CASE(R_RISCV_TLSDESC_ADD_LO12); + CASE(R_RISCV_TLSDESC_CALL); } return unknown_type(r_type); } @@ -434,81 +440,81 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_PPC_NONE: return "R_PPC_NONE"; - case R_PPC_ADDR32: return "R_PPC_ADDR32"; - case R_PPC_ADDR24: return "R_PPC_ADDR24"; - case R_PPC_ADDR16: return "R_PPC_ADDR16"; - case R_PPC_ADDR16_LO: return "R_PPC_ADDR16_LO"; - case R_PPC_ADDR16_HI: return "R_PPC_ADDR16_HI"; - case R_PPC_ADDR16_HA: return "R_PPC_ADDR16_HA"; - case R_PPC_ADDR14: return "R_PPC_ADDR14"; - case R_PPC_ADDR14_BRTAKEN: return "R_PPC_ADDR14_BRTAKEN"; - case R_PPC_ADDR14_BRNTAKEN: return "R_PPC_ADDR14_BRNTAKEN"; - case R_PPC_REL24: return "R_PPC_REL24"; - case R_PPC_REL14: return "R_PPC_REL14"; - case R_PPC_REL14_BRTAKEN: return "R_PPC_REL14_BRTAKEN"; - case R_PPC_REL14_BRNTAKEN: return "R_PPC_REL14_BRNTAKEN"; - case R_PPC_GOT16: return "R_PPC_GOT16"; - case R_PPC_GOT16_LO: return "R_PPC_GOT16_LO"; - case R_PPC_GOT16_HI: return "R_PPC_GOT16_HI"; - case R_PPC_GOT16_HA: return "R_PPC_GOT16_HA"; - case R_PPC_PLTREL24: return "R_PPC_PLTREL24"; - case R_PPC_COPY: return "R_PPC_COPY"; - case R_PPC_GLOB_DAT: return "R_PPC_GLOB_DAT"; - case R_PPC_JMP_SLOT: return "R_PPC_JMP_SLOT"; - case R_PPC_RELATIVE: return "R_PPC_RELATIVE"; - case R_PPC_LOCAL24PC: return "R_PPC_LOCAL24PC"; - case R_PPC_UADDR32: return "R_PPC_UADDR32"; - case R_PPC_UADDR16: return "R_PPC_UADDR16"; - case R_PPC_REL32: return "R_PPC_REL32"; - case R_PPC_PLT32: return "R_PPC_PLT32"; - case R_PPC_PLTREL32: return "R_PPC_PLTREL32"; - case R_PPC_PLT16_LO: return "R_PPC_PLT16_LO"; - case R_PPC_PLT16_HI: return "R_PPC_PLT16_HI"; - case R_PPC_PLT16_HA: return "R_PPC_PLT16_HA"; - case R_PPC_SDAREL16: return "R_PPC_SDAREL16"; - case R_PPC_SECTOFF: return "R_PPC_SECTOFF"; - case R_PPC_SECTOFF_LO: 
return "R_PPC_SECTOFF_LO"; - case R_PPC_SECTOFF_HI: return "R_PPC_SECTOFF_HI"; - case R_PPC_SECTOFF_HA: return "R_PPC_SECTOFF_HA"; - case R_PPC_ADDR30: return "R_PPC_ADDR30"; - case R_PPC_TLS: return "R_PPC_TLS"; - case R_PPC_DTPMOD32: return "R_PPC_DTPMOD32"; - case R_PPC_TPREL16: return "R_PPC_TPREL16"; - case R_PPC_TPREL16_LO: return "R_PPC_TPREL16_LO"; - case R_PPC_TPREL16_HI: return "R_PPC_TPREL16_HI"; - case R_PPC_TPREL16_HA: return "R_PPC_TPREL16_HA"; - case R_PPC_TPREL32: return "R_PPC_TPREL32"; - case R_PPC_DTPREL16: return "R_PPC_DTPREL16"; - case R_PPC_DTPREL16_LO: return "R_PPC_DTPREL16_LO"; - case R_PPC_DTPREL16_HI: return "R_PPC_DTPREL16_HI"; - case R_PPC_DTPREL16_HA: return "R_PPC_DTPREL16_HA"; - case R_PPC_DTPREL32: return "R_PPC_DTPREL32"; - case R_PPC_GOT_TLSGD16: return "R_PPC_GOT_TLSGD16"; - case R_PPC_GOT_TLSGD16_LO: return "R_PPC_GOT_TLSGD16_LO"; - case R_PPC_GOT_TLSGD16_HI: return "R_PPC_GOT_TLSGD16_HI"; - case R_PPC_GOT_TLSGD16_HA: return "R_PPC_GOT_TLSGD16_HA"; - case R_PPC_GOT_TLSLD16: return "R_PPC_GOT_TLSLD16"; - case R_PPC_GOT_TLSLD16_LO: return "R_PPC_GOT_TLSLD16_LO"; - case R_PPC_GOT_TLSLD16_HI: return "R_PPC_GOT_TLSLD16_HI"; - case R_PPC_GOT_TLSLD16_HA: return "R_PPC_GOT_TLSLD16_HA"; - case R_PPC_GOT_TPREL16: return "R_PPC_GOT_TPREL16"; - case R_PPC_GOT_TPREL16_LO: return "R_PPC_GOT_TPREL16_LO"; - case R_PPC_GOT_TPREL16_HI: return "R_PPC_GOT_TPREL16_HI"; - case R_PPC_GOT_TPREL16_HA: return "R_PPC_GOT_TPREL16_HA"; - case R_PPC_GOT_DTPREL16: return "R_PPC_GOT_DTPREL16"; - case R_PPC_GOT_DTPREL16_LO: return "R_PPC_GOT_DTPREL16_LO"; - case R_PPC_GOT_DTPREL16_HI: return "R_PPC_GOT_DTPREL16_HI"; - case R_PPC_GOT_DTPREL16_HA: return "R_PPC_GOT_DTPREL16_HA"; - case R_PPC_TLSGD: return "R_PPC_TLSGD"; - case R_PPC_TLSLD: return "R_PPC_TLSLD"; - case R_PPC_PLTSEQ: return "R_PPC_PLTSEQ"; - case R_PPC_PLTCALL: return "R_PPC_PLTCALL"; - case R_PPC_IRELATIVE: return "R_PPC_IRELATIVE"; - case R_PPC_REL16: return "R_PPC_REL16"; - case R_PPC_REL16_LO: 
return "R_PPC_REL16_LO"; - case R_PPC_REL16_HI: return "R_PPC_REL16_HI"; - case R_PPC_REL16_HA: return "R_PPC_REL16_HA"; + CASE(R_PPC_NONE); + CASE(R_PPC_ADDR32); + CASE(R_PPC_ADDR24); + CASE(R_PPC_ADDR16); + CASE(R_PPC_ADDR16_LO); + CASE(R_PPC_ADDR16_HI); + CASE(R_PPC_ADDR16_HA); + CASE(R_PPC_ADDR14); + CASE(R_PPC_ADDR14_BRTAKEN); + CASE(R_PPC_ADDR14_BRNTAKEN); + CASE(R_PPC_REL24); + CASE(R_PPC_REL14); + CASE(R_PPC_REL14_BRTAKEN); + CASE(R_PPC_REL14_BRNTAKEN); + CASE(R_PPC_GOT16); + CASE(R_PPC_GOT16_LO); + CASE(R_PPC_GOT16_HI); + CASE(R_PPC_GOT16_HA); + CASE(R_PPC_PLTREL24); + CASE(R_PPC_COPY); + CASE(R_PPC_GLOB_DAT); + CASE(R_PPC_JMP_SLOT); + CASE(R_PPC_RELATIVE); + CASE(R_PPC_LOCAL24PC); + CASE(R_PPC_UADDR32); + CASE(R_PPC_UADDR16); + CASE(R_PPC_REL32); + CASE(R_PPC_PLT32); + CASE(R_PPC_PLTREL32); + CASE(R_PPC_PLT16_LO); + CASE(R_PPC_PLT16_HI); + CASE(R_PPC_PLT16_HA); + CASE(R_PPC_SDAREL16); + CASE(R_PPC_SECTOFF); + CASE(R_PPC_SECTOFF_LO); + CASE(R_PPC_SECTOFF_HI); + CASE(R_PPC_SECTOFF_HA); + CASE(R_PPC_ADDR30); + CASE(R_PPC_TLS); + CASE(R_PPC_DTPMOD32); + CASE(R_PPC_TPREL16); + CASE(R_PPC_TPREL16_LO); + CASE(R_PPC_TPREL16_HI); + CASE(R_PPC_TPREL16_HA); + CASE(R_PPC_TPREL32); + CASE(R_PPC_DTPREL16); + CASE(R_PPC_DTPREL16_LO); + CASE(R_PPC_DTPREL16_HI); + CASE(R_PPC_DTPREL16_HA); + CASE(R_PPC_DTPREL32); + CASE(R_PPC_GOT_TLSGD16); + CASE(R_PPC_GOT_TLSGD16_LO); + CASE(R_PPC_GOT_TLSGD16_HI); + CASE(R_PPC_GOT_TLSGD16_HA); + CASE(R_PPC_GOT_TLSLD16); + CASE(R_PPC_GOT_TLSLD16_LO); + CASE(R_PPC_GOT_TLSLD16_HI); + CASE(R_PPC_GOT_TLSLD16_HA); + CASE(R_PPC_GOT_TPREL16); + CASE(R_PPC_GOT_TPREL16_LO); + CASE(R_PPC_GOT_TPREL16_HI); + CASE(R_PPC_GOT_TPREL16_HA); + CASE(R_PPC_GOT_DTPREL16); + CASE(R_PPC_GOT_DTPREL16_LO); + CASE(R_PPC_GOT_DTPREL16_HI); + CASE(R_PPC_GOT_DTPREL16_HA); + CASE(R_PPC_TLSGD); + CASE(R_PPC_TLSLD); + CASE(R_PPC_PLTSEQ); + CASE(R_PPC_PLTCALL); + CASE(R_PPC_IRELATIVE); + CASE(R_PPC_REL16); + CASE(R_PPC_REL16_LO); + CASE(R_PPC_REL16_HI); + 
CASE(R_PPC_REL16_HA); } return unknown_type(r_type); } @@ -516,118 +522,118 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_PPC64_NONE: return "R_PPC64_NONE"; - case R_PPC64_ADDR32: return "R_PPC64_ADDR32"; - case R_PPC64_ADDR24: return "R_PPC64_ADDR24"; - case R_PPC64_ADDR16: return "R_PPC64_ADDR16"; - case R_PPC64_ADDR16_LO: return "R_PPC64_ADDR16_LO"; - case R_PPC64_ADDR16_HI: return "R_PPC64_ADDR16_HI"; - case R_PPC64_ADDR16_HA: return "R_PPC64_ADDR16_HA"; - case R_PPC64_ADDR14: return "R_PPC64_ADDR14"; - case R_PPC64_ADDR14_BRTAKEN: return "R_PPC64_ADDR14_BRTAKEN"; - case R_PPC64_ADDR14_BRNTAKEN: return "R_PPC64_ADDR14_BRNTAKEN"; - case R_PPC64_REL24: return "R_PPC64_REL24"; - case R_PPC64_REL14: return "R_PPC64_REL14"; - case R_PPC64_REL14_BRTAKEN: return "R_PPC64_REL14_BRTAKEN"; - case R_PPC64_REL14_BRNTAKEN: return "R_PPC64_REL14_BRNTAKEN"; - case R_PPC64_GOT16: return "R_PPC64_GOT16"; - case R_PPC64_GOT16_LO: return "R_PPC64_GOT16_LO"; - case R_PPC64_GOT16_HI: return "R_PPC64_GOT16_HI"; - case R_PPC64_GOT16_HA: return "R_PPC64_GOT16_HA"; - case R_PPC64_COPY: return "R_PPC64_COPY"; - case R_PPC64_GLOB_DAT: return "R_PPC64_GLOB_DAT"; - case R_PPC64_JMP_SLOT: return "R_PPC64_JMP_SLOT"; - case R_PPC64_RELATIVE: return "R_PPC64_RELATIVE"; - case R_PPC64_REL32: return "R_PPC64_REL32"; - case R_PPC64_PLT16_LO: return "R_PPC64_PLT16_LO"; - case R_PPC64_PLT16_HI: return "R_PPC64_PLT16_HI"; - case R_PPC64_PLT16_HA: return "R_PPC64_PLT16_HA"; - case R_PPC64_ADDR64: return "R_PPC64_ADDR64"; - case R_PPC64_ADDR16_HIGHER: return "R_PPC64_ADDR16_HIGHER"; - case R_PPC64_ADDR16_HIGHERA: return "R_PPC64_ADDR16_HIGHERA"; - case R_PPC64_ADDR16_HIGHEST: return "R_PPC64_ADDR16_HIGHEST"; - case R_PPC64_ADDR16_HIGHESTA: return "R_PPC64_ADDR16_HIGHESTA"; - case R_PPC64_REL64: return "R_PPC64_REL64"; - case R_PPC64_TOC16: return "R_PPC64_TOC16"; - case R_PPC64_TOC16_LO: return "R_PPC64_TOC16_LO"; - case 
R_PPC64_TOC16_HI: return "R_PPC64_TOC16_HI"; - case R_PPC64_TOC16_HA: return "R_PPC64_TOC16_HA"; - case R_PPC64_TOC: return "R_PPC64_TOC"; - case R_PPC64_ADDR16_DS: return "R_PPC64_ADDR16_DS"; - case R_PPC64_ADDR16_LO_DS: return "R_PPC64_ADDR16_LO_DS"; - case R_PPC64_GOT16_DS: return "R_PPC64_GOT16_DS"; - case R_PPC64_GOT16_LO_DS: return "R_PPC64_GOT16_LO_DS"; - case R_PPC64_PLT16_LO_DS: return "R_PPC64_PLT16_LO_DS"; - case R_PPC64_TOC16_DS: return "R_PPC64_TOC16_DS"; - case R_PPC64_TOC16_LO_DS: return "R_PPC64_TOC16_LO_DS"; - case R_PPC64_TLS: return "R_PPC64_TLS"; - case R_PPC64_DTPMOD64: return "R_PPC64_DTPMOD64"; - case R_PPC64_TPREL16: return "R_PPC64_TPREL16"; - case R_PPC64_TPREL16_LO: return "R_PPC64_TPREL16_LO"; - case R_PPC64_TPREL16_HI: return "R_PPC64_TPREL16_HI"; - case R_PPC64_TPREL16_HA: return "R_PPC64_TPREL16_HA"; - case R_PPC64_TPREL64: return "R_PPC64_TPREL64"; - case R_PPC64_DTPREL16: return "R_PPC64_DTPREL16"; - case R_PPC64_DTPREL16_LO: return "R_PPC64_DTPREL16_LO"; - case R_PPC64_DTPREL16_HI: return "R_PPC64_DTPREL16_HI"; - case R_PPC64_DTPREL16_HA: return "R_PPC64_DTPREL16_HA"; - case R_PPC64_DTPREL64: return "R_PPC64_DTPREL64"; - case R_PPC64_GOT_TLSGD16: return "R_PPC64_GOT_TLSGD16"; - case R_PPC64_GOT_TLSGD16_LO: return "R_PPC64_GOT_TLSGD16_LO"; - case R_PPC64_GOT_TLSGD16_HI: return "R_PPC64_GOT_TLSGD16_HI"; - case R_PPC64_GOT_TLSGD16_HA: return "R_PPC64_GOT_TLSGD16_HA"; - case R_PPC64_GOT_TLSLD16: return "R_PPC64_GOT_TLSLD16"; - case R_PPC64_GOT_TLSLD16_LO: return "R_PPC64_GOT_TLSLD16_LO"; - case R_PPC64_GOT_TLSLD16_HI: return "R_PPC64_GOT_TLSLD16_HI"; - case R_PPC64_GOT_TLSLD16_HA: return "R_PPC64_GOT_TLSLD16_HA"; - case R_PPC64_GOT_TPREL16_DS: return "R_PPC64_GOT_TPREL16_DS"; - case R_PPC64_GOT_TPREL16_LO_DS: return "R_PPC64_GOT_TPREL16_LO_DS"; - case R_PPC64_GOT_TPREL16_HI: return "R_PPC64_GOT_TPREL16_HI"; - case R_PPC64_GOT_TPREL16_HA: return "R_PPC64_GOT_TPREL16_HA"; - case R_PPC64_GOT_DTPREL16_DS: return "R_PPC64_GOT_DTPREL16_DS"; 
- case R_PPC64_GOT_DTPREL16_LO_DS: return "R_PPC64_GOT_DTPREL16_LO_DS"; - case R_PPC64_GOT_DTPREL16_HI: return "R_PPC64_GOT_DTPREL16_HI"; - case R_PPC64_GOT_DTPREL16_HA: return "R_PPC64_GOT_DTPREL16_HA"; - case R_PPC64_TPREL16_DS: return "R_PPC64_TPREL16_DS"; - case R_PPC64_TPREL16_LO_DS: return "R_PPC64_TPREL16_LO_DS"; - case R_PPC64_TPREL16_HIGHER: return "R_PPC64_TPREL16_HIGHER"; - case R_PPC64_TPREL16_HIGHERA: return "R_PPC64_TPREL16_HIGHERA"; - case R_PPC64_TPREL16_HIGHEST: return "R_PPC64_TPREL16_HIGHEST"; - case R_PPC64_TPREL16_HIGHESTA: return "R_PPC64_TPREL16_HIGHESTA"; - case R_PPC64_DTPREL16_DS: return "R_PPC64_DTPREL16_DS"; - case R_PPC64_DTPREL16_LO_DS: return "R_PPC64_DTPREL16_LO_DS"; - case R_PPC64_DTPREL16_HIGHER: return "R_PPC64_DTPREL16_HIGHER"; - case R_PPC64_DTPREL16_HIGHERA: return "R_PPC64_DTPREL16_HIGHERA"; - case R_PPC64_DTPREL16_HIGHEST: return "R_PPC64_DTPREL16_HIGHEST"; - case R_PPC64_DTPREL16_HIGHESTA: return "R_PPC64_DTPREL16_HIGHESTA"; - case R_PPC64_TLSGD: return "R_PPC64_TLSGD"; - case R_PPC64_TLSLD: return "R_PPC64_TLSLD"; - case R_PPC64_ADDR16_HIGH: return "R_PPC64_ADDR16_HIGH"; - case R_PPC64_ADDR16_HIGHA: return "R_PPC64_ADDR16_HIGHA"; - case R_PPC64_TPREL16_HIGH: return "R_PPC64_TPREL16_HIGH"; - case R_PPC64_TPREL16_HIGHA: return "R_PPC64_TPREL16_HIGHA"; - case R_PPC64_DTPREL16_HIGH: return "R_PPC64_DTPREL16_HIGH"; - case R_PPC64_DTPREL16_HIGHA: return "R_PPC64_DTPREL16_HIGHA"; - case R_PPC64_REL24_NOTOC: return "R_PPC64_REL24_NOTOC"; - case R_PPC64_PLTSEQ: return "R_PPC64_PLTSEQ"; - case R_PPC64_PLTCALL: return "R_PPC64_PLTCALL"; - case R_PPC64_PLTSEQ_NOTOC: return "R_PPC64_PLTSEQ_NOTOC"; - case R_PPC64_PLTCALL_NOTOC: return "R_PPC64_PLTCALL_NOTOC"; - case R_PPC64_PCREL_OPT: return "R_PPC64_PCREL_OPT"; - case R_PPC64_PCREL34: return "R_PPC64_PCREL34"; - case R_PPC64_GOT_PCREL34: return "R_PPC64_GOT_PCREL34"; - case R_PPC64_PLT_PCREL34: return "R_PPC64_PLT_PCREL34"; - case R_PPC64_PLT_PCREL34_NOTOC: return 
"R_PPC64_PLT_PCREL34_NOTOC"; - case R_PPC64_TPREL34: return "R_PPC64_TPREL34"; - case R_PPC64_DTPREL34: return "R_PPC64_DTPREL34"; - case R_PPC64_GOT_TLSGD_PCREL34: return "R_PPC64_GOT_TLSGD_PCREL34"; - case R_PPC64_GOT_TLSLD_PCREL34: return "R_PPC64_GOT_TLSLD_PCREL34"; - case R_PPC64_GOT_TPREL_PCREL34: return "R_PPC64_GOT_TPREL_PCREL34"; - case R_PPC64_IRELATIVE: return "R_PPC64_IRELATIVE"; - case R_PPC64_REL16: return "R_PPC64_REL16"; - case R_PPC64_REL16_LO: return "R_PPC64_REL16_LO"; - case R_PPC64_REL16_HI: return "R_PPC64_REL16_HI"; - case R_PPC64_REL16_HA: return "R_PPC64_REL16_HA"; + CASE(R_PPC64_NONE); + CASE(R_PPC64_ADDR32); + CASE(R_PPC64_ADDR24); + CASE(R_PPC64_ADDR16); + CASE(R_PPC64_ADDR16_LO); + CASE(R_PPC64_ADDR16_HI); + CASE(R_PPC64_ADDR16_HA); + CASE(R_PPC64_ADDR14); + CASE(R_PPC64_ADDR14_BRTAKEN); + CASE(R_PPC64_ADDR14_BRNTAKEN); + CASE(R_PPC64_REL24); + CASE(R_PPC64_REL14); + CASE(R_PPC64_REL14_BRTAKEN); + CASE(R_PPC64_REL14_BRNTAKEN); + CASE(R_PPC64_GOT16); + CASE(R_PPC64_GOT16_LO); + CASE(R_PPC64_GOT16_HI); + CASE(R_PPC64_GOT16_HA); + CASE(R_PPC64_COPY); + CASE(R_PPC64_GLOB_DAT); + CASE(R_PPC64_JMP_SLOT); + CASE(R_PPC64_RELATIVE); + CASE(R_PPC64_REL32); + CASE(R_PPC64_PLT16_LO); + CASE(R_PPC64_PLT16_HI); + CASE(R_PPC64_PLT16_HA); + CASE(R_PPC64_ADDR64); + CASE(R_PPC64_ADDR16_HIGHER); + CASE(R_PPC64_ADDR16_HIGHERA); + CASE(R_PPC64_ADDR16_HIGHEST); + CASE(R_PPC64_ADDR16_HIGHESTA); + CASE(R_PPC64_REL64); + CASE(R_PPC64_TOC16); + CASE(R_PPC64_TOC16_LO); + CASE(R_PPC64_TOC16_HI); + CASE(R_PPC64_TOC16_HA); + CASE(R_PPC64_TOC); + CASE(R_PPC64_ADDR16_DS); + CASE(R_PPC64_ADDR16_LO_DS); + CASE(R_PPC64_GOT16_DS); + CASE(R_PPC64_GOT16_LO_DS); + CASE(R_PPC64_PLT16_LO_DS); + CASE(R_PPC64_TOC16_DS); + CASE(R_PPC64_TOC16_LO_DS); + CASE(R_PPC64_TLS); + CASE(R_PPC64_DTPMOD64); + CASE(R_PPC64_TPREL16); + CASE(R_PPC64_TPREL16_LO); + CASE(R_PPC64_TPREL16_HI); + CASE(R_PPC64_TPREL16_HA); + CASE(R_PPC64_TPREL64); + CASE(R_PPC64_DTPREL16); + 
CASE(R_PPC64_DTPREL16_LO); + CASE(R_PPC64_DTPREL16_HI); + CASE(R_PPC64_DTPREL16_HA); + CASE(R_PPC64_DTPREL64); + CASE(R_PPC64_GOT_TLSGD16); + CASE(R_PPC64_GOT_TLSGD16_LO); + CASE(R_PPC64_GOT_TLSGD16_HI); + CASE(R_PPC64_GOT_TLSGD16_HA); + CASE(R_PPC64_GOT_TLSLD16); + CASE(R_PPC64_GOT_TLSLD16_LO); + CASE(R_PPC64_GOT_TLSLD16_HI); + CASE(R_PPC64_GOT_TLSLD16_HA); + CASE(R_PPC64_GOT_TPREL16_DS); + CASE(R_PPC64_GOT_TPREL16_LO_DS); + CASE(R_PPC64_GOT_TPREL16_HI); + CASE(R_PPC64_GOT_TPREL16_HA); + CASE(R_PPC64_GOT_DTPREL16_DS); + CASE(R_PPC64_GOT_DTPREL16_LO_DS); + CASE(R_PPC64_GOT_DTPREL16_HI); + CASE(R_PPC64_GOT_DTPREL16_HA); + CASE(R_PPC64_TPREL16_DS); + CASE(R_PPC64_TPREL16_LO_DS); + CASE(R_PPC64_TPREL16_HIGHER); + CASE(R_PPC64_TPREL16_HIGHERA); + CASE(R_PPC64_TPREL16_HIGHEST); + CASE(R_PPC64_TPREL16_HIGHESTA); + CASE(R_PPC64_DTPREL16_DS); + CASE(R_PPC64_DTPREL16_LO_DS); + CASE(R_PPC64_DTPREL16_HIGHER); + CASE(R_PPC64_DTPREL16_HIGHERA); + CASE(R_PPC64_DTPREL16_HIGHEST); + CASE(R_PPC64_DTPREL16_HIGHESTA); + CASE(R_PPC64_TLSGD); + CASE(R_PPC64_TLSLD); + CASE(R_PPC64_ADDR16_HIGH); + CASE(R_PPC64_ADDR16_HIGHA); + CASE(R_PPC64_TPREL16_HIGH); + CASE(R_PPC64_TPREL16_HIGHA); + CASE(R_PPC64_DTPREL16_HIGH); + CASE(R_PPC64_DTPREL16_HIGHA); + CASE(R_PPC64_REL24_NOTOC); + CASE(R_PPC64_PLTSEQ); + CASE(R_PPC64_PLTCALL); + CASE(R_PPC64_PLTSEQ_NOTOC); + CASE(R_PPC64_PLTCALL_NOTOC); + CASE(R_PPC64_PCREL_OPT); + CASE(R_PPC64_PCREL34); + CASE(R_PPC64_GOT_PCREL34); + CASE(R_PPC64_PLT_PCREL34); + CASE(R_PPC64_PLT_PCREL34_NOTOC); + CASE(R_PPC64_TPREL34); + CASE(R_PPC64_DTPREL34); + CASE(R_PPC64_GOT_TLSGD_PCREL34); + CASE(R_PPC64_GOT_TLSLD_PCREL34); + CASE(R_PPC64_GOT_TPREL_PCREL34); + CASE(R_PPC64_IRELATIVE); + CASE(R_PPC64_REL16); + CASE(R_PPC64_REL16_LO); + CASE(R_PPC64_REL16_HI); + CASE(R_PPC64_REL16_HA); } return unknown_type(r_type); } @@ -640,91 +646,91 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_SPARC_NONE: 
return "R_SPARC_NONE"; - case R_SPARC_8: return "R_SPARC_8"; - case R_SPARC_16: return "R_SPARC_16"; - case R_SPARC_32: return "R_SPARC_32"; - case R_SPARC_DISP8: return "R_SPARC_DISP8"; - case R_SPARC_DISP16: return "R_SPARC_DISP16"; - case R_SPARC_DISP32: return "R_SPARC_DISP32"; - case R_SPARC_WDISP30: return "R_SPARC_WDISP30"; - case R_SPARC_WDISP22: return "R_SPARC_WDISP22"; - case R_SPARC_HI22: return "R_SPARC_HI22"; - case R_SPARC_22: return "R_SPARC_22"; - case R_SPARC_13: return "R_SPARC_13"; - case R_SPARC_LO10: return "R_SPARC_LO10"; - case R_SPARC_GOT10: return "R_SPARC_GOT10"; - case R_SPARC_GOT13: return "R_SPARC_GOT13"; - case R_SPARC_GOT22: return "R_SPARC_GOT22"; - case R_SPARC_PC10: return "R_SPARC_PC10"; - case R_SPARC_PC22: return "R_SPARC_PC22"; - case R_SPARC_WPLT30: return "R_SPARC_WPLT30"; - case R_SPARC_COPY: return "R_SPARC_COPY"; - case R_SPARC_GLOB_DAT: return "R_SPARC_GLOB_DAT"; - case R_SPARC_JMP_SLOT: return "R_SPARC_JMP_SLOT"; - case R_SPARC_RELATIVE: return "R_SPARC_RELATIVE"; - case R_SPARC_UA32: return "R_SPARC_UA32"; - case R_SPARC_PLT32: return "R_SPARC_PLT32"; - case R_SPARC_HIPLT22: return "R_SPARC_HIPLT22"; - case R_SPARC_LOPLT10: return "R_SPARC_LOPLT10"; - case R_SPARC_PCPLT32: return "R_SPARC_PCPLT32"; - case R_SPARC_PCPLT22: return "R_SPARC_PCPLT22"; - case R_SPARC_PCPLT10: return "R_SPARC_PCPLT10"; - case R_SPARC_10: return "R_SPARC_10"; - case R_SPARC_11: return "R_SPARC_11"; - case R_SPARC_64: return "R_SPARC_64"; - case R_SPARC_OLO10: return "R_SPARC_OLO10"; - case R_SPARC_HH22: return "R_SPARC_HH22"; - case R_SPARC_HM10: return "R_SPARC_HM10"; - case R_SPARC_LM22: return "R_SPARC_LM22"; - case R_SPARC_PC_HH22: return "R_SPARC_PC_HH22"; - case R_SPARC_PC_HM10: return "R_SPARC_PC_HM10"; - case R_SPARC_PC_LM22: return "R_SPARC_PC_LM22"; - case R_SPARC_WDISP16: return "R_SPARC_WDISP16"; - case R_SPARC_WDISP19: return "R_SPARC_WDISP19"; - case R_SPARC_7: return "R_SPARC_7"; - case R_SPARC_5: return "R_SPARC_5"; - case 
R_SPARC_6: return "R_SPARC_6"; - case R_SPARC_DISP64: return "R_SPARC_DISP64"; - case R_SPARC_PLT64: return "R_SPARC_PLT64"; - case R_SPARC_HIX22: return "R_SPARC_HIX22"; - case R_SPARC_LOX10: return "R_SPARC_LOX10"; - case R_SPARC_H44: return "R_SPARC_H44"; - case R_SPARC_M44: return "R_SPARC_M44"; - case R_SPARC_L44: return "R_SPARC_L44"; - case R_SPARC_REGISTER: return "R_SPARC_REGISTER"; - case R_SPARC_UA64: return "R_SPARC_UA64"; - case R_SPARC_UA16: return "R_SPARC_UA16"; - case R_SPARC_TLS_GD_HI22: return "R_SPARC_TLS_GD_HI22"; - case R_SPARC_TLS_GD_LO10: return "R_SPARC_TLS_GD_LO10"; - case R_SPARC_TLS_GD_ADD: return "R_SPARC_TLS_GD_ADD"; - case R_SPARC_TLS_GD_CALL: return "R_SPARC_TLS_GD_CALL"; - case R_SPARC_TLS_LDM_HI22: return "R_SPARC_TLS_LDM_HI22"; - case R_SPARC_TLS_LDM_LO10: return "R_SPARC_TLS_LDM_LO10"; - case R_SPARC_TLS_LDM_ADD: return "R_SPARC_TLS_LDM_ADD"; - case R_SPARC_TLS_LDM_CALL: return "R_SPARC_TLS_LDM_CALL"; - case R_SPARC_TLS_LDO_HIX22: return "R_SPARC_TLS_LDO_HIX22"; - case R_SPARC_TLS_LDO_LOX10: return "R_SPARC_TLS_LDO_LOX10"; - case R_SPARC_TLS_LDO_ADD: return "R_SPARC_TLS_LDO_ADD"; - case R_SPARC_TLS_IE_HI22: return "R_SPARC_TLS_IE_HI22"; - case R_SPARC_TLS_IE_LO10: return "R_SPARC_TLS_IE_LO10"; - case R_SPARC_TLS_IE_LD: return "R_SPARC_TLS_IE_LD"; - case R_SPARC_TLS_IE_LDX: return "R_SPARC_TLS_IE_LDX"; - case R_SPARC_TLS_IE_ADD: return "R_SPARC_TLS_IE_ADD"; - case R_SPARC_TLS_LE_HIX22: return "R_SPARC_TLS_LE_HIX22"; - case R_SPARC_TLS_LE_LOX10: return "R_SPARC_TLS_LE_LOX10"; - case R_SPARC_TLS_DTPMOD32: return "R_SPARC_TLS_DTPMOD32"; - case R_SPARC_TLS_DTPMOD64: return "R_SPARC_TLS_DTPMOD64"; - case R_SPARC_TLS_DTPOFF32: return "R_SPARC_TLS_DTPOFF32"; - case R_SPARC_TLS_DTPOFF64: return "R_SPARC_TLS_DTPOFF64"; - case R_SPARC_TLS_TPOFF32: return "R_SPARC_TLS_TPOFF32"; - case R_SPARC_TLS_TPOFF64: return "R_SPARC_TLS_TPOFF64"; - case R_SPARC_GOTDATA_HIX22: return "R_SPARC_GOTDATA_HIX22"; - case R_SPARC_GOTDATA_LOX10: return 
"R_SPARC_GOTDATA_LOX10"; - case R_SPARC_GOTDATA_OP_HIX22: return "R_SPARC_GOTDATA_OP_HIX22"; - case R_SPARC_GOTDATA_OP_LOX10: return "R_SPARC_GOTDATA_OP_LOX10"; - case R_SPARC_GOTDATA_OP: return "R_SPARC_GOTDATA_OP"; - case R_SPARC_IRELATIVE: return "R_SPARC_IRELATIVE"; + CASE(R_SPARC_NONE); + CASE(R_SPARC_8); + CASE(R_SPARC_16); + CASE(R_SPARC_32); + CASE(R_SPARC_DISP8); + CASE(R_SPARC_DISP16); + CASE(R_SPARC_DISP32); + CASE(R_SPARC_WDISP30); + CASE(R_SPARC_WDISP22); + CASE(R_SPARC_HI22); + CASE(R_SPARC_22); + CASE(R_SPARC_13); + CASE(R_SPARC_LO10); + CASE(R_SPARC_GOT10); + CASE(R_SPARC_GOT13); + CASE(R_SPARC_GOT22); + CASE(R_SPARC_PC10); + CASE(R_SPARC_PC22); + CASE(R_SPARC_WPLT30); + CASE(R_SPARC_COPY); + CASE(R_SPARC_GLOB_DAT); + CASE(R_SPARC_JMP_SLOT); + CASE(R_SPARC_RELATIVE); + CASE(R_SPARC_UA32); + CASE(R_SPARC_PLT32); + CASE(R_SPARC_HIPLT22); + CASE(R_SPARC_LOPLT10); + CASE(R_SPARC_PCPLT32); + CASE(R_SPARC_PCPLT22); + CASE(R_SPARC_PCPLT10); + CASE(R_SPARC_10); + CASE(R_SPARC_11); + CASE(R_SPARC_64); + CASE(R_SPARC_OLO10); + CASE(R_SPARC_HH22); + CASE(R_SPARC_HM10); + CASE(R_SPARC_LM22); + CASE(R_SPARC_PC_HH22); + CASE(R_SPARC_PC_HM10); + CASE(R_SPARC_PC_LM22); + CASE(R_SPARC_WDISP16); + CASE(R_SPARC_WDISP19); + CASE(R_SPARC_7); + CASE(R_SPARC_5); + CASE(R_SPARC_6); + CASE(R_SPARC_DISP64); + CASE(R_SPARC_PLT64); + CASE(R_SPARC_HIX22); + CASE(R_SPARC_LOX10); + CASE(R_SPARC_H44); + CASE(R_SPARC_M44); + CASE(R_SPARC_L44); + CASE(R_SPARC_REGISTER); + CASE(R_SPARC_UA64); + CASE(R_SPARC_UA16); + CASE(R_SPARC_TLS_GD_HI22); + CASE(R_SPARC_TLS_GD_LO10); + CASE(R_SPARC_TLS_GD_ADD); + CASE(R_SPARC_TLS_GD_CALL); + CASE(R_SPARC_TLS_LDM_HI22); + CASE(R_SPARC_TLS_LDM_LO10); + CASE(R_SPARC_TLS_LDM_ADD); + CASE(R_SPARC_TLS_LDM_CALL); + CASE(R_SPARC_TLS_LDO_HIX22); + CASE(R_SPARC_TLS_LDO_LOX10); + CASE(R_SPARC_TLS_LDO_ADD); + CASE(R_SPARC_TLS_IE_HI22); + CASE(R_SPARC_TLS_IE_LO10); + CASE(R_SPARC_TLS_IE_LD); + CASE(R_SPARC_TLS_IE_LDX); + CASE(R_SPARC_TLS_IE_ADD); + 
CASE(R_SPARC_TLS_LE_HIX22); + CASE(R_SPARC_TLS_LE_LOX10); + CASE(R_SPARC_TLS_DTPMOD32); + CASE(R_SPARC_TLS_DTPMOD64); + CASE(R_SPARC_TLS_DTPOFF32); + CASE(R_SPARC_TLS_DTPOFF64); + CASE(R_SPARC_TLS_TPOFF32); + CASE(R_SPARC_TLS_TPOFF64); + CASE(R_SPARC_GOTDATA_HIX22); + CASE(R_SPARC_GOTDATA_LOX10); + CASE(R_SPARC_GOTDATA_OP_HIX22); + CASE(R_SPARC_GOTDATA_OP_LOX10); + CASE(R_SPARC_GOTDATA_OP); + CASE(R_SPARC_IRELATIVE); } return unknown_type(r_type); } @@ -732,72 +738,72 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_390_NONE: return "R_390_NONE"; - case R_390_8: return "R_390_8"; - case R_390_12: return "R_390_12"; - case R_390_16: return "R_390_16"; - case R_390_32: return "R_390_32"; - case R_390_PC32: return "R_390_PC32"; - case R_390_GOT12: return "R_390_GOT12"; - case R_390_GOT32: return "R_390_GOT32"; - case R_390_PLT32: return "R_390_PLT32"; - case R_390_COPY: return "R_390_COPY"; - case R_390_GLOB_DAT: return "R_390_GLOB_DAT"; - case R_390_JMP_SLOT: return "R_390_JMP_SLOT"; - case R_390_RELATIVE: return "R_390_RELATIVE"; - case R_390_GOTOFF32: return "R_390_GOTOFF32"; - case R_390_GOTPC: return "R_390_GOTPC"; - case R_390_GOT16: return "R_390_GOT16"; - case R_390_PC16: return "R_390_PC16"; - case R_390_PC16DBL: return "R_390_PC16DBL"; - case R_390_PLT16DBL: return "R_390_PLT16DBL"; - case R_390_PC32DBL: return "R_390_PC32DBL"; - case R_390_PLT32DBL: return "R_390_PLT32DBL"; - case R_390_GOTPCDBL: return "R_390_GOTPCDBL"; - case R_390_64: return "R_390_64"; - case R_390_PC64: return "R_390_PC64"; - case R_390_GOT64: return "R_390_GOT64"; - case R_390_PLT64: return "R_390_PLT64"; - case R_390_GOTENT: return "R_390_GOTENT"; - case R_390_GOTOFF16: return "R_390_GOTOFF16"; - case R_390_GOTOFF64: return "R_390_GOTOFF64"; - case R_390_GOTPLT12: return "R_390_GOTPLT12"; - case R_390_GOTPLT16: return "R_390_GOTPLT16"; - case R_390_GOTPLT32: return "R_390_GOTPLT32"; - case R_390_GOTPLT64: return 
"R_390_GOTPLT64"; - case R_390_GOTPLTENT: return "R_390_GOTPLTENT"; - case R_390_PLTOFF16: return "R_390_PLTOFF16"; - case R_390_PLTOFF32: return "R_390_PLTOFF32"; - case R_390_PLTOFF64: return "R_390_PLTOFF64"; - case R_390_TLS_LOAD: return "R_390_TLS_LOAD"; - case R_390_TLS_GDCALL: return "R_390_TLS_GDCALL"; - case R_390_TLS_LDCALL: return "R_390_TLS_LDCALL"; - case R_390_TLS_GD32: return "R_390_TLS_GD32"; - case R_390_TLS_GD64: return "R_390_TLS_GD64"; - case R_390_TLS_GOTIE12: return "R_390_TLS_GOTIE12"; - case R_390_TLS_GOTIE32: return "R_390_TLS_GOTIE32"; - case R_390_TLS_GOTIE64: return "R_390_TLS_GOTIE64"; - case R_390_TLS_LDM32: return "R_390_TLS_LDM32"; - case R_390_TLS_LDM64: return "R_390_TLS_LDM64"; - case R_390_TLS_IE32: return "R_390_TLS_IE32"; - case R_390_TLS_IE64: return "R_390_TLS_IE64"; - case R_390_TLS_IEENT: return "R_390_TLS_IEENT"; - case R_390_TLS_LE32: return "R_390_TLS_LE32"; - case R_390_TLS_LE64: return "R_390_TLS_LE64"; - case R_390_TLS_LDO32: return "R_390_TLS_LDO32"; - case R_390_TLS_LDO64: return "R_390_TLS_LDO64"; - case R_390_TLS_DTPMOD: return "R_390_TLS_DTPMOD"; - case R_390_TLS_DTPOFF: return "R_390_TLS_DTPOFF"; - case R_390_TLS_TPOFF: return "R_390_TLS_TPOFF"; - case R_390_20: return "R_390_20"; - case R_390_GOT20: return "R_390_GOT20"; - case R_390_GOTPLT20: return "R_390_GOTPLT20"; - case R_390_TLS_GOTIE20: return "R_390_TLS_GOTIE20"; - case R_390_IRELATIVE: return "R_390_IRELATIVE"; - case R_390_PC12DBL: return "R_390_PC12DBL"; - case R_390_PLT12DBL: return "R_390_PLT12DBL"; - case R_390_PC24DBL: return "R_390_PC24DBL"; - case R_390_PLT24DBL: return "R_390_PLT24DBL"; + CASE(R_390_NONE); + CASE(R_390_8); + CASE(R_390_12); + CASE(R_390_16); + CASE(R_390_32); + CASE(R_390_PC32); + CASE(R_390_GOT12); + CASE(R_390_GOT32); + CASE(R_390_PLT32); + CASE(R_390_COPY); + CASE(R_390_GLOB_DAT); + CASE(R_390_JMP_SLOT); + CASE(R_390_RELATIVE); + CASE(R_390_GOTOFF32); + CASE(R_390_GOTPC); + CASE(R_390_GOT16); + CASE(R_390_PC16); + 
CASE(R_390_PC16DBL); + CASE(R_390_PLT16DBL); + CASE(R_390_PC32DBL); + CASE(R_390_PLT32DBL); + CASE(R_390_GOTPCDBL); + CASE(R_390_64); + CASE(R_390_PC64); + CASE(R_390_GOT64); + CASE(R_390_PLT64); + CASE(R_390_GOTENT); + CASE(R_390_GOTOFF16); + CASE(R_390_GOTOFF64); + CASE(R_390_GOTPLT12); + CASE(R_390_GOTPLT16); + CASE(R_390_GOTPLT32); + CASE(R_390_GOTPLT64); + CASE(R_390_GOTPLTENT); + CASE(R_390_PLTOFF16); + CASE(R_390_PLTOFF32); + CASE(R_390_PLTOFF64); + CASE(R_390_TLS_LOAD); + CASE(R_390_TLS_GDCALL); + CASE(R_390_TLS_LDCALL); + CASE(R_390_TLS_GD32); + CASE(R_390_TLS_GD64); + CASE(R_390_TLS_GOTIE12); + CASE(R_390_TLS_GOTIE32); + CASE(R_390_TLS_GOTIE64); + CASE(R_390_TLS_LDM32); + CASE(R_390_TLS_LDM64); + CASE(R_390_TLS_IE32); + CASE(R_390_TLS_IE64); + CASE(R_390_TLS_IEENT); + CASE(R_390_TLS_LE32); + CASE(R_390_TLS_LE64); + CASE(R_390_TLS_LDO32); + CASE(R_390_TLS_LDO64); + CASE(R_390_TLS_DTPMOD); + CASE(R_390_TLS_DTPOFF); + CASE(R_390_TLS_TPOFF); + CASE(R_390_20); + CASE(R_390_GOT20); + CASE(R_390_GOTPLT20); + CASE(R_390_TLS_GOTIE20); + CASE(R_390_IRELATIVE); + CASE(R_390_PC12DBL); + CASE(R_390_PLT12DBL); + CASE(R_390_PC24DBL); + CASE(R_390_PLT24DBL); } return unknown_type(r_type); } @@ -805,47 +811,47 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_68K_NONE: return "R_68K_NONE"; - case R_68K_32: return "R_68K_32"; - case R_68K_16: return "R_68K_16"; - case R_68K_8: return "R_68K_8"; - case R_68K_PC32: return "R_68K_PC32"; - case R_68K_PC16: return "R_68K_PC16"; - case R_68K_PC8: return "R_68K_PC8"; - case R_68K_GOTPCREL32: return "R_68K_GOTPCREL32"; - case R_68K_GOTPCREL16: return "R_68K_GOTPCREL16"; - case R_68K_GOTPCREL8: return "R_68K_GOTPCREL8"; - case R_68K_GOTOFF32: return "R_68K_GOTOFF32"; - case R_68K_GOTOFF16: return "R_68K_GOTOFF16"; - case R_68K_GOTOFF8: return "R_68K_GOTOFF8"; - case R_68K_PLT32: return "R_68K_PLT32"; - case R_68K_PLT16: return "R_68K_PLT16"; - case R_68K_PLT8: 
return "R_68K_PLT8"; - case R_68K_PLTOFF32: return "R_68K_PLTOFF32"; - case R_68K_PLTOFF16: return "R_68K_PLTOFF16"; - case R_68K_PLTOFF8: return "R_68K_PLTOFF8"; - case R_68K_COPY: return "R_68K_COPY"; - case R_68K_GLOB_DAT: return "R_68K_GLOB_DAT"; - case R_68K_JMP_SLOT: return "R_68K_JMP_SLOT"; - case R_68K_RELATIVE: return "R_68K_RELATIVE"; - case R_68K_TLS_GD32: return "R_68K_TLS_GD32"; - case R_68K_TLS_GD16: return "R_68K_TLS_GD16"; - case R_68K_TLS_GD8: return "R_68K_TLS_GD8"; - case R_68K_TLS_LDM32: return "R_68K_TLS_LDM32"; - case R_68K_TLS_LDM16: return "R_68K_TLS_LDM16"; - case R_68K_TLS_LDM8: return "R_68K_TLS_LDM8"; - case R_68K_TLS_LDO32: return "R_68K_TLS_LDO32"; - case R_68K_TLS_LDO16: return "R_68K_TLS_LDO16"; - case R_68K_TLS_LDO8: return "R_68K_TLS_LDO8"; - case R_68K_TLS_IE32: return "R_68K_TLS_IE32"; - case R_68K_TLS_IE16: return "R_68K_TLS_IE16"; - case R_68K_TLS_IE8: return "R_68K_TLS_IE8"; - case R_68K_TLS_LE32: return "R_68K_TLS_LE32"; - case R_68K_TLS_LE16: return "R_68K_TLS_LE16"; - case R_68K_TLS_LE8: return "R_68K_TLS_LE8"; - case R_68K_TLS_DTPMOD32: return "R_68K_TLS_DTPMOD32"; - case R_68K_TLS_DTPREL32: return "R_68K_TLS_DTPREL32"; - case R_68K_TLS_TPREL32: return "R_68K_TLS_TPREL32"; + CASE(R_68K_NONE); + CASE(R_68K_32); + CASE(R_68K_16); + CASE(R_68K_8); + CASE(R_68K_PC32); + CASE(R_68K_PC16); + CASE(R_68K_PC8); + CASE(R_68K_GOTPCREL32); + CASE(R_68K_GOTPCREL16); + CASE(R_68K_GOTPCREL8); + CASE(R_68K_GOTOFF32); + CASE(R_68K_GOTOFF16); + CASE(R_68K_GOTOFF8); + CASE(R_68K_PLT32); + CASE(R_68K_PLT16); + CASE(R_68K_PLT8); + CASE(R_68K_PLTOFF32); + CASE(R_68K_PLTOFF16); + CASE(R_68K_PLTOFF8); + CASE(R_68K_COPY); + CASE(R_68K_GLOB_DAT); + CASE(R_68K_JMP_SLOT); + CASE(R_68K_RELATIVE); + CASE(R_68K_TLS_GD32); + CASE(R_68K_TLS_GD16); + CASE(R_68K_TLS_GD8); + CASE(R_68K_TLS_LDM32); + CASE(R_68K_TLS_LDM16); + CASE(R_68K_TLS_LDM8); + CASE(R_68K_TLS_LDO32); + CASE(R_68K_TLS_LDO16); + CASE(R_68K_TLS_LDO8); + CASE(R_68K_TLS_IE32); + 
CASE(R_68K_TLS_IE16); + CASE(R_68K_TLS_IE8); + CASE(R_68K_TLS_LE32); + CASE(R_68K_TLS_LE16); + CASE(R_68K_TLS_LE8); + CASE(R_68K_TLS_DTPMOD32); + CASE(R_68K_TLS_DTPREL32); + CASE(R_68K_TLS_TPREL32); } return unknown_type(r_type); } @@ -853,33 +859,33 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_SH_NONE: return "R_SH_NONE"; - case R_SH_DIR32: return "R_SH_DIR32"; - case R_SH_REL32: return "R_SH_REL32"; - case R_SH_DIR8WPN: return "R_SH_DIR8WPN"; - case R_SH_IND12W: return "R_SH_IND12W"; - case R_SH_DIR8WPL: return "R_SH_DIR8WPL"; - case R_SH_DIR8WPZ: return "R_SH_DIR8WPZ"; - case R_SH_DIR8BP: return "R_SH_DIR8BP"; - case R_SH_DIR8W: return "R_SH_DIR8W"; - case R_SH_DIR8L: return "R_SH_DIR8L"; - case R_SH_TLS_GD_32: return "R_SH_TLS_GD_32"; - case R_SH_TLS_LD_32: return "R_SH_TLS_LD_32"; - case R_SH_TLS_LDO_32: return "R_SH_TLS_LDO_32"; - case R_SH_TLS_IE_32: return "R_SH_TLS_IE_32"; - case R_SH_TLS_LE_32: return "R_SH_TLS_LE_32"; - case R_SH_TLS_DTPMOD32: return "R_SH_TLS_DTPMOD32"; - case R_SH_TLS_DTPOFF32: return "R_SH_TLS_DTPOFF32"; - case R_SH_TLS_TPOFF32: return "R_SH_TLS_TPOFF32"; - case R_SH_GOT32: return "R_SH_GOT32"; - case R_SH_PLT32: return "R_SH_PLT32"; - case R_SH_COPY: return "R_SH_COPY"; - case R_SH_GLOB_DAT: return "R_SH_GLOB_DAT"; - case R_SH_JMP_SLOT: return "R_SH_JMP_SLOT"; - case R_SH_RELATIVE: return "R_SH_RELATIVE"; - case R_SH_GOTOFF: return "R_SH_GOTOFF"; - case R_SH_GOTPC: return "R_SH_GOTPC"; - case R_SH_GOTPLT32: return "R_SH_GOTPLT32"; + CASE(R_SH_NONE); + CASE(R_SH_DIR32); + CASE(R_SH_REL32); + CASE(R_SH_DIR8WPN); + CASE(R_SH_IND12W); + CASE(R_SH_DIR8WPL); + CASE(R_SH_DIR8WPZ); + CASE(R_SH_DIR8BP); + CASE(R_SH_DIR8W); + CASE(R_SH_DIR8L); + CASE(R_SH_TLS_GD_32); + CASE(R_SH_TLS_LD_32); + CASE(R_SH_TLS_LDO_32); + CASE(R_SH_TLS_IE_32); + CASE(R_SH_TLS_LE_32); + CASE(R_SH_TLS_DTPMOD32); + CASE(R_SH_TLS_DTPOFF32); + CASE(R_SH_TLS_TPOFF32); + CASE(R_SH_GOT32); + 
CASE(R_SH_PLT32); + CASE(R_SH_COPY); + CASE(R_SH_GLOB_DAT); + CASE(R_SH_JMP_SLOT); + CASE(R_SH_RELATIVE); + CASE(R_SH_GOTOFF); + CASE(R_SH_GOTPC); + CASE(R_SH_GOTPLT32); } return unknown_type(r_type); } @@ -887,232 +893,144 @@ std::string rel_to_string(u32 r_type) { template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_ALPHA_NONE: return "R_ALPHA_NONE"; - case R_ALPHA_REFLONG: return "R_ALPHA_REFLONG"; - case R_ALPHA_REFQUAD: return "R_ALPHA_REFQUAD"; - case R_ALPHA_GPREL32: return "R_ALPHA_GPREL32"; - case R_ALPHA_LITERAL: return "R_ALPHA_LITERAL"; - case R_ALPHA_LITUSE: return "R_ALPHA_LITUSE"; - case R_ALPHA_GPDISP: return "R_ALPHA_GPDISP"; - case R_ALPHA_BRADDR: return "R_ALPHA_BRADDR"; - case R_ALPHA_HINT: return "R_ALPHA_HINT"; - case R_ALPHA_SREL16: return "R_ALPHA_SREL16"; - case R_ALPHA_SREL32: return "R_ALPHA_SREL32"; - case R_ALPHA_SREL64: return "R_ALPHA_SREL64"; - case R_ALPHA_GPRELHIGH: return "R_ALPHA_GPRELHIGH"; - case R_ALPHA_GPRELLOW: return "R_ALPHA_GPRELLOW"; - case R_ALPHA_GPREL16: return "R_ALPHA_GPREL16"; - case R_ALPHA_COPY: return "R_ALPHA_COPY"; - case R_ALPHA_GLOB_DAT: return "R_ALPHA_GLOB_DAT"; - case R_ALPHA_JMP_SLOT: return "R_ALPHA_JMP_SLOT"; - case R_ALPHA_RELATIVE: return "R_ALPHA_RELATIVE"; - case R_ALPHA_BRSGP: return "R_ALPHA_BRSGP"; - case R_ALPHA_TLSGD: return "R_ALPHA_TLSGD"; - case R_ALPHA_TLSLDM: return "R_ALPHA_TLSLDM"; - case R_ALPHA_DTPMOD64: return "R_ALPHA_DTPMOD64"; - case R_ALPHA_GOTDTPREL: return "R_ALPHA_GOTDTPREL"; - case R_ALPHA_DTPREL64: return "R_ALPHA_DTPREL64"; - case R_ALPHA_DTPRELHI: return "R_ALPHA_DTPRELHI"; - case R_ALPHA_DTPRELLO: return "R_ALPHA_DTPRELLO"; - case R_ALPHA_DTPREL16: return "R_ALPHA_DTPREL16"; - case R_ALPHA_GOTTPREL: return "R_ALPHA_GOTTPREL"; - case R_ALPHA_TPREL64: return "R_ALPHA_TPREL64"; - case R_ALPHA_TPRELHI: return "R_ALPHA_TPRELHI"; - case R_ALPHA_TPRELLO: return "R_ALPHA_TPRELLO"; - case R_ALPHA_TPREL16: return "R_ALPHA_TPREL16"; - } - return 
unknown_type(r_type); -} - -template <> -std::string rel_to_string(u32 r_type) { - switch (r_type) { - case R_MIPS_NONE: return "R_MIPS_NONE"; - case R_MIPS_16: return "R_MIPS_16"; - case R_MIPS_32: return "R_MIPS_32"; - case R_MIPS_REL32: return "R_MIPS_REL32"; - case R_MIPS_26: return "R_MIPS_26"; - case R_MIPS_HI16: return "R_MIPS_HI16"; - case R_MIPS_LO16: return "R_MIPS_LO16"; - case R_MIPS_GPREL16: return "R_MIPS_GPREL16"; - case R_MIPS_LITERAL: return "R_MIPS_LITERAL"; - case R_MIPS_GOT16: return "R_MIPS_GOT16"; - case R_MIPS_PC16: return "R_MIPS_PC16"; - case R_MIPS_CALL16: return "R_MIPS_CALL16"; - case R_MIPS_GPREL32: return "R_MIPS_GPREL32"; - case R_MIPS_UNUSED1: return "R_MIPS_UNUSED1"; - case R_MIPS_UNUSED2: return "R_MIPS_UNUSED2"; - case R_MIPS_UNUSED3: return "R_MIPS_UNUSED3"; - case R_MIPS_SHIFT5: return "R_MIPS_SHIFT5"; - case R_MIPS_SHIFT6: return "R_MIPS_SHIFT6"; - case R_MIPS_64: return "R_MIPS_64"; - case R_MIPS_GOT_DISP: return "R_MIPS_GOT_DISP"; - case R_MIPS_GOT_PAGE: return "R_MIPS_GOT_PAGE"; - case R_MIPS_GOT_OFST: return "R_MIPS_GOT_OFST"; - case R_MIPS_GOT_HI16: return "R_MIPS_GOT_HI16"; - case R_MIPS_GOT_LO16: return "R_MIPS_GOT_LO16"; - case R_MIPS_SUB: return "R_MIPS_SUB"; - case R_MIPS_INSERT_A: return "R_MIPS_INSERT_A"; - case R_MIPS_INSERT_B: return "R_MIPS_INSERT_B"; - case R_MIPS_DELETE: return "R_MIPS_DELETE"; - case R_MIPS_HIGHER: return "R_MIPS_HIGHER"; - case R_MIPS_HIGHEST: return "R_MIPS_HIGHEST"; - case R_MIPS_CALL_HI16: return "R_MIPS_CALL_HI16"; - case R_MIPS_CALL_LO16: return "R_MIPS_CALL_LO16"; - case R_MIPS_SCN_DISP: return "R_MIPS_SCN_DISP"; - case R_MIPS_REL16: return "R_MIPS_REL16"; - case R_MIPS_ADD_IMMEDIATE: return "R_MIPS_ADD_IMMEDIATE"; - case R_MIPS_PJUMP: return "R_MIPS_PJUMP"; - case R_MIPS_RELGOT: return "R_MIPS_RELGOT"; - case R_MIPS_JALR: return "R_MIPS_JALR"; - case R_MIPS_TLS_DTPMOD32: return "R_MIPS_TLS_DTPMOD32"; - case R_MIPS_TLS_DTPREL32: return "R_MIPS_TLS_DTPREL32"; - case R_MIPS_TLS_DTPMOD64: 
return "R_MIPS_TLS_DTPMOD64"; - case R_MIPS_TLS_DTPREL64: return "R_MIPS_TLS_DTPREL64"; - case R_MIPS_TLS_GD: return "R_MIPS_TLS_GD"; - case R_MIPS_TLS_LDM: return "R_MIPS_TLS_LDM"; - case R_MIPS_TLS_DTPREL_HI16: return "R_MIPS_TLS_DTPREL_HI16"; - case R_MIPS_TLS_DTPREL_LO16: return "R_MIPS_TLS_DTPREL_LO16"; - case R_MIPS_TLS_GOTTPREL: return "R_MIPS_TLS_GOTTPREL"; - case R_MIPS_TLS_TPREL32: return "R_MIPS_TLS_TPREL32"; - case R_MIPS_TLS_TPREL64: return "R_MIPS_TLS_TPREL64"; - case R_MIPS_TLS_TPREL_HI16: return "R_MIPS_TLS_TPREL_HI16"; - case R_MIPS_TLS_TPREL_LO16: return "R_MIPS_TLS_TPREL_LO16"; - case R_MIPS_GLOB_DAT: return "R_MIPS_GLOB_DAT"; - case R_MIPS_PC21_S2: return "R_MIPS_PC21_S2"; - case R_MIPS_PC26_S2: return "R_MIPS_PC26_S2"; - case R_MIPS_PC18_S3: return "R_MIPS_PC18_S3"; - case R_MIPS_PC19_S2: return "R_MIPS_PC19_S2"; - case R_MIPS_PCHI16: return "R_MIPS_PCHI16"; - case R_MIPS_PCLO16: return "R_MIPS_PCLO16"; - case R_MIPS16_26: return "R_MIPS16_26"; - case R_MIPS16_GPREL: return "R_MIPS16_GPREL"; - case R_MIPS16_GOT16: return "R_MIPS16_GOT16"; - case R_MIPS16_CALL16: return "R_MIPS16_CALL16"; - case R_MIPS16_HI16: return "R_MIPS16_HI16"; - case R_MIPS16_LO16: return "R_MIPS16_LO16"; - case R_MIPS16_TLS_GD: return "R_MIPS16_TLS_GD"; - case R_MIPS16_TLS_LDM: return "R_MIPS16_TLS_LDM"; - case R_MIPS16_TLS_DTPREL_HI16: return "R_MIPS16_TLS_DTPREL_HI16"; - case R_MIPS16_TLS_DTPREL_LO16: return "R_MIPS16_TLS_DTPREL_LO16"; - case R_MIPS16_TLS_GOTTPREL: return "R_MIPS16_TLS_GOTTPREL"; - case R_MIPS16_TLS_TPREL_HI16: return "R_MIPS16_TLS_TPREL_HI16"; - case R_MIPS16_TLS_TPREL_LO16: return "R_MIPS16_TLS_TPREL_LO16"; - case R_MIPS_COPY: return "R_MIPS_COPY"; - case R_MIPS_JUMP_SLOT: return "R_MIPS_JUMP_SLOT"; - case R_MIPS_NUM: return "R_MIPS_NUM"; - case R_MIPS_PC32: return "R_MIPS_PC32"; - case R_MIPS_EH: return "R_MIPS_EH"; + CASE(R_ALPHA_NONE); + CASE(R_ALPHA_REFLONG); + CASE(R_ALPHA_REFQUAD); + CASE(R_ALPHA_GPREL32); + CASE(R_ALPHA_LITERAL); + 
CASE(R_ALPHA_LITUSE); + CASE(R_ALPHA_GPDISP); + CASE(R_ALPHA_BRADDR); + CASE(R_ALPHA_HINT); + CASE(R_ALPHA_SREL16); + CASE(R_ALPHA_SREL32); + CASE(R_ALPHA_SREL64); + CASE(R_ALPHA_GPRELHIGH); + CASE(R_ALPHA_GPRELLOW); + CASE(R_ALPHA_GPREL16); + CASE(R_ALPHA_COPY); + CASE(R_ALPHA_GLOB_DAT); + CASE(R_ALPHA_JMP_SLOT); + CASE(R_ALPHA_RELATIVE); + CASE(R_ALPHA_BRSGP); + CASE(R_ALPHA_TLSGD); + CASE(R_ALPHA_TLSLDM); + CASE(R_ALPHA_DTPMOD64); + CASE(R_ALPHA_GOTDTPREL); + CASE(R_ALPHA_DTPREL64); + CASE(R_ALPHA_DTPRELHI); + CASE(R_ALPHA_DTPRELLO); + CASE(R_ALPHA_DTPREL16); + CASE(R_ALPHA_GOTTPREL); + CASE(R_ALPHA_TPREL64); + CASE(R_ALPHA_TPRELHI); + CASE(R_ALPHA_TPRELLO); + CASE(R_ALPHA_TPREL16); } return unknown_type(r_type); } -template <> -std::string rel_to_string(u32 r_type) { - return rel_to_string(r_type); -} - template <> std::string rel_to_string(u32 r_type) { switch (r_type) { - case R_LARCH_NONE: return "R_LARCH_NONE"; - case R_LARCH_32: return "R_LARCH_32"; - case R_LARCH_64: return "R_LARCH_64"; - case R_LARCH_RELATIVE: return "R_LARCH_RELATIVE"; - case R_LARCH_COPY: return "R_LARCH_COPY"; - case R_LARCH_JUMP_SLOT: return "R_LARCH_JUMP_SLOT"; - case R_LARCH_TLS_DTPMOD32: return "R_LARCH_TLS_DTPMOD32"; - case R_LARCH_TLS_DTPMOD64: return "R_LARCH_TLS_DTPMOD64"; - case R_LARCH_TLS_DTPREL32: return "R_LARCH_TLS_DTPREL32"; - case R_LARCH_TLS_DTPREL64: return "R_LARCH_TLS_DTPREL64"; - case R_LARCH_TLS_TPREL32: return "R_LARCH_TLS_TPREL32"; - case R_LARCH_TLS_TPREL64: return "R_LARCH_TLS_TPREL64"; - case R_LARCH_IRELATIVE: return "R_LARCH_IRELATIVE"; - case R_LARCH_MARK_LA: return "R_LARCH_MARK_LA"; - case R_LARCH_MARK_PCREL: return "R_LARCH_MARK_PCREL"; - case R_LARCH_SOP_PUSH_PCREL: return "R_LARCH_SOP_PUSH_PCREL"; - case R_LARCH_SOP_PUSH_ABSOLUTE: return "R_LARCH_SOP_PUSH_ABSOLUTE"; - case R_LARCH_SOP_PUSH_DUP: return "R_LARCH_SOP_PUSH_DUP"; - case R_LARCH_SOP_PUSH_GPREL: return "R_LARCH_SOP_PUSH_GPREL"; - case R_LARCH_SOP_PUSH_TLS_TPREL: return 
"R_LARCH_SOP_PUSH_TLS_TPREL"; - case R_LARCH_SOP_PUSH_TLS_GOT: return "R_LARCH_SOP_PUSH_TLS_GOT"; - case R_LARCH_SOP_PUSH_TLS_GD: return "R_LARCH_SOP_PUSH_TLS_GD"; - case R_LARCH_SOP_PUSH_PLT_PCREL: return "R_LARCH_SOP_PUSH_PLT_PCREL"; - case R_LARCH_SOP_ASSERT: return "R_LARCH_SOP_ASSERT"; - case R_LARCH_SOP_NOT: return "R_LARCH_SOP_NOT"; - case R_LARCH_SOP_SUB: return "R_LARCH_SOP_SUB"; - case R_LARCH_SOP_SL: return "R_LARCH_SOP_SL"; - case R_LARCH_SOP_SR: return "R_LARCH_SOP_SR"; - case R_LARCH_SOP_ADD: return "R_LARCH_SOP_ADD"; - case R_LARCH_SOP_AND: return "R_LARCH_SOP_AND"; - case R_LARCH_SOP_IF_ELSE: return "R_LARCH_SOP_IF_ELSE"; - case R_LARCH_SOP_POP_32_S_10_5: return "R_LARCH_SOP_POP_32_S_10_5"; - case R_LARCH_SOP_POP_32_U_10_12: return "R_LARCH_SOP_POP_32_U_10_12"; - case R_LARCH_SOP_POP_32_S_10_12: return "R_LARCH_SOP_POP_32_S_10_12"; - case R_LARCH_SOP_POP_32_S_10_16: return "R_LARCH_SOP_POP_32_S_10_16"; - case R_LARCH_SOP_POP_32_S_10_16_S2: return "R_LARCH_SOP_POP_32_S_10_16_S2"; - case R_LARCH_SOP_POP_32_S_5_20: return "R_LARCH_SOP_POP_32_S_5_20"; - case R_LARCH_SOP_POP_32_S_0_5_10_16_S2: return "R_LARCH_SOP_POP_32_S_0_5_10_16_S2"; - case R_LARCH_SOP_POP_32_S_0_10_10_16_S2: return "R_LARCH_SOP_POP_32_S_0_10_10_16_S2"; - case R_LARCH_SOP_POP_32_U: return "R_LARCH_SOP_POP_32_U"; - case R_LARCH_ADD8: return "R_LARCH_ADD8"; - case R_LARCH_ADD16: return "R_LARCH_ADD16"; - case R_LARCH_ADD24: return "R_LARCH_ADD24"; - case R_LARCH_ADD32: return "R_LARCH_ADD32"; - case R_LARCH_ADD64: return "R_LARCH_ADD64"; - case R_LARCH_SUB8: return "R_LARCH_SUB8"; - case R_LARCH_SUB16: return "R_LARCH_SUB16"; - case R_LARCH_SUB24: return "R_LARCH_SUB24"; - case R_LARCH_SUB32: return "R_LARCH_SUB32"; - case R_LARCH_SUB64: return "R_LARCH_SUB64"; - case R_LARCH_GNU_VTINHERIT: return "R_LARCH_GNU_VTINHERIT"; - case R_LARCH_GNU_VTENTRY: return "R_LARCH_GNU_VTENTRY"; - case R_LARCH_B16: return "R_LARCH_B16"; - case R_LARCH_B21: return "R_LARCH_B21"; - case R_LARCH_B26: 
return "R_LARCH_B26"; - case R_LARCH_ABS_HI20: return "R_LARCH_ABS_HI20"; - case R_LARCH_ABS_LO12: return "R_LARCH_ABS_LO12"; - case R_LARCH_ABS64_LO20: return "R_LARCH_ABS64_LO20"; - case R_LARCH_ABS64_HI12: return "R_LARCH_ABS64_HI12"; - case R_LARCH_PCALA_HI20: return "R_LARCH_PCALA_HI20"; - case R_LARCH_PCALA_LO12: return "R_LARCH_PCALA_LO12"; - case R_LARCH_PCALA64_LO20: return "R_LARCH_PCALA64_LO20"; - case R_LARCH_PCALA64_HI12: return "R_LARCH_PCALA64_HI12"; - case R_LARCH_GOT_PC_HI20: return "R_LARCH_GOT_PC_HI20"; - case R_LARCH_GOT_PC_LO12: return "R_LARCH_GOT_PC_LO12"; - case R_LARCH_GOT64_PC_LO20: return "R_LARCH_GOT64_PC_LO20"; - case R_LARCH_GOT64_PC_HI12: return "R_LARCH_GOT64_PC_HI12"; - case R_LARCH_GOT_HI20: return "R_LARCH_GOT_HI20"; - case R_LARCH_GOT_LO12: return "R_LARCH_GOT_LO12"; - case R_LARCH_GOT64_LO20: return "R_LARCH_GOT64_LO20"; - case R_LARCH_GOT64_HI12: return "R_LARCH_GOT64_HI12"; - case R_LARCH_TLS_LE_HI20: return "R_LARCH_TLS_LE_HI20"; - case R_LARCH_TLS_LE_LO12: return "R_LARCH_TLS_LE_LO12"; - case R_LARCH_TLS_LE64_LO20: return "R_LARCH_TLS_LE64_LO20"; - case R_LARCH_TLS_LE64_HI12: return "R_LARCH_TLS_LE64_HI12"; - case R_LARCH_TLS_IE_PC_HI20: return "R_LARCH_TLS_IE_PC_HI20"; - case R_LARCH_TLS_IE_PC_LO12: return "R_LARCH_TLS_IE_PC_LO12"; - case R_LARCH_TLS_IE64_PC_LO20: return "R_LARCH_TLS_IE64_PC_LO20"; - case R_LARCH_TLS_IE64_PC_HI12: return "R_LARCH_TLS_IE64_PC_HI12"; - case R_LARCH_TLS_IE_HI20: return "R_LARCH_TLS_IE_HI20"; - case R_LARCH_TLS_IE_LO12: return "R_LARCH_TLS_IE_LO12"; - case R_LARCH_TLS_IE64_LO20: return "R_LARCH_TLS_IE64_LO20"; - case R_LARCH_TLS_IE64_HI12: return "R_LARCH_TLS_IE64_HI12"; - case R_LARCH_TLS_LD_PC_HI20: return "R_LARCH_TLS_LD_PC_HI20"; - case R_LARCH_TLS_LD_HI20: return "R_LARCH_TLS_LD_HI20"; - case R_LARCH_TLS_GD_PC_HI20: return "R_LARCH_TLS_GD_PC_HI20"; - case R_LARCH_TLS_GD_HI20: return "R_LARCH_TLS_GD_HI20"; - case R_LARCH_32_PCREL: return "R_LARCH_32_PCREL"; - case R_LARCH_RELAX: return 
"R_LARCH_RELAX"; - case R_LARCH_DELETE: return "R_LARCH_DELETE"; - case R_LARCH_ALIGN: return "R_LARCH_ALIGN"; - case R_LARCH_PCREL20_S2: return "R_LARCH_PCREL20_S2"; - case R_LARCH_CFA: return "R_LARCH_CFA"; - case R_LARCH_ADD6: return "R_LARCH_ADD6"; - case R_LARCH_SUB6: return "R_LARCH_SUB6"; - case R_LARCH_ADD_ULEB128: return "R_LARCH_ADD_ULEB128"; - case R_LARCH_SUB_ULEB128: return "R_LARCH_SUB_ULEB128"; - case R_LARCH_64_PCREL: return "R_LARCH_64_PCREL"; + CASE(R_LARCH_NONE); + CASE(R_LARCH_32); + CASE(R_LARCH_64); + CASE(R_LARCH_RELATIVE); + CASE(R_LARCH_COPY); + CASE(R_LARCH_JUMP_SLOT); + CASE(R_LARCH_TLS_DTPMOD32); + CASE(R_LARCH_TLS_DTPMOD64); + CASE(R_LARCH_TLS_DTPREL32); + CASE(R_LARCH_TLS_DTPREL64); + CASE(R_LARCH_TLS_TPREL32); + CASE(R_LARCH_TLS_TPREL64); + CASE(R_LARCH_IRELATIVE); + CASE(R_LARCH_MARK_LA); + CASE(R_LARCH_MARK_PCREL); + CASE(R_LARCH_SOP_PUSH_PCREL); + CASE(R_LARCH_SOP_PUSH_ABSOLUTE); + CASE(R_LARCH_SOP_PUSH_DUP); + CASE(R_LARCH_SOP_PUSH_GPREL); + CASE(R_LARCH_SOP_PUSH_TLS_TPREL); + CASE(R_LARCH_SOP_PUSH_TLS_GOT); + CASE(R_LARCH_SOP_PUSH_TLS_GD); + CASE(R_LARCH_SOP_PUSH_PLT_PCREL); + CASE(R_LARCH_SOP_ASSERT); + CASE(R_LARCH_SOP_NOT); + CASE(R_LARCH_SOP_SUB); + CASE(R_LARCH_SOP_SL); + CASE(R_LARCH_SOP_SR); + CASE(R_LARCH_SOP_ADD); + CASE(R_LARCH_SOP_AND); + CASE(R_LARCH_SOP_IF_ELSE); + CASE(R_LARCH_SOP_POP_32_S_10_5); + CASE(R_LARCH_SOP_POP_32_U_10_12); + CASE(R_LARCH_SOP_POP_32_S_10_12); + CASE(R_LARCH_SOP_POP_32_S_10_16); + CASE(R_LARCH_SOP_POP_32_S_10_16_S2); + CASE(R_LARCH_SOP_POP_32_S_5_20); + CASE(R_LARCH_SOP_POP_32_S_0_5_10_16_S2); + CASE(R_LARCH_SOP_POP_32_S_0_10_10_16_S2); + CASE(R_LARCH_SOP_POP_32_U); + CASE(R_LARCH_ADD8); + CASE(R_LARCH_ADD16); + CASE(R_LARCH_ADD24); + CASE(R_LARCH_ADD32); + CASE(R_LARCH_ADD64); + CASE(R_LARCH_SUB8); + CASE(R_LARCH_SUB16); + CASE(R_LARCH_SUB24); + CASE(R_LARCH_SUB32); + CASE(R_LARCH_SUB64); + CASE(R_LARCH_GNU_VTINHERIT); + CASE(R_LARCH_GNU_VTENTRY); + CASE(R_LARCH_B16); + CASE(R_LARCH_B21); + 
CASE(R_LARCH_B26); + CASE(R_LARCH_ABS_HI20); + CASE(R_LARCH_ABS_LO12); + CASE(R_LARCH_ABS64_LO20); + CASE(R_LARCH_ABS64_HI12); + CASE(R_LARCH_PCALA_HI20); + CASE(R_LARCH_PCALA_LO12); + CASE(R_LARCH_PCALA64_LO20); + CASE(R_LARCH_PCALA64_HI12); + CASE(R_LARCH_GOT_PC_HI20); + CASE(R_LARCH_GOT_PC_LO12); + CASE(R_LARCH_GOT64_PC_LO20); + CASE(R_LARCH_GOT64_PC_HI12); + CASE(R_LARCH_GOT_HI20); + CASE(R_LARCH_GOT_LO12); + CASE(R_LARCH_GOT64_LO20); + CASE(R_LARCH_GOT64_HI12); + CASE(R_LARCH_TLS_LE_HI20); + CASE(R_LARCH_TLS_LE_LO12); + CASE(R_LARCH_TLS_LE64_LO20); + CASE(R_LARCH_TLS_LE64_HI12); + CASE(R_LARCH_TLS_IE_PC_HI20); + CASE(R_LARCH_TLS_IE_PC_LO12); + CASE(R_LARCH_TLS_IE64_PC_LO20); + CASE(R_LARCH_TLS_IE64_PC_HI12); + CASE(R_LARCH_TLS_IE_HI20); + CASE(R_LARCH_TLS_IE_LO12); + CASE(R_LARCH_TLS_IE64_LO20); + CASE(R_LARCH_TLS_IE64_HI12); + CASE(R_LARCH_TLS_LD_PC_HI20); + CASE(R_LARCH_TLS_LD_HI20); + CASE(R_LARCH_TLS_GD_PC_HI20); + CASE(R_LARCH_TLS_GD_HI20); + CASE(R_LARCH_32_PCREL); + CASE(R_LARCH_RELAX); + CASE(R_LARCH_DELETE); + CASE(R_LARCH_ALIGN); + CASE(R_LARCH_PCREL20_S2); + CASE(R_LARCH_CFA); + CASE(R_LARCH_ADD6); + CASE(R_LARCH_SUB6); + CASE(R_LARCH_ADD_ULEB128); + CASE(R_LARCH_SUB_ULEB128); + CASE(R_LARCH_64_PCREL); } return unknown_type(r_type); } diff --git a/elf/elf.h b/elf/elf.h index 12ece2bb..128dd0dc 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -2,6 +2,7 @@ #include "../common/integers.h" +#include #include #include #include @@ -24,8 +25,6 @@ struct SPARC64; struct M68K; struct SH4; struct ALPHA; -struct MIPS64LE; -struct MIPS64BE; struct LOONGARCH64; struct LOONGARCH32; @@ -91,7 +90,6 @@ enum : u32 { SHT_ARM_EXIDX = 0x70000001, SHT_ARM_ATTRIBUTES = 0x70000003, SHT_RISCV_ATTRIBUTES = 0x70000003, - SHT_MIPS_ABIFLAGS = 0x7000002a, }; enum : u32 { @@ -106,7 +104,6 @@ enum : u32 { SHF_TLS = 0x400, SHF_COMPRESSED = 0x800, SHF_GNU_RETAIN = 0x200000, - SHF_MIPS_GPREL = 0x10000000, SHF_EXCLUDE = 0x80000000, }; @@ -191,7 +188,6 @@ enum : u32 { PT_OPENBSD_RANDOMIZE = 
0x65a3dbe6, PT_ARM_EXIDX = 0x70000001, PT_RISCV_ATTRIBUTES = 0x70000003, - PT_MIPS_ABIFLAGS = 0x70000003, }; enum : u32 { @@ -226,7 +222,6 @@ enum : u32 { EM_NONE = 0, EM_386 = 3, EM_68K = 4, - EM_MIPS = 8, EM_PPC = 20, EM_PPC64 = 21, EM_S390X = 22, @@ -296,13 +291,6 @@ enum : u32 { DT_VERNEEDNUM = 0x6fffffff, DT_PPC_GOT = 0x70000000, DT_PPC64_GLINK = 0x70000000, - DT_MIPS_RLD_VERSION = 0x70000001, - DT_MIPS_FLAGS = 0x70000005, - DT_MIPS_BASE_ADDRESS = 0x70000006, - DT_MIPS_LOCAL_GOTNO = 0x7000000a, - DT_MIPS_SYMTABNO = 0x70000011, - DT_MIPS_GOTSYM = 0x70000013, - DT_MIPS_OPTIONS = 0x70000029, DT_AUXILIARY = 0x7ffffffd, DT_FILTER = 0x7fffffff, }; @@ -756,6 +744,7 @@ enum : u32 { R_RISCV_TLS_DTPREL64 = 9, R_RISCV_TLS_TPREL32 = 10, R_RISCV_TLS_TPREL64 = 11, + R_RISCV_TLSDESC = 12, R_RISCV_BRANCH = 16, R_RISCV_JAL = 17, R_RISCV_CALL = 18, @@ -796,6 +785,10 @@ enum : u32 { R_RISCV_PLT32 = 59, R_RISCV_SET_ULEB128 = 60, R_RISCV_SUB_ULEB128 = 61, + R_RISCV_TLSDESC_HI20 = 62, + R_RISCV_TLSDESC_LOAD_LO12 = 63, + R_RISCV_TLSDESC_ADD_LO12 = 64, + R_RISCV_TLSDESC_CALL = 65, }; enum : u32 { @@ -1260,85 +1253,6 @@ enum : u32 { R_ALPHA_TPREL16 = 41, }; -enum : u32 { - R_MIPS_NONE = 0, - R_MIPS_16 = 1, - R_MIPS_32 = 2, - R_MIPS_REL32 = 3, - R_MIPS_26 = 4, - R_MIPS_HI16 = 5, - R_MIPS_LO16 = 6, - R_MIPS_GPREL16 = 7, - R_MIPS_LITERAL = 8, - R_MIPS_GOT16 = 9, - R_MIPS_PC16 = 10, - R_MIPS_CALL16 = 11, - R_MIPS_GPREL32 = 12, - R_MIPS_UNUSED1 = 13, - R_MIPS_UNUSED2 = 14, - R_MIPS_UNUSED3 = 15, - R_MIPS_SHIFT5 = 16, - R_MIPS_SHIFT6 = 17, - R_MIPS_64 = 18, - R_MIPS_GOT_DISP = 19, - R_MIPS_GOT_PAGE = 20, - R_MIPS_GOT_OFST = 21, - R_MIPS_GOT_HI16 = 22, - R_MIPS_GOT_LO16 = 23, - R_MIPS_SUB = 24, - R_MIPS_INSERT_A = 25, - R_MIPS_INSERT_B = 26, - R_MIPS_DELETE = 27, - R_MIPS_HIGHER = 28, - R_MIPS_HIGHEST = 29, - R_MIPS_CALL_HI16 = 30, - R_MIPS_CALL_LO16 = 31, - R_MIPS_SCN_DISP = 32, - R_MIPS_REL16 = 33, - R_MIPS_ADD_IMMEDIATE = 34, - R_MIPS_PJUMP = 35, - R_MIPS_RELGOT = 36, - R_MIPS_JALR = 37, 
- R_MIPS_TLS_DTPMOD32 = 38, - R_MIPS_TLS_DTPREL32 = 39, - R_MIPS_TLS_DTPMOD64 = 40, - R_MIPS_TLS_DTPREL64 = 41, - R_MIPS_TLS_GD = 42, - R_MIPS_TLS_LDM = 43, - R_MIPS_TLS_DTPREL_HI16 = 44, - R_MIPS_TLS_DTPREL_LO16 = 45, - R_MIPS_TLS_GOTTPREL = 46, - R_MIPS_TLS_TPREL32 = 47, - R_MIPS_TLS_TPREL64 = 48, - R_MIPS_TLS_TPREL_HI16 = 49, - R_MIPS_TLS_TPREL_LO16 = 50, - R_MIPS_GLOB_DAT = 51, - R_MIPS_PC21_S2 = 60, - R_MIPS_PC26_S2 = 61, - R_MIPS_PC18_S3 = 62, - R_MIPS_PC19_S2 = 63, - R_MIPS_PCHI16 = 64, - R_MIPS_PCLO16 = 65, - R_MIPS16_26 = 100, - R_MIPS16_GPREL = 101, - R_MIPS16_GOT16 = 102, - R_MIPS16_CALL16 = 103, - R_MIPS16_HI16 = 104, - R_MIPS16_LO16 = 105, - R_MIPS16_TLS_GD = 106, - R_MIPS16_TLS_LDM = 107, - R_MIPS16_TLS_DTPREL_HI16 = 108, - R_MIPS16_TLS_DTPREL_LO16 = 109, - R_MIPS16_TLS_GOTTPREL = 110, - R_MIPS16_TLS_TPREL_HI16 = 111, - R_MIPS16_TLS_TPREL_LO16 = 112, - R_MIPS_COPY = 126, - R_MIPS_JUMP_SLOT = 127, - R_MIPS_NUM = 218, - R_MIPS_PC32 = 248, - R_MIPS_EH = 249, -}; - enum : u32 { R_LARCH_NONE = 0, R_LARCH_32 = 1, @@ -1864,86 +1778,42 @@ struct ElfRel { ib64 r_addend; }; -template <> -struct ElfRel { - ElfRel() = default; - ElfRel(u64 offset, u32 type, u32 sym, i64 addend) - : r_offset(offset), r_sym(sym), r_type(type), r_addend(addend) {} - - // In the little-endian MIPS64, r_sym and r_type are reversed, with - // r_type being stored in big-endian order. It's extremely odd though. 
- ul64 r_offset; - ul32 r_sym; - ub32 r_type; - il64 r_addend; -}; - -// .MIPS.options section -template -struct MipsOptions { - u8 kind; - u8 size; - U16 section; - U32 info; -}; - -template -struct MipsRegInfo { - U32 ri_gprmask; - U32 ri_pad; - U32 ri_cprmask[4]; - U64 ri_gp_value; -}; - -enum : u32 { - ODK_REGINFO = 1, -}; - // // Machine descriptions // -template -static constexpr bool supports_ifunc = requires { E::R_IRELATIVE; }; - -template -static constexpr bool supports_tlsdesc = requires { E::R_TLSDESC; }; - -template -static constexpr bool needs_thunk = requires { E::thunk_size; }; - -template static constexpr bool is_x86_64 = std::is_same_v; -template static constexpr bool is_i386 = std::is_same_v; -template static constexpr bool is_arm64 = std::is_same_v; -template static constexpr bool is_arm32 = std::is_same_v; -template static constexpr bool is_rv64le = std::is_same_v; -template static constexpr bool is_rv64be = std::is_same_v; -template static constexpr bool is_rv32le = std::is_same_v; -template static constexpr bool is_rv32be = std::is_same_v; -template static constexpr bool is_ppc32 = std::is_same_v; -template static constexpr bool is_ppc64v1 = std::is_same_v; -template static constexpr bool is_ppc64v2 = std::is_same_v; -template static constexpr bool is_s390x = std::is_same_v; -template static constexpr bool is_sparc64 = std::is_same_v; -template static constexpr bool is_m68k = std::is_same_v; -template static constexpr bool is_sh4 = std::is_same_v; -template static constexpr bool is_alpha = std::is_same_v; -template static constexpr bool is_mips64le = std::is_same_v; -template static constexpr bool is_mips64be = std::is_same_v; -template static constexpr bool is_loongarch64 = std::is_same_v; -template static constexpr bool is_loongarch32 = std::is_same_v; - -template static constexpr bool is_x86 = is_x86_64 || is_i386; -template static constexpr bool is_arm = is_arm64 || is_arm32; -template static constexpr bool is_rv64 = is_rv64le || 
is_rv64be; -template static constexpr bool is_rv32 = is_rv32le || is_rv32be; -template static constexpr bool is_riscv = is_rv64 || is_rv32; -template static constexpr bool is_ppc64 = is_ppc64v1 || is_ppc64v2; -template static constexpr bool is_ppc = is_ppc64 || is_ppc32; -template static constexpr bool is_sparc = is_sparc64; -template static constexpr bool is_mips64 = is_mips64le || is_mips64be; -template static constexpr bool is_mips = is_mips64; -template static constexpr bool is_loongarch = is_loongarch64 || is_loongarch32; +template concept supports_ifunc = requires { E::R_IRELATIVE; }; +template concept supports_tlsdesc = requires { E::R_TLSDESC; }; +template concept needs_thunk = requires { E::thunk_size; }; + +template concept is_x86_64 = std::same_as; +template concept is_i386 = std::same_as; +template concept is_arm64 = std::same_as; +template concept is_arm32 = std::same_as; +template concept is_rv64le = std::same_as; +template concept is_rv64be = std::same_as; +template concept is_rv32le = std::same_as; +template concept is_rv32be = std::same_as; +template concept is_ppc32 = std::same_as; +template concept is_ppc64v1 = std::same_as; +template concept is_ppc64v2 = std::same_as; +template concept is_s390x = std::same_as; +template concept is_sparc64 = std::same_as; +template concept is_m68k = std::same_as; +template concept is_sh4 = std::same_as; +template concept is_alpha = std::same_as; +template concept is_loongarch64 = std::same_as; +template concept is_loongarch32 = std::same_as; + +template concept is_x86 = is_x86_64 || is_i386; +template concept is_arm = is_arm64 || is_arm32; +template concept is_rv64 = is_rv64le || is_rv64be; +template concept is_rv32 = is_rv32le || is_rv32be; +template concept is_riscv = is_rv64 || is_rv32; +template concept is_ppc64 = is_ppc64v1 || is_ppc64v2; +template concept is_ppc = is_ppc64 || is_ppc32; +template concept is_sparc = is_sparc64; +template concept is_loongarch = is_loongarch64 || is_loongarch32; struct X86_64 { 
static constexpr std::string_view target_name = "x86_64"; @@ -2004,7 +1874,7 @@ struct ARM64 { static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u32 thunk_hdr_size = 0; - static constexpr u32 thunk_size = 12; + static constexpr u32 thunk_size = 16; static constexpr u32 R_COPY = R_AARCH64_COPY; static constexpr u32 R_GLOB_DAT = R_AARCH64_GLOB_DAT; @@ -2029,8 +1899,8 @@ struct ARM32 { static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; - static constexpr u32 thunk_hdr_size = 12; - static constexpr u32 thunk_size = 20; + static constexpr u32 thunk_hdr_size = 16; + static constexpr u32 thunk_size = 16; static constexpr u32 R_COPY = R_ARM_COPY; static constexpr u32 R_GLOB_DAT = R_ARM_GLOB_DAT; @@ -2045,10 +1915,8 @@ struct ARM32 { static constexpr u32 R_TLSDESC = R_ARM_TLS_DESC; }; -struct RV64LE { - static constexpr std::string_view target_name = "riscv64"; +struct RV64 { static constexpr bool is_64 = true; - static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; @@ -2066,35 +1934,21 @@ struct RV64LE { static constexpr u32 R_DTPOFF = R_RISCV_TLS_DTPREL64; static constexpr u32 R_TPOFF = R_RISCV_TLS_TPREL64; static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD64; + static constexpr u32 R_TLSDESC = R_RISCV_TLSDESC; +}; + +struct RV64LE : RV64 { + static constexpr std::string_view target_name = "riscv64"; + static constexpr bool is_le = true; }; -struct RV64BE { +struct RV64BE : RV64 { static constexpr std::string_view target_name = "riscv64be"; - static constexpr bool is_64 = true; static constexpr bool is_le = false; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 4096; - static constexpr u32 e_machine = EM_RISCV; - static constexpr u32 plt_hdr_size = 32; - static constexpr u32 plt_size = 16; - static constexpr u32 pltgot_size = 16; - - static 
constexpr u32 R_COPY = R_RISCV_COPY; - static constexpr u32 R_GLOB_DAT = R_RISCV_64; - static constexpr u32 R_JUMP_SLOT = R_RISCV_JUMP_SLOT; - static constexpr u32 R_ABS = R_RISCV_64; - static constexpr u32 R_DYNAMIC = R_RISCV_64; - static constexpr u32 R_RELATIVE = R_RISCV_RELATIVE; - static constexpr u32 R_IRELATIVE = R_RISCV_IRELATIVE; - static constexpr u32 R_DTPOFF = R_RISCV_TLS_DTPREL64; - static constexpr u32 R_TPOFF = R_RISCV_TLS_TPREL64; - static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD64; }; -struct RV32LE { - static constexpr std::string_view target_name = "riscv32"; +struct RV32 { static constexpr bool is_64 = false; - static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; @@ -2112,29 +1966,17 @@ struct RV32LE { static constexpr u32 R_DTPOFF = R_RISCV_TLS_DTPREL32; static constexpr u32 R_TPOFF = R_RISCV_TLS_TPREL32; static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD32; + static constexpr u32 R_TLSDESC = R_RISCV_TLSDESC; +}; + +struct RV32LE : RV32 { + static constexpr std::string_view target_name = "riscv32"; + static constexpr bool is_le = true; }; -struct RV32BE { +struct RV32BE : RV32 { static constexpr std::string_view target_name = "riscv32be"; - static constexpr bool is_64 = false; static constexpr bool is_le = false; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 4096; - static constexpr u32 e_machine = EM_RISCV; - static constexpr u32 plt_hdr_size = 32; - static constexpr u32 plt_size = 16; - static constexpr u32 pltgot_size = 16; - - static constexpr u32 R_COPY = R_RISCV_COPY; - static constexpr u32 R_GLOB_DAT = R_RISCV_32; - static constexpr u32 R_JUMP_SLOT = R_RISCV_JUMP_SLOT; - static constexpr u32 R_ABS = R_RISCV_32; - static constexpr u32 R_DYNAMIC = R_RISCV_32; - static constexpr u32 R_RELATIVE = R_RISCV_RELATIVE; - static constexpr u32 R_IRELATIVE = R_RISCV_IRELATIVE; - static constexpr u32 R_DTPOFF = 
R_RISCV_TLS_DTPREL32; - static constexpr u32 R_TPOFF = R_RISCV_TLS_TPREL32; - static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD32; }; struct PPC32 { @@ -2162,17 +2004,11 @@ struct PPC32 { static constexpr u32 R_DTPMOD = R_PPC_DTPMOD32; }; -struct PPC64V1 { - static constexpr std::string_view target_name = "ppc64v1"; +struct PPC64 { static constexpr bool is_64 = true; - static constexpr bool is_le = false; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_PPC64; - static constexpr u32 plt_hdr_size = 52; - static constexpr u32 pltgot_size = 0; - static constexpr u32 thunk_hdr_size = 0; - static constexpr u32 thunk_size = 28; static constexpr u32 R_COPY = R_PPC64_COPY; static constexpr u32 R_GLOB_DAT = R_PPC64_GLOB_DAT; @@ -2186,29 +2022,23 @@ struct PPC64V1 { static constexpr u32 R_DTPMOD = R_PPC64_DTPMOD64; }; -struct PPC64V2 { +struct PPC64V1 : PPC64 { + static constexpr std::string_view target_name = "ppc64v1"; + static constexpr bool is_le = false; + static constexpr u32 plt_hdr_size = 44; + static constexpr u32 pltgot_size = 0; + static constexpr u32 thunk_hdr_size = 0; + static constexpr u32 thunk_size = 28; +}; + +struct PPC64V2 : PPC64 { static constexpr std::string_view target_name = "ppc64v2"; - static constexpr bool is_64 = true; static constexpr bool is_le = true; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 65536; - static constexpr u32 e_machine = EM_PPC64; - static constexpr u32 plt_hdr_size = 60; + static constexpr u32 plt_hdr_size = 52; static constexpr u32 plt_size = 4; static constexpr u32 pltgot_size = 0; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 20; - - static constexpr u32 R_COPY = R_PPC64_COPY; - static constexpr u32 R_GLOB_DAT = R_PPC64_GLOB_DAT; - static constexpr u32 R_JUMP_SLOT = R_PPC64_JMP_SLOT; - static constexpr u32 R_ABS = R_PPC64_ADDR64; - static constexpr u32 R_DYNAMIC = R_PPC64_ADDR64; - static constexpr 
u32 R_RELATIVE = R_PPC64_RELATIVE; - static constexpr u32 R_IRELATIVE = R_PPC64_IRELATIVE; - static constexpr u32 R_DTPOFF = R_PPC64_DTPREL64; - static constexpr u32 R_TPOFF = R_PPC64_TPREL64; - static constexpr u32 R_DTPMOD = R_PPC64_DTPMOD64; }; struct S390X { @@ -2218,8 +2048,8 @@ struct S390X { static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_S390X; - static constexpr u32 plt_hdr_size = 32; - static constexpr u32 plt_size = 32; + static constexpr u32 plt_hdr_size = 48; + static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u32 R_COPY = R_390_COPY; @@ -2323,50 +2153,6 @@ struct ALPHA { static constexpr u32 R_DTPMOD = R_ALPHA_DTPMOD64; }; -struct MIPS64LE { - static constexpr std::string_view target_name = "mips64le"; - static constexpr bool is_64 = true; - static constexpr bool is_le = true; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 4096; - static constexpr u32 e_machine = EM_MIPS; - static constexpr u32 plt_hdr_size = 0; - static constexpr u32 plt_size = 0; - static constexpr u32 pltgot_size = 0; - - static constexpr u32 R_COPY = R_MIPS_COPY; - static constexpr u32 R_GLOB_DAT = R_MIPS_GLOB_DAT | (R_MIPS_64 << 8); - static constexpr u32 R_JUMP_SLOT = R_MIPS_JUMP_SLOT; - static constexpr u32 R_ABS = R_MIPS_64; - static constexpr u32 R_DYNAMIC = R_MIPS_REL32 | (R_MIPS_64 << 8); - static constexpr u32 R_RELATIVE = R_MIPS_REL32 | (R_MIPS_64 << 8); - static constexpr u32 R_DTPOFF = R_MIPS_TLS_DTPREL64; - static constexpr u32 R_TPOFF = R_MIPS_TLS_TPREL64; - static constexpr u32 R_DTPMOD = R_MIPS_TLS_DTPMOD64; -}; - -struct MIPS64BE { - static constexpr std::string_view target_name = "mips64"; - static constexpr bool is_64 = true; - static constexpr bool is_le = false; - static constexpr bool is_rela = true; - static constexpr u32 page_size = 4096; - static constexpr u32 e_machine = EM_MIPS; - static constexpr u32 plt_hdr_size = 0; - 
static constexpr u32 plt_size = 0; - static constexpr u32 pltgot_size = 0; - - static constexpr u32 R_COPY = R_MIPS_COPY; - static constexpr u32 R_GLOB_DAT = R_MIPS_GLOB_DAT | (R_MIPS_64 << 8); - static constexpr u32 R_JUMP_SLOT = R_MIPS_JUMP_SLOT; - static constexpr u32 R_ABS = R_MIPS_64; - static constexpr u32 R_DYNAMIC = R_MIPS_REL32 | (R_MIPS_64 << 8); - static constexpr u32 R_RELATIVE = R_MIPS_REL32 | (R_MIPS_64 << 8); - static constexpr u32 R_DTPOFF = R_MIPS_TLS_DTPREL64; - static constexpr u32 R_TPOFF = R_MIPS_TLS_TPREL64; - static constexpr u32 R_DTPMOD = R_MIPS_TLS_DTPMOD64; -}; - struct LOONGARCH64 { static constexpr std::string_view target_name = "loongarch64"; static constexpr bool is_64 = true; diff --git a/elf/icf.cc b/elf/icf.cc index 6cdd8bcb..72773f42 100644 --- a/elf/icf.cc +++ b/elf/icf.cc @@ -65,7 +65,7 @@ // conditions. #include "mold.h" -#include "../common/sha.h" +#include "blake3.h" #include #include @@ -81,7 +81,7 @@ static constexpr int64_t HASH_SIZE = 16; typedef std::array Digest; namespace std { -template<> struct hash { +template <> struct hash { size_t operator()(const Digest &k) const { return *(int64_t *)&k[0]; } @@ -132,9 +132,11 @@ static bool is_eligible(Context &ctx, InputSection &isec) { !is_init && !is_fini && !is_enumerable && !is_addr_taken; } -static Digest digest_final(SHA256Hash &sha) { - u8 buf[SHA256_SIZE]; - sha.finish(buf); +static Digest digest_final(blake3_hasher *hasher) { + assert(HASH_SIZE <= BLAKE3_OUT_LEN); + + u8 buf[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(hasher, buf, BLAKE3_OUT_LEN); Digest digest; memcpy(digest.data(), buf, HASH_SIZE); @@ -234,15 +236,16 @@ static void merge_leaf_nodes(Context &ctx) { template static Digest compute_digest(Context &ctx, InputSection &isec) { - SHA256Hash sha; + blake3_hasher hasher; + blake3_hasher_init(&hasher); auto hash = [&](auto val) { - sha.update((u8 *)&val, sizeof(val)); + blake3_hasher_update(&hasher, (u8 *)&val, sizeof(val)); }; auto hash_string = 
[&](std::string_view str) { hash(str.size()); - sha.update((u8 *)str.data(), str.size()); + blake3_hasher_update(&hasher, (u8 *)str.data(), str.size()); }; auto hash_symbol = [&](Symbol &sym) { @@ -298,7 +301,7 @@ static Digest compute_digest(Context &ctx, InputSection &isec) { hash_symbol(*isec.file.symbols[rel.r_sym]); } - return digest_final(sha); + return digest_final(&hasher); } template @@ -411,16 +414,17 @@ static i64 propagate(std::span> digests, if (converged.get(i)) return; - SHA256Hash sha; - sha.update(digests[2][i].data(), HASH_SIZE); + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, digests[2][i].data(), HASH_SIZE); i64 begin = edge_indices[i]; i64 end = (i + 1 == num_digests) ? edges.size() : edge_indices[i + 1]; for (i64 j : edges.subspan(begin, end - begin)) - sha.update(digests[slot][j].data(), HASH_SIZE); + blake3_hasher_update(&hasher, digests[slot][j].data(), HASH_SIZE); - digests[!slot][i] = digest_final(sha); + digests[!slot][i] = digest_final(&hasher); if (digests[slot][i] == digests[!slot][i]) { // This node has converged. Skip further iterations as it will @@ -563,7 +567,7 @@ void icf_sections(Context &ctx) { } } - // Group sections by SHA digest. + // Group sections by BLAKE3 digest. 
{ Timer t(ctx, "group"); diff --git a/elf/input-files.cc b/elf/input-files.cc index 4c217856..48974c28 100644 --- a/elf/input-files.cc +++ b/elf/input-files.cc @@ -104,7 +104,7 @@ static bool is_debug_section(const ElfShdr &shdr, std::string_view name) { template void -ObjectFile::read_note_gnu_property(Context &ctx, const ElfShdr &shdr) { +ObjectFile::parse_note_gnu_property(Context &ctx, const ElfShdr &shdr) { std::string_view data = this->get_string(ctx, shdr); while (!data.empty()) { @@ -140,19 +140,11 @@ ObjectFile::read_note_gnu_property(Context &ctx, const ElfShdr &shdr) { } } -static inline std::string_view read_string(std::string_view &str) { - i64 pos = str.find_first_of('\0'); - std::string_view val = str.substr(0, pos); - str = str.substr(pos + 1); - return val; -} - // // [ "vendor-name" *]+ ]* template static void read_riscv_attributes(Context &ctx, ObjectFile &file, std::string_view data) { - const char *begin = data.data(); if (data.empty()) Fatal(ctx) << file << ": corrupted .riscv.attributes section"; @@ -183,9 +175,12 @@ static void read_riscv_attributes(Context &ctx, ObjectFile &file, case ELF_TAG_RISCV_STACK_ALIGN: file.extra.stack_align = read_uleb(&p); break; - case ELF_TAG_RISCV_ARCH: - file.extra.arch = read_string(p); + case ELF_TAG_RISCV_ARCH: { + i64 pos = p.find_first_of('\0'); + file.extra.arch = p.substr(0, pos); + p = p.substr(pos + 1); break; + } case ELF_TAG_RISCV_UNALIGNED_ACCESS: file.extra.unaligned_access = read_uleb(&p); break; @@ -196,26 +191,6 @@ static void read_riscv_attributes(Context &ctx, ObjectFile &file, } } -template -static u64 read_mips_gp0(Context &ctx, InputSection &isec) { - std::string_view data = isec.contents; - while (!data.empty()) { - if (data.size() < sizeof(MipsOptions)) - Fatal(ctx) << isec << ": corrupted .MIPS.options section"; - - MipsOptions *opt = (MipsOptions *)data.data(); - if (opt->kind == ODK_REGINFO) { - if (data.size() < sizeof(MipsOptions) + sizeof(MipsRegInfo)) - Fatal(ctx) << isec << ": 
corrupted .MIPS.options section"; - MipsRegInfo *info = (MipsRegInfo *)(opt + 1); - return info->ri_gp_value; - } - - data = data.substr(opt->size); - } - return 0; -} - template void ObjectFile::initialize_sections(Context &ctx) { // Read sections @@ -295,11 +270,8 @@ void ObjectFile::initialize_sections(Context &ctx) { // area in GNU linkers. We ignore that section because silently // making the stack area executable is too dangerous. Tell our // users about the difference if that matters. - // - // MIPS object files don't contain .note.GNU-stack for some reason, - // so ignore this error on MIPS. if (name == ".note.GNU-stack" && !ctx.arg.relocatable) { - if ((shdr.sh_flags & SHF_EXECINSTR) && !is_mips) { + if (shdr.sh_flags & SHF_EXECINSTR) { if (!ctx.arg.z_execstack && !ctx.arg.z_execstack_if_needed) Warn(ctx) << *this << ": this file may cause a segmentation" " fault because it requires an executable stack. See" @@ -311,7 +283,7 @@ void ObjectFile::initialize_sections(Context &ctx) { } if (name == ".note.gnu.property") { - read_note_gnu_property(ctx, shdr); + parse_note_gnu_property(ctx, shdr); continue; } @@ -350,20 +322,13 @@ void ObjectFile::initialize_sections(Context &ctx) { continue; } + if (name == ".eh_frame") + eh_frame_section = this->sections[i].get(); + if constexpr (is_ppc32) if (name == ".got2") extra.got2 = this->sections[i].get(); - if constexpr (is_mips) { - if (name == ".MIPS.abiflags") { - extra.abi_flags = std::move(this->sections[i]); - continue; - } - - if (name == ".MIPS.options") - extra.gp0 = read_mips_gp0(ctx, *this->sections[i]); - } - // Save debug sections for --gdb-index. 
if (ctx.arg.gdb_index) { InputSection *isec = this->sections[i].get(); @@ -423,16 +388,6 @@ void ObjectFile::initialize_sections(Context &ctx) { } } -template -void ObjectFile::initialize_ehframe_sections(Context &ctx) { - for (i64 i = 0; i < sections.size(); i++) { - std::unique_ptr> &isec = sections[i]; - if (isec && isec->is_alive && isec->name() == ".eh_frame") { - read_ehframe(ctx, *isec); - } - } -} - // .eh_frame contains data records explaining how to handle exceptions. // When an exception is thrown, the runtime searches a record from // .eh_frame with the current program counter as a key. A record that @@ -463,7 +418,11 @@ void ObjectFile::initialize_ehframe_sections(Context &ctx) { // // This function parses an input .eh_frame section. template -void ObjectFile::read_ehframe(Context &ctx, InputSection &isec) { +void ObjectFile::parse_ehframe(Context &ctx) { + if (!eh_frame_section) + return; + + InputSection &isec = *eh_frame_section; std::span> rels = isec.get_rels(ctx); i64 cies_begin = cies.size(); i64 fdes_begin = fdes.size(); @@ -953,7 +912,7 @@ void ObjectFile::parse(Context &ctx) { initialize_sections(ctx); initialize_symbols(ctx); sort_relocations(ctx); - initialize_ehframe_sections(ctx); + parse_ehframe(ctx); } // Symbols with higher priorities overwrites symbols with lower priorities. 
@@ -1109,11 +1068,10 @@ void ObjectFile::scan_relocations(Context &ctx) { for (ElfRel &rel : cie.get_rels()) { Symbol &sym = *this->symbols[rel.r_sym]; - if constexpr (!is_mips) - if (ctx.arg.pic && rel.r_type == E::R_ABS) - Error(ctx) << *this << ": relocation " << rel << " in .eh_frame can" - << " not be used when making a position-independent output;" - << " recompile with -fPIE or -fPIC"; + if (ctx.arg.pic && rel.r_type == E::R_ABS) + Error(ctx) << *this << ": relocation " << rel << " in .eh_frame can" + << " not be used when making a position-independent output;" + << " recompile with -fPIE or -fPIC"; if (sym.is_imported) { if (sym.get_type() != STT_FUNC) @@ -1226,9 +1184,6 @@ void ObjectFile::compute_symtab_size(Context &ctx) { this->output_sym_indices.resize(this->elf_syms.size(), -1); auto is_alive = [&](Symbol &sym) -> bool { - if (!ctx.arg.gc_sections) - return true; - if (SectionFragment *frag = sym.get_frag()) return frag->is_alive; if (InputSection *isec = sym.get_input_section()) @@ -1274,30 +1229,29 @@ void ObjectFile::populate_symtab(Context &ctx) { u8 *strtab_base = ctx.buf + ctx.strtab->shdr.sh_offset; i64 strtab_off = this->strtab_offset; - auto write_sym = [&](Symbol &sym, i64 &symtab_idx) { + auto write_sym = [&](Symbol &sym, i64 idx) { U32 *xindex = nullptr; if (ctx.symtab_shndx) - xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset) + symtab_idx; + xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset) + idx; - symtab_base[symtab_idx++] = to_output_esym(ctx, sym, strtab_off, xindex); + symtab_base[idx] = to_output_esym(ctx, sym, strtab_off, xindex); strtab_off += write_string(strtab_base + strtab_off, sym.name()); }; i64 local_symtab_idx = this->local_symtab_idx; i64 global_symtab_idx = this->global_symtab_idx; - for (i64 i = 1; i < this->first_global; i++) { - Symbol &sym = *this->symbols[i]; - if (sym.write_to_symtab) - write_sym(sym, local_symtab_idx); - } + + for (i64 i = 1; i < this->first_global; i++) + if (Symbol &sym = 
*this->symbols[i]; sym.write_to_symtab) + write_sym(sym, local_symtab_idx++); for (i64 i = this->first_global; i < this->elf_syms.size(); i++) { Symbol &sym = *this->symbols[i]; if (sym.file == this && sym.write_to_symtab) { if (sym.is_local(ctx)) - write_sym(sym, local_symtab_idx); + write_sym(sym, local_symtab_idx++); else - write_sym(sym, global_symtab_idx); + write_sym(sym, global_symtab_idx++); } } } @@ -1328,7 +1282,6 @@ SharedFile::create(Context &ctx, MappedFile> *mf) { template SharedFile::SharedFile(Context &ctx, MappedFile> *mf) : InputFile(ctx, mf) { - this->is_needed = ctx.as_needed; this->is_alive = !ctx.as_needed; } @@ -1483,7 +1436,7 @@ SharedFile::mark_live_objects(Context &ctx, if (sym.is_traced) print_trace_symbol(ctx, *this, esym, sym); - if (esym.is_undef() && !esym.is_weak() && sym.file && + if (esym.is_undef() && !esym.is_weak() && sym.file && !sym.file->is_dso && !sym.file->is_alive.test_and_set()) { feeder(sym.file); @@ -1540,7 +1493,8 @@ bool SharedFile::is_readonly(Symbol *sym) { u64 val = sym->esym().st_value; for (ElfPhdr &phdr : this->get_phdrs()) - if (phdr.p_type == PT_LOAD && !(phdr.p_flags & PF_W) && + if ((phdr.p_type == PT_LOAD || phdr.p_type == PT_GNU_RELRO) && + !(phdr.p_flags & PF_W) && phdr.p_vaddr <= val && val < phdr.p_vaddr + phdr.p_memsz) return true; return false; diff --git a/elf/input-sections.cc b/elf/input-sections.cc index 7bcf03bb..0780d94d 100644 --- a/elf/input-sections.cc +++ b/elf/input-sections.cc @@ -150,40 +150,67 @@ static void scan_rel(Context &ctx, InputSection &isec, Symbol &sym, case NONE: break; case ERROR: + // Print out the "recompile with -fPIC" error message. error(); break; case COPYREL: + // Create a copy relocation. if (!ctx.arg.z_copyreloc) error(); copyrel(); break; case DYN_COPYREL: + // Same as COPYREL but try to avoid creating a copy relocation by + // creating a dynamic relocation instead if the relocation is in + // a writable section. 
+ // + // GHC (Glasgow Haskell Compiler) places a small amount of data in + // .text before each function and access that data with a fixed + // offset. The function breaks if we copy-relocate the data. For such + // programs, we should avoid copy relocations if possible. + // + // Besides GHC, copy relocation is a hacky solution, so if we can + // represent a relocation either with copyrel or dynrel, we prefer + // dynamic relocation. if (writable || !ctx.arg.z_copyreloc) dynrel(); else copyrel(); break; case PLT: + // Create a PLT entry. sym.flags |= NEEDS_PLT; break; case CPLT: + // Create a canonical PLT entry. sym.flags |= NEEDS_CPLT; break; case DYN_CPLT: + // Same as CPLT but try to avoid creating a canonical PLT creating by + // creating a dynamic relocation instead if the relocation is in a + // writable section. The motivation behind it is hte same as DYN_COPYREL. if (writable) dynrel(); else sym.flags |= NEEDS_CPLT; break; case DYNREL: + // Create a dynamic relocation. dynrel(); break; case BASEREL: + // Create a base relocation. check_textrel(); if (!isec.is_relr_reloc(ctx, rel)) isec.file.num_dynrel++; break; case IFUNC: + // Create an IRELATIVE relocation for a GNU ifunc symbol. + // + // We usually create an IRELATIVE relocation in .got for each ifunc. + // However, if a statically-initialized pointer is initialized to an + // ifunc's address, we have no choice other than emitting an IRELATIVE + // relocation for each such pointer. dynrel(); ctx.num_ifunc_dynrels++; break; @@ -230,9 +257,9 @@ static Action get_pcrel_action(Context &ctx, Symbol &sym) { template static Action get_absrel_action(Context &ctx, Symbol &sym) { // This is a decision table for absolute relocations that is smaller - // than the word size (e.g. R_X86_64_32). Since the dynamic linker + // than the pointer size (e.g. R_X86_64_32). 
Since the dynamic linker // generally does not support dynamic relocations smaller than the - // word size, we need to report an error if a relocation cannot be + // pointer size, we need to report an error if a relocation cannot be // resolved at link-time. static Action table[3][4] = { // Absolute Local Imported data Imported code @@ -249,7 +276,7 @@ static Action get_dyn_absrel_action(Context &ctx, Symbol &sym) { if (sym.is_ifunc()) return IFUNC; - // This is a decision table for absolute relocations for the word + // This is a decision table for absolute relocations for the pointer // size data (e.g. R_X86_64_64). Unlike the absrel_table, we can emit // a dynamic relocation if we cannot resolve an address at link-time. static Action table[3][4] = { @@ -305,6 +332,29 @@ void InputSection::scan_toc_rel(Context &ctx, Symbol &sym, scan_rel(ctx, *this, sym, rel, get_ppc64_toc_action(ctx, sym)); } +template +void InputSection::scan_tlsdesc(Context &ctx, Symbol &sym) { + if (ctx.arg.is_static || + (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { + // Relax TLSDESC to Local Exec. In this case, we directly materialize + // a TP-relative offset, so no dynamic relocation is needed. + // + // TLSDESC relocs must always be relaxed for statically-linked + // executables even if -no-relax is given. It is because a + // statically-linked executable doesn't contain a trampoline + // function needed for TLSDESC. + } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { + // In this condition, TP-relative offset of a thread-local variable + // is known at process startup time, so we can relax TLSDESC to the + // code that reads the TP-relative offset from GOT and add TP to it. + sym.flags |= NEEDS_GOTTP; + } else { + // If no relaxation is doable, we simply create a TLSDESC dynamic + // relocation. 
+ sym.flags |= NEEDS_TLSDESC; + } +} + template void InputSection::check_tlsle(Context &ctx, Symbol &sym, const ElfRel &rel) { diff --git a/elf/linker-script.cc b/elf/linker-script.cc index 44e95a6c..4bdc19e7 100644 --- a/elf/linker-script.cc +++ b/elf/linker-script.cc @@ -279,9 +279,8 @@ static bool read_label(std::span &tok, template static void read_version_script_commands(Context &ctx, std::span &tok, - std::string_view ver_str, u16 ver_idx, bool is_cpp) { - bool is_global = true; - + std::string_view ver_str, u16 ver_idx, + bool is_global, bool is_cpp) { while (!tok.empty() && tok[0] != "}") { if (read_label(tok, "global")) { is_global = true; @@ -299,11 +298,11 @@ read_version_script_commands(Context &ctx, std::span &tok, if (!tok.empty() && tok[0] == "\"C\"") { tok = tok.subspan(1); tok = skip(ctx, tok, "{"); - read_version_script_commands( ctx, tok, ver_str, ver_idx, false); + read_version_script_commands( ctx, tok, ver_str, ver_idx, is_global, false); } else { tok = skip(ctx, tok, "\"C++\""); tok = skip(ctx, tok, "{"); - read_version_script_commands(ctx, tok, ver_str, ver_idx, true); + read_version_script_commands(ctx, tok, ver_str, ver_idx, is_global, true); } tok = skip(ctx, tok, "}"); @@ -349,7 +348,7 @@ void read_version_script(Context &ctx, std::span &tok) { } tok = skip(ctx, tok, "{"); - read_version_script_commands(ctx, tok, ver_str, ver_idx, false); + read_version_script_commands(ctx, tok, ver_str, ver_idx, true, false); tok = skip(ctx, tok, "}"); if (!tok.empty() && tok[0] != ";") tok = tok.subspan(1); diff --git a/elf/lto-unix.cc b/elf/lto-unix.cc index 0820abd7..dde07a8f 100644 --- a/elf/lto-unix.cc +++ b/elf/lto-unix.cc @@ -709,6 +709,9 @@ std::vector *> do_lto(Context &ctx) { get_symbol(ctx, y)->referenced_by_regular_obj = true; } + // Handle --entry + get_symbol(ctx, ctx.arg.entry)->referenced_by_regular_obj = true; + // all_symbols_read_hook() calls add_input_file() and add_input_library() LOG << "all symbols read\n"; if (PluginStatus st = 
all_symbols_read_hook(); st != LDPS_OK) diff --git a/elf/lto.h b/elf/lto.h index 218909db..489a06fa 100644 --- a/elf/lto.h +++ b/elf/lto.h @@ -55,9 +55,7 @@ enum PluginApiVersion { struct PluginTagValue { PluginTagValue(PluginTag tag, int val) : tag(tag), val(val) {} - - template - PluginTagValue(PluginTag tag, T *ptr) : tag(tag), ptr((void *)ptr) {} + PluginTagValue(PluginTag tag, auto *ptr) : tag(tag), ptr((void *)ptr) {} PluginTag tag; union { diff --git a/elf/main.cc b/elf/main.cc index 90ceed9d..8827a8cd 100644 --- a/elf/main.cc +++ b/elf/main.cc @@ -70,10 +70,6 @@ std::string_view get_machine_type(Context &ctx, MappedFile> *mf) { return SH4::target_name; case EM_ALPHA: return ALPHA::target_name; - case EM_MIPS: - if (is_64) - return is_le ? MIPS64LE::target_name : MIPS64BE::target_name; - return ""; case EM_LOONGARCH: return is_64 ? LOONGARCH64::target_name : LOONGARCH32::target_name; default: @@ -345,7 +341,6 @@ static void read_input_files(Context &ctx, std::span args) { // Since elf_main is a template, we can't run it without a type parameter. // We speculatively run elf_main with X86_64, and if the speculation was // wrong, re-run it with an actual machine type. -template static int redo_main(int argc, char **argv, std::string_view target) { if (target == I386::target_name) return elf_main(argc, argv); @@ -377,10 +372,6 @@ static int redo_main(int argc, char **argv, std::string_view target) { return elf_main(argc, argv); if (target == ALPHA::target_name) return elf_main(argc, argv); - if (target == MIPS64LE::target_name) - return elf_main(argc, argv); - if (target == MIPS64BE::target_name) - return elf_main(argc, argv); if (target == LOONGARCH32::target_name) return elf_main(argc, argv); if (target == LOONGARCH64::target_name) @@ -411,7 +402,7 @@ int elf_main(int argc, char **argv) { // Redo if -m is not x86-64. 
if constexpr (is_x86_64) if (ctx.arg.emulation != X86_64::target_name) - return redo_main(argc, argv, ctx.arg.emulation); + return redo_main(argc, argv, ctx.arg.emulation); Timer t_all(ctx, "all"); @@ -755,11 +746,6 @@ int elf_main(int argc, char **argv) { return 0; } -using E = MOLD_TARGET; - -template void read_file(Context &, MappedFile> *); -template MappedFile> *open_library(Context &, std::string); - #ifdef MOLD_X86_64 extern template int elf_main(int, char **); @@ -777,8 +763,6 @@ extern template int elf_main(int, char **); extern template int elf_main(int, char **); extern template int elf_main(int, char **); extern template int elf_main(int, char **); -extern template int elf_main(int, char **); -extern template int elf_main(int, char **); extern template int elf_main(int, char **); extern template int elf_main(int, char **); @@ -788,6 +772,8 @@ int main(int argc, char **argv) { #else +using E = MOLD_TARGET; + template int elf_main(int, char **); #endif diff --git a/elf/mold.h b/elf/mold.h index c11d700e..e101270c 100644 --- a/elf/mold.h +++ b/elf/mold.h @@ -36,8 +36,6 @@ namespace mold::elf { -static constexpr i32 SHA256_SIZE = 32; - template class InputFile; template class InputSection; template class MergedSection; @@ -50,7 +48,6 @@ template struct CieRecord; template struct Context; template struct FdeRecord; template class RelocSection; -template class MipsGotSection; template std::ostream &operator<<(std::ostream &out, const Symbol &sym); @@ -100,7 +97,7 @@ struct SymbolAux : SymbolAux { template class RangeExtensionThunk {}; -template requires needs_thunk +template class RangeExtensionThunk { public: RangeExtensionThunk(OutputSection &osec, i64 offset) @@ -127,9 +124,6 @@ struct RangeExtensionRef { i16 sym_idx = -1; }; -template -void create_range_extension_thunks(Context &ctx, OutputSection &osec); - // // input-sections.cc // @@ -216,12 +210,12 @@ struct FdeRecord { template struct InputSectionExtras {}; -template requires needs_thunk +template 
struct InputSectionExtras { std::vector range_extn; }; -template requires is_riscv +template struct InputSectionExtras { std::vector r_deltas; }; @@ -296,7 +290,7 @@ class InputSection { void scan_absrel(Context &ctx, Symbol &sym, const ElfRel &rel); void scan_dyn_absrel(Context &ctx, Symbol &sym, const ElfRel &rel); void scan_toc_rel(Context &ctx, Symbol &sym, const ElfRel &rel); - + void scan_tlsdesc(Context &ctx, Symbol &sym); void check_tlsle(Context &ctx, Symbol &sym, const ElfRel &rel); void apply_dyn_absrel(Context &ctx, Symbol &sym, const ElfRel &rel, @@ -319,9 +313,9 @@ class InputSection { // tls.cc // -template u64 get_tls_begin(Context &); -template u64 get_tp_addr(Context &); -template u64 get_dtp_addr(Context &); +template u64 get_tls_begin(Context &); +template u64 get_tp_addr(Context &); +template u64 get_dtp_addr(Context &); // // output-chunks.cc @@ -355,6 +349,7 @@ class Chunk { virtual ChunkKind kind() { return SYNTHETIC; } virtual OutputSection *to_osec() { return nullptr; } virtual i64 get_reldyn_size(Context &ctx) const { return 0; } + virtual void construct_relr(Context &ctx) {} virtual void copy_buf(Context &ctx) {} virtual void write_to(Context &ctx, u8 *buf) { unreachable(); } virtual void update_shdr(Context &ctx) {} @@ -384,6 +379,9 @@ class Chunk { // For --section-order i64 sect_order = 0; + + // For --pack-dyn-relocs=relr + std::vector relr; }; // ELF header @@ -449,20 +447,19 @@ class InterpSection : public Chunk { template class OutputSection : public Chunk { public: - OutputSection(std::string_view name, u32 type, u64 flags); + OutputSection(Context &ctx, std::string_view name, u32 type, u64 flags); ChunkKind kind() override { return OUTPUT_SECTION; } OutputSection *to_osec() override { return this; } + void construct_relr(Context &ctx) override; void copy_buf(Context &ctx) override; void write_to(Context &ctx, u8 *buf) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; - 
std::vector *> members; - - void construct_relr(Context &ctx); - std::vector relr; + void create_range_extension_thunks(Context &ctx); + std::vector *> members; std::vector>> thunks; std::unique_ptr> reloc_sec; }; @@ -478,7 +475,7 @@ class GotSection : public Chunk { this->shdr.sh_addralign = sizeof(Word); // We always create a .got so that _GLOBAL_OFFSET_TABLE_ has - // something to point to. s390x/MIPS psABIs define GOT[1] as a + // something to point to. s390x psABI define GOT[1] as a // reserved slot, so we allocate one more for them. this->shdr.sh_size = (is_s390x ? 2 : 1) * sizeof(Word); } @@ -494,6 +491,7 @@ class GotSection : public Chunk { i64 get_reldyn_size(Context &ctx) const override; void copy_buf(Context &ctx) override; + void construct_relr(Context &ctx) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; @@ -502,9 +500,6 @@ class GotSection : public Chunk { std::vector *> tlsdesc_syms; std::vector *> gottp_syms; u32 tlsld_idx = -1; - - void construct_relr(Context &ctx); - std::vector relr; }; template @@ -648,7 +643,6 @@ class DynstrSection : public Chunk { this->shdr.sh_flags = SHF_ALLOC; } - void keep() { this->shdr.sh_size = 1; } i64 add_string(std::string_view str); i64 find_string(std::string_view str); void copy_buf(Context &ctx) override; @@ -675,7 +669,7 @@ class DynamicSection : public Chunk { void copy_buf(Context &ctx) override; }; -template +template ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, U32 *shndx); @@ -715,13 +709,13 @@ class DynsymSection : public Chunk { this->shdr.sh_addralign = sizeof(Word); } - void keep() { this->symbols.resize(1); } void add_symbol(Context &ctx, Symbol *sym); void finalize(Context &ctx); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector *> symbols; + bool finalized = false; }; template @@ -1133,11 +1127,11 @@ class InputFile { virtual ~InputFile() = default; - template std::span - 
get_data(Context &ctx, const ElfShdr &shdr); + template + std::span get_data(Context &ctx, const ElfShdr &shdr); - template std::span - get_data(Context &ctx, i64 idx); + template + std::span get_data(Context &ctx, i64 idx); std::string_view get_string(Context &ctx, const ElfShdr &shdr); std::string_view get_string(Context &ctx, i64 idx); @@ -1188,24 +1182,18 @@ class InputFile { template struct ObjectFileExtras {}; -template requires is_riscv +template struct ObjectFileExtras { std::optional stack_align; std::optional arch; bool unaligned_access = false; }; -template <> struct ObjectFileExtras { +template <> +struct ObjectFileExtras { InputSection *got2 = nullptr; }; -template requires is_mips -struct ObjectFileExtras { - std::unique_ptr> abi_flags; - MipsGotSection *got = nullptr; - u64 gp0 = 0; -}; - // ObjectFile represents an input .o file. template class ObjectFile : public InputFile { @@ -1240,6 +1228,7 @@ class ObjectFile : public InputFile { std::vector> fdes; BitVector has_symver; std::vector> comdat_groups; + InputSection *eh_frame_section = nullptr; bool exclude_libs = false; std::map gnu_properties; bool is_lto_obj = false; @@ -1285,8 +1274,8 @@ class ObjectFile : public InputFile { void initialize_symbols(Context &ctx); void sort_relocations(Context &ctx); void initialize_ehframe_sections(Context &ctx); - void read_note_gnu_property(Context &ctx, const ElfShdr &shdr); - void read_ehframe(Context &ctx, InputSection &isec); + void parse_note_gnu_property(Context &ctx, const ElfShdr &shdr); + void parse_ehframe(Context &ctx); void override_symbol(Context &ctx, Symbol &sym, const ElfSym &esym, i64 symidx); void merge_visibility(Context &ctx, Symbol &sym, u8 visibility); @@ -1315,7 +1304,6 @@ class SharedFile : public InputFile { void compute_symtab_size(Context &ctx); void populate_symtab(Context &ctx); - bool is_needed = false; std::string soname; std::vector version_strings; std::vector> elf_syms2; @@ -1411,7 +1399,7 @@ template void 
acquire_global_lock(Context &ctx); template void release_global_lock(Context &ctx); // -// commandline.cc +// cmdline.cc // template @@ -1465,13 +1453,14 @@ template void show_stats(Context &); // arch-arm32.cc // +template <> u64 get_eflags(Context &ctx); void fixup_arm_exidx_section(Context &ctx); // -// arch-riscv64.cc +// arch-riscv.cc // -template requires is_riscv +template class RiscvAttributesSection : public Chunk { public: RiscvAttributesSection() { @@ -1485,7 +1474,10 @@ class RiscvAttributesSection : public Chunk { std::vector contents; }; -template +template +u64 get_eflags(Context &ctx); + +template i64 riscv_resize_sections(Context &ctx); // @@ -1513,6 +1505,12 @@ class PPC64OpdSection : public Chunk { std::vector *> symbols; }; +// +// arch-ppc64v2.cc +// + +template <> u64 get_eflags(Context &ctx); + // // arch-sparc.cc // @@ -1561,86 +1559,6 @@ class AlphaGotSection : public Chunk { std::mutex mu; }; -// -// arch-mips64.cc -// - -template -class MipsQuickstartSection : public Chunk { -public: - MipsQuickstartSection() { - this->name = ".mips_quickstart"; - this->is_relro = true; - this->shdr.sh_type = SHT_PROGBITS; - this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL; - this->shdr.sh_addralign = 8; - } - - static constexpr i64 NUM_RESERVED = 2; - - void update_shdr(Context &ctx) override; - void copy_buf(Context &ctx) override; -}; - -template -class MipsGotSection : public Chunk { -public: - MipsGotSection(Context &ctx, const ObjectFile &file) { - this->name = save_string(ctx, ".mips_got." 
+ std::to_string(file.priority)); - this->is_relro = true; - this->shdr.sh_type = SHT_PROGBITS; - this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL; - this->shdr.sh_addralign = 8; - } - - u64 get_got_addr(Context &ctx, Symbol &sym, i64 addend) const; - u64 get_gotpage_addr(Context &ctx, Symbol &sym, i64 addend) const; - u64 get_tlsgd_addr(Context &ctx, Symbol &sym) const; - u64 get_gottp_addr(Context &ctx, Symbol &sym) const; - u64 get_tlsld_addr(Context &ctx) const; - - void update_shdr(Context &ctx) override; - i64 get_reldyn_size(Context &ctx) const override; - void copy_buf(Context &ctx) override; - - struct SymbolAddend { - bool operator==(const SymbolAddend &) const = default; - bool operator<(const SymbolAddend &) const; - u64 get_addr(Context &ctx, i64 flags = 0) const; - - Symbol *sym; - i64 addend; - }; - - std::vector got_syms; - std::vector gotpage_syms; - std::vector *> tlsgd_syms; - std::vector *> gottp_syms; - bool has_tlsld = false; -}; - -template -class MipsABIFlagsSection : public Chunk { -public: - MipsABIFlagsSection() { - this->name = ".MIPS.abiflags"; - this->shdr.sh_type = SHT_MIPS_ABIFLAGS; - this->shdr.sh_flags = SHF_ALLOC; - this->shdr.sh_addralign = 8; - } - - std::string_view contents; - - void update_shdr(Context &ctx) override; - void copy_buf(Context &ctx) override; -}; - -template -void mips_merge_got_sections(Context &ctx); - -template -void mips_rewrite_cie(Context &ctx, u8 *buf, CieRecord &cie); - // // main.cc // @@ -1694,42 +1612,42 @@ struct SectionOrder { }; // Target-specific context members -template struct ContextExtras {}; +template +struct ContextExtras {}; -template requires is_riscv +template struct ContextExtras { RiscvAttributesSection *riscv_attributes = nullptr; }; -template <> struct ContextExtras { +template <> +struct ContextExtras { Symbol *_SDA_BASE_ = nullptr; }; -template <> struct ContextExtras { +template <> +struct ContextExtras { PPC64OpdSection *opd = nullptr; Symbol *TOC = nullptr; }; 
-template <> struct ContextExtras { +template <> +struct ContextExtras { Symbol *TOC = nullptr; Atomic is_power10 = false; }; -template <> struct ContextExtras { +template <> +struct ContextExtras { SparcTlsGetAddrSection *tls_get_addr_sec = nullptr; Symbol *tls_get_addr_sym = nullptr; }; -template <> struct ContextExtras { +template <> +struct ContextExtras { AlphaGotSection *got = nullptr; }; -template requires is_mips -struct ContextExtras { - MipsQuickstartSection *quickstart = nullptr; - MipsABIFlagsSection *abi_flags = nullptr; -}; - // Context represents a context object for each invocation of the linker. // It contains command line flags, pointers to singleton objects // (such as linker-synthesized output sections), unique_ptrs for @@ -1772,7 +1690,7 @@ struct Context { bool fork = true; bool gc_sections = false; bool gdb_index = false; - bool hash_style_gnu = !is_mips; + bool hash_style_gnu = true; bool hash_style_sysv = true; bool icf = false; bool icf_all = false; @@ -1839,7 +1757,7 @@ struct Context { std::string dependency_file; std::string directory; std::string dynamic_linker; - std::string entry = is_mips ? "__start" : "_start"; + std::string entry = "_start"; std::string fini = "_fini"; std::string init = "_init"; std::string output = "a.out"; @@ -1869,7 +1787,7 @@ struct Context { std::vector version_patterns; u16 default_version = VER_NDX_GLOBAL; - i64 page_size = -1; + i64 page_size = E::page_size; std::optional global_lock_fd; // true if default_version is set by a wildcard in version script. 
@@ -1921,9 +1839,9 @@ struct Context { bool overwrite_output_file = true; std::vector *> chunks; - std::atomic_bool needs_tlsld = false; - std::atomic_bool has_textrel = false; - std::atomic_uint32_t num_ifunc_dynrels = 0; + Atomic needs_tlsld = false; + Atomic has_textrel = false; + Atomic num_ifunc_dynrels = 0; tbb::concurrent_hash_map> undef_errors; @@ -2000,7 +1918,6 @@ struct Context { Symbol *_edata = nullptr; Symbol *_end = nullptr; Symbol *_etext = nullptr; - Symbol *_gp = nullptr; Symbol *edata = nullptr; Symbol *end = nullptr; Symbol *etext = nullptr; @@ -2045,7 +1962,7 @@ enum { template struct SymbolExtras {}; -template requires needs_thunk +template struct SymbolExtras { // For range extension thunks i16 thunk_idx = -1; @@ -2113,6 +2030,10 @@ class Symbol { bool is_ifunc() const { return get_type() == STT_GNU_IFUNC; } bool is_remaining_undef_weak() const; + bool is_pcrel_linktime_const(Context &ctx) const; + bool is_tprel_linktime_const(Context &ctx) const; + bool is_tprel_runtime_const(Context &ctx) const; + InputSection *get_input_section() const; Chunk *get_output_section() const; SectionFragment *get_frag() const; @@ -2484,7 +2405,7 @@ InputSection::is_relr_reloc(Context &ctx, const ElfRel &rel) const { (rel.r_offset % sizeof(Word)) == 0; } -template +template inline bool InputSection::is_killed_by_icf() const { return this->leader && this->leader != this; } @@ -2839,11 +2760,34 @@ inline bool Symbol::is_local(Context &ctx) const { // A remaining weak undefined symbol is promoted to a dynamic symbol // in DSO and resolved to 0 in an executable. This function returns // true if it's latter. -template +template inline bool Symbol::is_remaining_undef_weak() const { return !is_imported && esym().is_undef_weak(); } +// Returns true if the symbol's PC-relative address is known at link-time. 
+template +inline bool Symbol::is_pcrel_linktime_const(Context &ctx) const { + return !is_imported && !is_ifunc() && (is_relative() || !ctx.arg.pic); +} + +// Returns true if the symbol's Thread Pointer-relative address is +// known at link-time. +template +inline bool Symbol::is_tprel_linktime_const(Context &ctx) const { + assert(get_type() == STT_TLS); + return !ctx.arg.shared && !is_imported; +} + +// Returns true if the symbol's Thread Pointer-relative address is +// known at load-time. +template +inline bool Symbol::is_tprel_runtime_const(Context &ctx) const { + // Returns true unless we are creating a dlopen'able DSO. + assert(get_type() == STT_TLS); + return !(ctx.arg.shared && ctx.arg.z_dlopen); +} + template inline InputSection *Symbol::get_input_section() const { if ((origin & TAG_MASK) == TAG_ISEC) @@ -2928,9 +2872,8 @@ inline std::string_view Symbol::name() const { template inline void Symbol::add_aux(Context &ctx) { if (aux_idx == -1) { - i64 sz = ctx.symbol_aux.size(); - aux_idx = sz; - ctx.symbol_aux.resize(sz + 1); + aux_idx = ctx.symbol_aux.size(); + ctx.symbol_aux.resize(aux_idx + 1); } } @@ -2954,15 +2897,4 @@ inline bool is_c_identifier(std::string_view s) { return true; } -template -inline bool relax_tlsdesc(Context &ctx, Symbol &sym) { - // TLSDESC relocs must be always relaxed for statically-linked - // executables even if -no-relax is given. It is because a - // statically-linked executable doesn't contain a tranpoline - // function needed for TLSDESC. 
- if (ctx.arg.is_static) - return true; - return ctx.arg.relax && !ctx.arg.shared && !sym.is_imported; -} - } // namespace mold::elf diff --git a/elf/output-chunks.cc b/elf/output-chunks.cc index 1cf7c788..b88ed7a2 100644 --- a/elf/output-chunks.cc +++ b/elf/output-chunks.cc @@ -1,5 +1,5 @@ #include "mold.h" -#include "../common/sha.h" +#include "blake3.h" #include #include @@ -38,44 +38,6 @@ static u32 djb_hash(std::string_view name) { template u64 get_eflags(Context &ctx) { - std::vector *> objs = ctx.objs; - std::erase(objs, ctx.internal_obj); - - if constexpr (is_arm32) - return EF_ARM_EABI_VER5; - - if constexpr (is_riscv) { - if (objs.empty()) - return 0; - - u32 ret = objs[0]->get_ehdr().e_flags; - for (i64 i = 1; i < objs.size(); i++) { - u32 flags = objs[i]->get_ehdr().e_flags; - if (flags & EF_RISCV_RVC) - ret |= EF_RISCV_RVC; - - if ((flags & EF_RISCV_FLOAT_ABI) != (ret & EF_RISCV_FLOAT_ABI)) - Error(ctx) << *objs[i] << ": cannot link object files with different" - << " floating-point ABI from " << *objs[0]; - - if ((flags & EF_RISCV_RVE) != (ret & EF_RISCV_RVE)) - Error(ctx) << *objs[i] << ": cannot link object files with different" - << " EF_RISCV_RVE from " << *objs[0]; - } - return ret; - } - - if constexpr (is_ppc64v2) - return 2; - - if constexpr (is_mips) { - // Real MIPS e_flags computation is much more complicated. - // For now, we just copy the first object's e_flags to the output. 
- if (objs.empty()) - return 0; - return objs[0]->get_ehdr().e_flags; - } - return 0; } @@ -188,8 +150,7 @@ static std::vector> create_phdr(Context &ctx) { std::vector> vec; auto define = [&](u64 type, u64 flags, i64 min_align, Chunk *chunk) { - vec.push_back({}); - ElfPhdr &phdr = vec.back(); + ElfPhdr phdr = {}; phdr.p_type = type; phdr.p_flags = flags; phdr.p_align = std::max(min_align, chunk->shdr.sh_addralign); @@ -203,14 +164,15 @@ static std::vector> create_phdr(Context &ctx) { if (chunk->shdr.sh_flags & SHF_ALLOC) phdr.p_memsz = chunk->shdr.sh_size; + vec.push_back(phdr); }; auto append = [&](Chunk *chunk) { ElfPhdr &phdr = vec.back(); phdr.p_align = std::max(phdr.p_align, chunk->shdr.sh_addralign); - if (chunk->shdr.sh_type != SHT_NOBITS) - phdr.p_filesz = chunk->shdr.sh_addr + chunk->shdr.sh_size - phdr.p_vaddr; phdr.p_memsz = chunk->shdr.sh_addr + chunk->shdr.sh_size - phdr.p_vaddr; + if (chunk->shdr.sh_type != SHT_NOBITS) + phdr.p_filesz = phdr.p_memsz; }; auto is_bss = [](Chunk *chunk) { @@ -339,11 +301,6 @@ static std::vector> create_phdr(Context &ctx) { if (ctx.extra.riscv_attributes->shdr.sh_size) define(PT_RISCV_ATTRIBUTES, PF_R, 1, ctx.extra.riscv_attributes); - // Create a PT_MIPS_ABIFLAGS - if constexpr (is_mips) - if (ctx.extra.abi_flags->shdr.sh_size) - define(PT_MIPS_ABIFLAGS, PF_R, 8, ctx.extra.abi_flags); - // Create a PT_OPENBSD_RANDOMIZE for (Chunk *chunk : ctx.chunks) if (chunk->name == ".openbsd.randomdata") @@ -486,10 +443,9 @@ template void RelrDynSection::update_shdr(Context &ctx) { this->shdr.sh_link = ctx.dynsym->shndx; - i64 n = ctx.got->relr.size(); + i64 n = 0; for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - n += osec->relr.size(); + n += chunk->relr.size(); this->shdr.sh_size = n * sizeof(Word); } @@ -497,13 +453,9 @@ template void RelrDynSection::copy_buf(Context &ctx) { Word *buf = (Word *)(ctx.buf + this->shdr.sh_offset); - for (u64 val : ctx.got->relr) - *buf++ = (val & 1) ? 
val : (ctx.got->shdr.sh_addr + val); - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - for (u64 val : osec->relr) - *buf++ = (val & 1) ? val : (osec->shdr.sh_addr + val); + for (u64 val : chunk->relr) + *buf++ = (val & 1) ? val : (chunk->shdr.sh_addr + val); } template @@ -589,8 +541,11 @@ void DynstrSection::copy_buf(Context &ctx) { if (!ctx.dynsym->symbols.empty()) { i64 offset = dynsym_offset; - for (Symbol *sym : std::span *>(ctx.dynsym->symbols).subspan(1)) - offset += write_string(base + offset, sym->name()); + + for (i64 i = 1; i < ctx.dynsym->symbols.size(); i++) { + Symbol &sym = *ctx.dynsym->symbols[i]; + offset += write_string(base + offset, sym.name()); + } } } @@ -726,8 +681,6 @@ static std::vector> create_dynamic_section(Context &ctx) { } else if constexpr (is_ppc32) { if (ctx.gotplt->shdr.sh_size) define(DT_PLTGOT, ctx.gotplt->shdr.sh_addr + GotPltSection::HDR_SIZE); - } else if constexpr (is_mips) { - define(DT_PLTGOT, ctx.extra.quickstart->shdr.sh_addr); } else { if (ctx.gotplt->shdr.sh_size) define(DT_PLTGOT, ctx.gotplt->shdr.sh_addr); @@ -818,18 +771,7 @@ static std::vector> create_dynamic_section(Context &ctx) { if (ctx.arg.z_interpose) flags1 |= DF_1_INTERPOSE; - auto has_gottp_syms = [&] { - if constexpr (is_mips) { - for (ObjectFile *file : ctx.objs) - if (!file->extra.got->gottp_syms.empty()) - return true; - return false; - } else { - return !ctx.got->gottp_syms.empty(); - } - }; - - if (has_gottp_syms()) + if (!ctx.got->gottp_syms.empty()) flags |= DF_STATIC_TLS; if (ctx.has_textrel) flags |= DF_TEXTREL; @@ -849,16 +791,6 @@ static std::vector> create_dynamic_section(Context &ctx) { define(DT_PPC64_GLINK, ctx.plt->shdr.sh_addr + E::plt_hdr_size - 32); } - if constexpr (is_mips) { - define(DT_MIPS_RLD_VERSION, 1); - define(DT_MIPS_FLAGS, 0); - define(DT_MIPS_BASE_ADDRESS, ctx.arg.image_base); - define(DT_MIPS_LOCAL_GOTNO, 2); - define(DT_MIPS_SYMTABNO, ctx.dynsym->symbols.size()); - define(DT_MIPS_GOTSYM, 0); 
- define(DT_MIPS_OPTIONS, 0); - } - // GDB needs a DT_DEBUG entry in an executable to store a word-size // data for its own purpose. Its content is not important. if (!ctx.arg.shared) @@ -889,11 +821,16 @@ void DynamicSection::copy_buf(Context &ctx) { } template -OutputSection::OutputSection(std::string_view name, u32 type, u64 flags) { +OutputSection::OutputSection(Context &ctx, std::string_view name, + u32 type, u64 flags) { this->name = name; this->shdr.sh_type = type; this->shdr.sh_flags = flags; + if (auto it = ctx.arg.section_align.find(name); + it != ctx.arg.section_align.end()) + this->shdr.sh_addralign = it->second; + // PT_GNU_RELRO segment is a security mechanism to make more pages // read-only than we could have done without it. // @@ -973,23 +910,24 @@ void OutputSection::write_to(Context &ctx, u8 *buf) { // bit. An address must be even and thus its LSB is 0 (odd address is not // representable in this encoding and such relocation must be stored to // the .rel.dyn section). A bitmap has LSB 1. 
-static std::vector encode_relr(std::span pos, i64 word_size) { +template +static std::vector encode_relr(std::span pos) { std::vector vec; - u64 num_bits = word_size * 8 - 1; - u64 max_delta = num_bits * word_size; + i64 num_bits = sizeof(Word) * 8 - 1; + i64 max_delta = sizeof(Word) * num_bits; for (i64 i = 0; i < pos.size();) { assert(i == 0 || pos[i - 1] <= pos[i]); - assert(pos[i] % word_size == 0); + assert(pos[i] % sizeof(Word) == 0); vec.push_back(pos[i]); - u64 base = pos[i] + word_size; + u64 base = pos[i] + sizeof(Word); i++; for (;;) { u64 bits = 0; for (; i < pos.size() && pos[i] - base < max_delta; i++) - bits |= 1LL << ((pos[i] - base) / word_size); + bits |= 1LL << ((pos[i] - base) / sizeof(Word)); if (!bits) break; @@ -1032,7 +970,7 @@ void OutputSection::construct_relr(Context &ctx) { // Compress them std::vector pos = flatten(shards); - relr = encode_relr(pos, sizeof(Word)); + this->relr = encode_relr(pos); } // Compute spaces needed for thunk symbols @@ -1050,7 +988,8 @@ void OutputSection::compute_symtab_size(Context &ctx) { for (std::unique_ptr> &thunk : thunks) { // For ARM32, we emit additional symbol "$t", "$a" and "$d" for - // each thunk to mark the beginning of ARM code. + // each thunk to mark the beginning of Thumb code, ARM code and + // data, respectively. if constexpr (is_arm32) this->num_local_symtab += thunk->symbols.size() * 4; else @@ -1107,7 +1046,7 @@ void OutputSection::populate_symtab(Context &ctx) { if constexpr (is_arm32) { write_esym(this->strtab_offset, 0); write_esym(this->strtab_offset + 3, 4); - write_esym(this->strtab_offset + 6, 16); + write_esym(this->strtab_offset + 6, 12); } } } @@ -1137,19 +1076,23 @@ void GotSection::add_tlsgd_symbol(Context &ctx, Symbol *sym) { template void GotSection::add_tlsdesc_symbol(Context &ctx, Symbol *sym) { + // TLSDESC's GOT slot values may vary depending on libc, so we + // always emit a dynamic relocation for each TLSDESC entry. 
+ // + // If dynamic relocation is not available (i.e. if we are creating a + // non-PIC executable), we always relax TLSDESC relocations so that + // no TLSDESC relocation exist at runtime. assert(supports_tlsdesc); + assert(ctx.arg.pic); + sym->set_tlsdesc_idx(ctx, this->shdr.sh_size / sizeof(Word)); this->shdr.sh_size += sizeof(Word) * 2; tlsdesc_syms.push_back(sym); - - if (sym != ctx._TLS_MODULE_BASE_) - ctx.dynsym->add_symbol(ctx, sym); } template void GotSection::add_tlsld(Context &ctx) { - if (tlsld_idx != -1) - return; + assert(tlsld_idx == -1); tlsld_idx = this->shdr.sh_size / sizeof(Word); this->shdr.sh_size += sizeof(Word) * 2; } @@ -1243,12 +1186,17 @@ static std::vector> get_got_entries(Context &ctx) { if constexpr (supports_tlsdesc) { for (Symbol *sym : ctx.got->tlsdesc_syms) { - // _TLS_MODULE_BASE_ is a linker-synthesized virtual symbol that - // refers the begining of the TLS block. - if (sym == ctx._TLS_MODULE_BASE_) - add({sym->get_tlsdesc_idx(ctx), 0, E::R_TLSDESC}); + i64 idx = sym->get_tlsdesc_idx(ctx); + + // TLSDESC uses two consecutive GOT slots, and a single TLSDESC + // dynamic relocation fills both. The actual values of the slots + // vary depending on libc, so we can't precompute their values. + // We always emit a dynamic relocation for each incoming TLSDESC + // reloc. + if (sym->is_imported) + add({idx, 0, E::R_TLSDESC, sym}); else - add({sym->get_tlsdesc_idx(ctx), 0, E::R_TLSDESC, sym}); + add({idx, sym->get_addr(ctx) - ctx.tls_begin, E::R_TLSDESC}); } } @@ -1320,8 +1268,25 @@ void GotSection::copy_buf(Context &ctx) { ent.sym ? 
ent.sym->get_dynsym_idx(ctx) : 0, ent.val); - if (ctx.arg.apply_dynamic_relocs) - buf[ent.idx] = ent.val; + bool is_tlsdesc = false; + if constexpr (supports_tlsdesc) + is_tlsdesc = (ent.r_type == E::R_TLSDESC); + + if (ctx.arg.apply_dynamic_relocs) { + if (is_tlsdesc && !is_arm32) { + // A single TLSDESC relocation fixes two consecutive GOT slots + // where one slot holds a function pointer and the other an + // argument to the function. An addend should be applied not to + // the function pointer but to the function argument, which is + // usually stored to the second slot. + // + // ARM32 employs the inverted layout for some reason, so an + // addend is applied to the first slot. + buf[ent.idx + 1] = ent.val; + } else { + buf[ent.idx] = ent.val; + } + } } } } @@ -1335,7 +1300,7 @@ void GotSection::construct_relr(Context &ctx) { if (ent.is_relr(ctx)) pos.push_back(ent.idx * sizeof(Word)); - relr = encode_relr(pos, sizeof(Word)); + this->relr = encode_relr(pos); } template @@ -1577,7 +1542,7 @@ void RelPltSection::copy_buf(Context &ctx) { } } -template +template ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, U32 *shn_xindex) { ElfSym esym; @@ -1668,6 +1633,8 @@ ElfSym to_output_esym(Context &ctx, Symbol &sym, u32 st_name, template void DynsymSection::add_symbol(Context &ctx, Symbol *sym) { + assert(!finalized); + if (symbols.empty()) symbols.resize(1); @@ -1680,6 +1647,9 @@ void DynsymSection::add_symbol(Context &ctx, Symbol *sym) { template void DynsymSection::finalize(Context &ctx) { Timer t(ctx, "DynsymSection::finalize"); + assert(!finalized); + finalized = true; + if (symbols.empty()) return; @@ -1965,8 +1935,8 @@ void MergedSection::assign_offsets(Context &ctx) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) if (const char *key = map.get_key(j)) - if (SectionFragment &frag = map.values[j]; frag.is_alive) - fragments.push_back({{key, map.key_sizes[j]}, &frag}); + if (SectionFragment &frag = map.entries[j].value; frag.is_alive) + 
fragments.push_back({{key, map.entries[j].keylen}, &frag}); // Sort fragments to make output deterministic. tbb::parallel_sort(fragments.begin(), fragments.end(), @@ -2004,7 +1974,7 @@ void MergedSection::assign_offsets(Context &ctx) { tbb::parallel_for((i64)1, map.NUM_SHARDS, [&](i64 i) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) - if (SectionFragment &frag = map.values[j]; frag.is_alive) + if (SectionFragment &frag = map.entries[j].value; frag.is_alive) frag.offset += shard_offsets[i]; }); @@ -2026,8 +1996,8 @@ void MergedSection::write_to(Context &ctx, u8 *buf) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) if (const char *key = map.get_key(j)) - if (SectionFragment &frag = map.values[j]; frag.is_alive) - memcpy(buf + frag.offset, key, map.key_sizes[j]); + if (SectionFragment &frag = map.entries[j].value; frag.is_alive) + memcpy(buf + frag.offset, key, map.entries[j].keylen); }); } @@ -2132,9 +2102,6 @@ void EhFrameSection::copy_buf(Context &ctx) { if (ctx.arg.relocatable) continue; - if constexpr (is_mips) - mips_rewrite_cie(ctx, base + cie.output_offset, cie); - for (const ElfRel &rel : cie.get_rels()) { assert(rel.r_offset - cie.input_offset < contents.size()); @@ -2353,16 +2320,14 @@ template void VerneedSection::construct(Context &ctx) { Timer t(ctx, "fill_verneed"); - if (ctx.dynsym->symbols.empty()) - return; - // Create a list of versioned symbols and sort by file and version. 
- std::vector *> syms(ctx.dynsym->symbols.begin() + 1, - ctx.dynsym->symbols.end()); + std::vector *> syms; - std::erase_if(syms, [](Symbol *sym) { - return !sym->file->is_dso || sym->ver_idx <= VER_NDX_LAST_RESERVED; - }); + for (i64 i = 1; i < ctx.dynsym->symbols.size(); i++) { + Symbol &sym = *ctx.dynsym->symbols[i]; + if (sym.file->is_dso && VER_NDX_LAST_RESERVED < sym.ver_idx) + syms.push_back(&sym); + } if (syms.empty()) return; @@ -2560,19 +2525,28 @@ void BuildIdSection::copy_buf(Context &ctx) { memcpy(base + 3, "GNU", 4); // Name string } +// BLAKE3 is a cryptographic hash function just like SHA256. +// We use it instead of SHA256 because it's faster. +static void blake3_hash(u8 *buf, i64 size, u8 *out) { + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, buf, size); + blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN); +} + template -static void compute_sha256(Context &ctx, i64 offset) { +static void compute_blake3(Context &ctx, i64 offset) { u8 *buf = ctx.buf; i64 filesize = ctx.output_file->filesize; i64 shard_size = 4096 * 1024; i64 num_shards = align_to(filesize, shard_size) / shard_size; - std::vector shards(num_shards * SHA256_SIZE); + std::vector shards(num_shards * BLAKE3_OUT_LEN); tbb::parallel_for((i64)0, num_shards, [&](i64 i) { u8 *begin = buf + shard_size * i; u8 *end = (i == num_shards - 1) ? 
buf + filesize : begin + shard_size; - sha256_hash(begin, end - begin, shards.data() + i * SHA256_SIZE); + blake3_hash(begin, end - begin, shards.data() + i * BLAKE3_OUT_LEN); #ifndef _WIN32 // We call munmap early for each chunk so that the last munmap @@ -2584,10 +2558,10 @@ static void compute_sha256(Context &ctx, i64 offset) { #endif }); - assert(ctx.arg.build_id.size() <= SHA256_SIZE); + assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN); - u8 digest[SHA256_SIZE]; - sha256_hash(shards.data(), shards.size(), digest); + u8 digest[BLAKE3_OUT_LEN]; + blake3_hash(shards.data(), shards.size(), digest); memcpy(buf + offset, digest, ctx.arg.build_id.size()); #ifndef _WIN32 @@ -2608,11 +2582,7 @@ void BuildIdSection::write_buildid(Context &ctx) { ctx.arg.build_id.value); return; case BuildId::HASH: - // Modern x86 processors have purpose-built instructions to accelerate - // SHA256 computation, and SHA256 outperforms MD5 on such computers. - // So, we always compute SHA256 and truncate it if smaller digest was - // requested. - compute_sha256(ctx, this->shdr.sh_offset + HEADER_SIZE); + compute_blake3(ctx, this->shdr.sh_offset + HEADER_SIZE); return; case BuildId::UUID: { std::array uuid = get_uuid_v4(); @@ -2797,14 +2767,14 @@ void GdbIndexSection::construct(Context &ctx) { !ent->owner.compare_exchange_weak(old_val, file)); ent->num_attrs++; - name.entry_idx = ent - map.values; + name.entry_idx = map.get_idx(ent); } }); // Assign offsets for names and attributes within each file. 
tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (GdbIndexName &name : file->gdb_names) { - MapEntry &ent = map.values[name.entry_idx]; + MapEntry &ent = map.entries[name.entry_idx].value; if (ent.owner == file) { ent.attr_offset = file->attrs_size; file->attrs_size += (ent.num_attrs + 1) * 4; @@ -2887,16 +2857,17 @@ void GdbIndexSection::copy_buf(Context &ctx) { for (i64 i = 0; i < map.nbuckets; i++) { if (map.get_key(i)) { - u32 hash = map.values[i].hash; + MapEntry &ent = map.entries[i].value; + u32 hash = ent.hash; u32 step = (hash & mask) | 1; u32 j = hash & mask; while (*(U32 *)(buf + j * 8)) j = (j + step) & mask; - ObjectFile &file = *map.values[i].owner; - *(ul32 *)(buf + j * 8) = file.names_offset + map.values[i].name_offset; - *(ul32 *)(buf + j * 8 + 4) = file.attrs_offset + map.values[i].attr_offset; + ObjectFile &file = *ent.owner; + *(ul32 *)(buf + j * 8) = file.names_offset + ent.name_offset; + *(ul32 *)(buf + j * 8 + 4) = file.attrs_offset + ent.attr_offset; } } @@ -2909,7 +2880,7 @@ void GdbIndexSection::copy_buf(Context &ctx) { std::atomic_uint32_t *attrs = (std::atomic_uint32_t *)buf; for (GdbIndexName &name : file->gdb_names) { - MapEntry &ent = map.values[name.entry_idx]; + MapEntry &ent = map.entries[name.entry_idx].value; u32 idx = (ent.owner.load()->attrs_offset + ent.attr_offset) / 4; attrs[idx + ++attrs[idx]] = name.attr; } @@ -2923,7 +2894,7 @@ void GdbIndexSection::copy_buf(Context &ctx) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) { if (map.get_key(j)) { - MapEntry &ent = map.values[j]; + MapEntry &ent = map.entries[j].value; u32 idx = (ent.owner.load()->attrs_offset + ent.attr_offset) / 4; u32 *start = attrs + idx + 1; std::sort(start, start + attrs[idx]); @@ -2940,9 +2911,10 @@ void GdbIndexSection::copy_buf(Context &ctx) { tbb::parallel_for((i64)0, (i64)map.NUM_SHARDS, [&](i64 i) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) { if (const char *key = map.get_key(j)) { - ObjectFile &file = 
*map.values[j].owner; - std::string_view name{key, map.key_sizes[j]}; - write_string(buf + file.names_offset + map.values[j].name_offset, name); + MapEntry &ent = map.entries[j].value; + ObjectFile &file = *ent.owner; + std::string_view name{key, map.entries[j].keylen}; + write_string(buf + file.names_offset + ent.name_offset, name); } } }); @@ -3111,11 +3083,10 @@ void RelocSection::update_shdr(Context &ctx) { template void RelocSection::copy_buf(Context &ctx) { auto write = [&](ElfRel &out, InputSection &isec, const ElfRel &rel) { + Symbol &sym = *isec.file.symbols[rel.r_sym]; i64 symidx = 0; i64 addend = 0; - Symbol &sym = *isec.file.symbols[rel.r_sym]; - if (sym.esym().st_type == STT_SECTION) { if (SectionFragment *frag = sym.get_frag()) { symidx = frag->output_section.shndx; @@ -3134,9 +3105,8 @@ void RelocSection::copy_buf(Context &ctx) { // COMDAT-eliminated section. } } - } else { - if (sym.sym_idx) - symidx = sym.get_output_sym_idx(ctx); + } else if (sym.write_to_symtab) { + symidx = sym.get_output_sym_idx(ctx); addend = get_addend(isec, rel); } @@ -3167,7 +3137,11 @@ template void ComdatGroupSection::update_shdr(Context &ctx) { assert(ctx.arg.relocatable); this->shdr.sh_link = ctx.symtab->shndx; - this->shdr.sh_info = sym.get_output_sym_idx(ctx); + + if (sym.esym().st_type == STT_SECTION) + this->shdr.sh_info = sym.get_input_section()->output_section->shndx; + else + this->shdr.sh_info = sym.get_output_sym_idx(ctx); } template diff --git a/elf/passes.cc b/elf/passes.cc index c9b9059e..c62c4c25 100644 --- a/elf/passes.cc +++ b/elf/passes.cc @@ -33,7 +33,7 @@ void apply_exclude_libs(Context &ctx) { template void create_synthetic_sections(Context &ctx) { - auto push = [&](T *x) { + auto push = [&](auto *x) { ctx.chunks.push_back(x); ctx.chunk_pool.emplace_back(x); return x; @@ -105,9 +105,15 @@ void create_synthetic_sections(Context &ctx) { if (ctx.arg.emit_relocs) ctx.eh_frame_reloc = push(new EhFrameRelocSection); - if (ctx.arg.shared || !ctx.dsos.empty() 
|| ctx.arg.pie) + if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) { ctx.dynamic = push(new DynamicSection); + // If .dynamic exists, .dynsym and .dynstr must exist as well + // since .dynamic refers to them. + ctx.dynstr->shdr.sh_size = 1; + ctx.dynsym->symbols.resize(1); + } + ctx.versym = push(new VersymSection); ctx.verneed = push(new VerneedSection); ctx.note_package = push(new NotePackageSection); @@ -127,21 +133,6 @@ void create_synthetic_sections(Context &ctx) { if constexpr (is_alpha) ctx.extra.got = push(new AlphaGotSection); - - if constexpr (is_mips) { - ctx.extra.quickstart = push(new MipsQuickstartSection); - ctx.extra.abi_flags = push(new MipsABIFlagsSection); - - for (ObjectFile *file : ctx.objs) - file->extra.got = push(new MipsGotSection(ctx, *file)); - } - - // If .dynamic exists, .dynsym and .dynstr must exist as well - // since .dynamic refers them. - if (ctx.dynamic) { - ctx.dynstr->keep(); - ctx.dynsym->keep(); - } } template @@ -304,6 +295,7 @@ void resolve_symbols(Context &ctx) { // .eh_frame sections are parsed and regenerated by the linker for the purpose // of deduplication and garbage collection. As such, the input sections should // not be copied over. +// // However, in very rare cases (e.g. GCC CRT compiled with LTO) we might need // to resolve cross-object .eh_frame section references (they only point to // begin or end and don't depend on the actual section contents). 
@@ -313,15 +305,9 @@ template void kill_eh_frame_sections(Context &ctx) { Timer t(ctx, "kill_eh_frame_sections"); - tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (i64 i = 0; i < file->sections.size(); i++) { - if (std::unique_ptr> &isec = file->sections[i]) { - if (isec && isec->is_alive && isec->name() == ".eh_frame") { - isec->is_alive = false; - } - } - } - }); + for (ObjectFile *file : ctx.objs) + if (file->eh_frame_section) + file->eh_frame_section->is_alive = false; } template @@ -414,13 +400,10 @@ static u64 canonicalize_type(std::string_view name, u64 type) { } struct OutputSectionKey { + bool operator==(const OutputSectionKey &) const = default; std::string_view name; u64 type; u64 flags; - - bool operator==(const OutputSectionKey &other) const { - return name == other.name && type == other.type && flags == other.flags; - } }; template @@ -481,10 +464,7 @@ get_output_section_key(Context &ctx, InputSection &isec) { const ElfShdr &shdr = isec.shdr(); std::string_view name = get_output_name(ctx, isec.name(), shdr.sh_flags); u64 type = canonicalize_type(name, shdr.sh_type); - u64 flags = shdr.sh_flags & ~(u64)SHF_COMPRESSED; - - if (!ctx.arg.relocatable) - flags &= ~(u64)SHF_GROUP & ~(u64)SHF_GNU_RETAIN; + u64 flags = shdr.sh_flags & ~(u64)(SHF_COMPRESSED | SHF_GROUP | SHF_GNU_RETAIN); // .init_array is usually writable. We don't want to create multiple // .init_array output sections, so make it always writable. @@ -511,6 +491,8 @@ void create_output_sections(Context &ctx) { std::unordered_map *, Hash> map; std::shared_mutex mu; + i64 size = ctx.osec_pool.size(); + // Instantiate output sections tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Make a per-thread cache of the main map to avoid lock contention. 
@@ -525,6 +507,15 @@ void create_output_sections(Context &ctx) { if (!isec || !isec->is_alive) continue; + const ElfShdr &shdr = isec->shdr(); + if (ctx.arg.relocatable && (shdr.sh_flags & SHF_GROUP)) { + OutputSection *osec = + new OutputSection(ctx, isec->name(), shdr.sh_type, shdr.sh_flags); + isec->output_section = osec; + ctx.osec_pool.emplace_back(osec); + continue; + } + OutputSectionKey key = get_output_section_key(ctx, *isec); if (auto it = cache.find(key); it != cache.end()) { @@ -540,12 +531,11 @@ void create_output_sections(Context &ctx) { } std::unique_ptr> osec = - std::make_unique>(key.name, key.type, key.flags); + std::make_unique>(ctx, key.name, key.type, key.flags); std::unique_lock lock(mu); auto [it, inserted] = map.insert({key, osec.get()}); OutputSection *ret = it->second; - lock.unlock(); if (inserted) ctx.osec_pool.emplace_back(std::move(osec)); @@ -559,29 +549,29 @@ void create_output_sections(Context &ctx) { }); // Add input sections to output sections + std::vector *> chunks; + for (i64 i = size; i < ctx.osec_pool.size(); i++) + chunks.push_back(ctx.osec_pool[i].get()); + for (ObjectFile *file : ctx.objs) for (std::unique_ptr> &isec : file->sections) if (isec && isec->is_alive) isec->output_section->members.push_back(isec.get()); // Add output sections and mergeable sections to ctx.chunks - std::vector *> vec; - for (std::pair *> &kv : map) - vec.push_back(kv.second); - for (std::unique_ptr> &osec : ctx.merged_sections) if (osec->shdr.sh_size) - vec.push_back(osec.get()); + chunks.push_back(osec.get()); // Sections are added to the section lists in an arbitrary order // because they are created in parallel. Sort them to to make the // output deterministic. 
- tbb::parallel_sort(vec.begin(), vec.end(), [](Chunk *x, Chunk *y) { + tbb::parallel_sort(chunks.begin(), chunks.end(), [](Chunk *x, Chunk *y) { return std::tuple(x->name, x->shdr.sh_type, x->shdr.sh_flags) < std::tuple(y->name, y->shdr.sh_type, y->shdr.sh_flags); }); - append(ctx.chunks, vec); + append(ctx.chunks, chunks); } // Create a dummy object file containing linker-synthesized @@ -688,10 +678,10 @@ template void add_synthetic_symbols(Context &ctx) { ObjectFile &obj = *ctx.internal_obj; - auto add = [&](std::string_view name) { + auto add = [&](std::string_view name, u32 type = STT_NOTYPE) { ElfSym esym; memset(&esym, 0, sizeof(esym)); - esym.st_type = STT_NOTYPE; + esym.st_type = type; esym.st_shndx = SHN_ABS; esym.st_bind = STB_GLOBAL; esym.st_visibility = STV_HIDDEN; @@ -737,7 +727,7 @@ void add_synthetic_symbols(Context &ctx) { ctx.__dso_handle = add("__dso_handle"); if constexpr (supports_tlsdesc) - ctx._TLS_MODULE_BASE_ = add("_TLS_MODULE_BASE_"); + ctx._TLS_MODULE_BASE_ = add("_TLS_MODULE_BASE_", STT_TLS); if constexpr (is_riscv) if (!ctx.arg.shared) @@ -754,9 +744,6 @@ void add_synthetic_symbols(Context &ctx) { if constexpr (is_ppc32) ctx.extra._SDA_BASE_ = add("_SDA_BASE_"); - if constexpr (is_mips) - ctx._gp = add("_gp"); - for (Chunk *chunk : ctx.chunks) { if (std::optional name = get_start_stop_name(ctx, *chunk)) { add(save_string(ctx, "__start_" + *name)); @@ -982,33 +969,30 @@ template void check_symbol_types(Context &ctx) { Timer t(ctx, "check_symbol_types"); - auto check = [&](InputFile *file) { + std::vector *> files; + append(files, ctx.objs); + append(files, ctx.dsos); + + tbb::parallel_for_each(files.begin(), files.end(), [&](InputFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { - const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; - - if (!sym.file) + if (!sym.file || sym.file == file) continue; - u32 x = sym.esym().st_type; - if (x == STT_GNU_IFUNC) - x = STT_FUNC; + const ElfSym 
&esym1 = sym.esym(); + const ElfSym &esym2 = file->elf_syms[i]; - u32 y = esym.st_type; - if (y == STT_GNU_IFUNC) - y = STT_FUNC; + u32 ty1 = (esym1.st_type == STT_GNU_IFUNC) ? STT_FUNC : esym1.st_type; + u32 ty2 = (esym2.st_type == STT_GNU_IFUNC) ? STT_FUNC : esym2.st_type; - if (x != STT_NOTYPE && y != STT_NOTYPE && x != y) + if (ty1 != STT_NOTYPE && ty2 != STT_NOTYPE && ty1 != ty2) Warn(ctx) << "symbol type mismatch: " << sym << '\n' << ">>> defined in " << *sym.file << " as " - << stt_to_string(sym.esym().st_type) << '\n' + << stt_to_string(esym1.st_type) << '\n' << ">>> defined in " << *file << " as " - << stt_to_string(esym.st_type); + << stt_to_string(esym2.st_type); } - }; - - tbb::parallel_for_each(ctx.objs, check); - tbb::parallel_for_each(ctx.dsos, check); + }); } template @@ -1185,19 +1169,16 @@ void compute_section_sizes(Context &ctx) { } }); - i64 offset = 0; - i64 p2align = 0; + ElfShdr &shdr = osec->shdr; + shdr.sh_size = 0; for (i64 i = 0; i < groups.size(); i++) { - offset = align_to(offset, 1 << groups[i].p2align); - groups[i].offset = offset; - offset += groups[i].size; - p2align = std::max(p2align, groups[i].p2align); + shdr.sh_size = align_to(shdr.sh_size, 1 << groups[i].p2align); + groups[i].offset = shdr.sh_size; + shdr.sh_size += groups[i].size; + shdr.sh_addralign = std::max(shdr.sh_addralign, 1 << groups[i].p2align); } - osec->shdr.sh_size = offset; - osec->shdr.sh_addralign = 1 << p2align; - // Assign offsets to input sections. tbb::parallel_for_each(groups, [](Group &group) { i64 offset = group.offset; @@ -1217,23 +1198,12 @@ void compute_section_sizes(Context &ctx) { // inserting thunks. This pass cannot be parallelized. That is, // create_range_extension_thunks is parallelized internally, but the // function itself is not thread-safe. 
- if constexpr (needs_thunk) { - for (Chunk *chunk : ctx.chunks) { - OutputSection *osec = chunk->to_osec(); - if (osec && (osec->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable) { - create_range_extension_thunks(ctx, *osec); - - for (InputSection *isec : osec->members) - osec->shdr.sh_addralign = - std::max(osec->shdr.sh_addralign, 1 << isec->p2align); - } - } - } - - for (Chunk *chunk : ctx.chunks) - if (OutputSection *osec = chunk->to_osec()) - if (u32 align = ctx.arg.section_align[osec->name]) - osec->shdr.sh_addralign = std::max(osec->shdr.sh_addralign, align); + if constexpr (needs_thunk) + if (!ctx.arg.relocatable) + for (Chunk *chunk : ctx.chunks) + if (OutputSection *osec = chunk->to_osec()) + if (osec->shdr.sh_flags & SHF_EXECINSTR) + osec->create_range_extension_thunks(ctx); } // Find all unresolved symbols and attach them to the most appropriate files. @@ -1413,9 +1383,6 @@ void scan_relocations(Context &ctx) { if constexpr (is_alpha) ctx.extra.got->finalize(); - if constexpr (is_mips) - mips_merge_got_sections(ctx); - if (ctx.has_textrel && ctx.arg.warn_textrel) Warn(ctx) << "creating a DT_TEXTREL in an output file"; } @@ -1505,11 +1472,8 @@ void construct_relr(Context &ctx) { Timer t(ctx, "construct_relr"); tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { - if (OutputSection *osec = chunk->to_osec()) - osec->construct_relr(ctx); + chunk->construct_relr(ctx); }); - - ctx.got->construct_relr(ctx); } template @@ -1834,6 +1798,7 @@ void sort_output_sections_regular(Context &ctx) { (!tls << 6) | (!relro << 5) | (is_bss << 4); }; + // Ties are broken by additional rules auto get_rank2 = [&](Chunk *chunk) -> i64 { if (chunk->shdr.sh_type == SHT_NOTE) return -chunk->shdr.sh_addralign; @@ -1850,14 +1815,8 @@ void sort_output_sections_regular(Context &ctx) { }; sort(ctx.chunks, [&](Chunk *a, Chunk *b) { - // Sort sections by segments - i64 x = get_rank1(a); - i64 y = get_rank1(b); - if (x != y) - return x < y; - - // Ties are broken by additional 
rules - return get_rank2(a) < get_rank2(b); + return std::tuple{get_rank1(a), get_rank2(a)} < + std::tuple{get_rank1(b), get_rank2(b)}; }); } @@ -2423,7 +2382,7 @@ void fix_synthetic_symbols(Context &ctx) { // this symbol. if (ctx._TLS_MODULE_BASE_) { ctx._TLS_MODULE_BASE_->set_output_section(sections[0]); - ctx._TLS_MODULE_BASE_->value = ctx.tls_begin; + ctx._TLS_MODULE_BASE_->value = ctx.dtp_addr; } // __GNU_EH_FRAME_HDR @@ -2459,10 +2418,6 @@ void fix_synthetic_symbols(Context &ctx) { } } - // MIPS' _gp symbol. - if constexpr (is_mips) - start(ctx._gp, ctx.extra.quickstart, 0x7ff0); - // __start_ and __stop_ symbols for (Chunk *chunk : sections) { if (std::optional name = get_start_stop_name(ctx, *chunk)) { diff --git a/elf/relocatable.cc b/elf/relocatable.cc index 395db8dc..01bf6d39 100644 --- a/elf/relocatable.cc +++ b/elf/relocatable.cc @@ -40,7 +40,7 @@ namespace mold::elf { // Create linker-synthesized sections template static void r_create_synthetic_sections(Context &ctx) { - auto push = [&](T *x) { + auto push = [&](auto *x) { ctx.chunks.push_back(x); ctx.chunk_pool.emplace_back(x); return x; diff --git a/elf/thunks.cc b/elf/thunks.cc index ad54131a..6d09d831 100644 --- a/elf/thunks.cc +++ b/elf/thunks.cc @@ -30,8 +30,9 @@ namespace mold::elf { +using E = MOLD_TARGET; + // Returns a branch reach in bytes for a given target. -template static consteval i64 max_distance() { // ARM64's branch has 26 bits immediate. The immediate is padded with // implicit two-bit zeros because all instructions are 4 bytes aligned @@ -60,12 +61,13 @@ static consteval i64 max_distance() { // We create thunks for each 12.8/1.6/3.2 MiB code block for // ARM64/ARM32/PPC, respectively. -template -static constexpr i64 batch_size = max_distance() / 10; +static constexpr i64 batch_size = max_distance() / 10; // We assume that a single thunk group is smaller than 100 KiB. 
static constexpr i64 max_thunk_size = 102400; +static_assert(max_thunk_size / E::thunk_size < INT16_MAX); + // Returns true if a given relocation is of type used for function calls. template static bool needs_thunk_rel(const ElfRel &r) { @@ -135,10 +137,9 @@ static bool is_reachable(Context &ctx, InputSection &isec, i64 A = get_addend(isec, rel); i64 P = isec.get_addr() + rel.r_offset; i64 val = S + A - P; - return -max_distance() <= val && val < max_distance(); + return -max_distance() <= val && val < max_distance(); } -template static void reset_thunk(RangeExtensionThunk &thunk) { for (Symbol *sym : thunk.symbols) { sym->extra.thunk_idx = -1; @@ -148,7 +149,6 @@ static void reset_thunk(RangeExtensionThunk &thunk) { } // Scan relocations to collect symbols that need thunks. -template static void scan_rels(Context &ctx, InputSection &isec, RangeExtensionThunk &thunk, i64 thunk_idx) { std::span> rels = isec.get_rels(ctx); @@ -189,9 +189,9 @@ static void scan_rels(Context &ctx, InputSection &isec, } } -template -void create_range_extension_thunks(Context &ctx, OutputSection &osec) { - std::span *> m = osec.members; +template <> +void OutputSection::create_range_extension_thunks(Context &ctx) { + std::span *> m = members; if (m.empty()) return; @@ -236,7 +236,7 @@ void create_range_extension_thunks(Context &ctx, OutputSection &osec) { // Move D foward as far as we can jump from B to anywhere in a thunk at D. while (d < m.size() && align_to(offset, 1 << m[d]->p2align) + m[d]->sh_size + max_thunk_size < - m[b]->offset + max_distance()) { + m[b]->offset + max_distance()) { offset = align_to(offset, 1 << m[d]->p2align); m[d]->offset = offset; offset += m[d]->sh_size; @@ -248,33 +248,33 @@ void create_range_extension_thunks(Context &ctx, OutputSection &osec) { // to ensure progress. 
c = b + 1; while (c < m.size() && - m[c]->offset + m[c]->sh_size < m[b]->offset + batch_size) + m[c]->offset + m[c]->sh_size < m[b]->offset + batch_size) c++; // Move A forward so that A is reachable from C. i64 c_offset = (c == m.size()) ? offset : m[c]->offset; - while (a < m.size() && m[a]->offset + max_distance() < c_offset) + while (a < m.size() && m[a]->offset + max_distance() < c_offset) a++; // Erase references to out-of-range thunks. - while (t < osec.thunks.size() && - osec.thunks[t]->offset < m[a]->offset) - reset_thunk(*osec.thunks[t++]); + while (t < thunks.size() && thunks[t]->offset < m[a]->offset) + reset_thunk(*thunks[t++]); - // Create a thunk for input sections between B and C and place it at D. + // Create a new thunk and place it at D. offset = align_to(offset, RangeExtensionThunk::alignment); - i64 thunk_idx = osec.thunks.size(); - RangeExtensionThunk *thunk = new RangeExtensionThunk(osec, offset); - osec.thunks.emplace_back(thunk); + i64 thunk_idx = thunks.size(); + RangeExtensionThunk *thunk = new RangeExtensionThunk(*this, offset); + thunks.emplace_back(thunk); - // Scan relocations between B and C to collect symbols that need thunks. + // Scan relocations between B and C to collect symbols that need + // entries in the new thunk. tbb::parallel_for_each(m.begin() + b, m.begin() + c, [&](InputSection *isec) { scan_rels(ctx, *isec, *thunk, thunk_idx); }); // Now that we know the number of symbols in the thunk, we can compute - // its size. + // the thunk's size. 
assert(thunk->size() < max_thunk_size); offset += thunk->size(); @@ -307,17 +307,15 @@ void create_range_extension_thunks(Context &ctx, OutputSection &osec) { b = c; } - while (t < osec.thunks.size()) - reset_thunk(*osec.thunks[t++]); - - osec.shdr.sh_size = offset; -} + while (t < thunks.size()) + reset_thunk(*thunks[t++]); -using E = MOLD_TARGET; + this->shdr.sh_size = offset; -static_assert(max_thunk_size / E::thunk_size < INT16_MAX); - -template void create_range_extension_thunks(Context &, OutputSection &); + for (InputSection *isec : members) + this->shdr.sh_addralign = + std::max(this->shdr.sh_addralign, 1 << isec->p2align); +} } // namespace mold::elf diff --git a/elf/tls.cc b/elf/tls.cc index 1caa5f19..8d391ace 100644 --- a/elf/tls.cc +++ b/elf/tls.cc @@ -160,18 +160,19 @@ u64 get_tp_addr(Context &ctx) { // of TLV template image when copying TLVs to the TLS block, so we need // to offset it. return align_down(phdr->p_vaddr - sizeof(Word) * 2, phdr->p_align); - } else if constexpr (is_ppc || is_m68k || is_mips) { - // On PPC and m68k, TP is 0x7000 (28 KiB) past the beginning of the TLV - // block to maximize the addressable range for load/store instructions - // with 16-bits signed immediates. It's not exactly 0x8000 (32 KiB) off - // because there's a small implementation-defined piece of data before - // the TLV block, and the runtime wants to access them efficiently too. + } else if constexpr (is_ppc || is_m68k) { + // On PowerPC and m68k, TP is 0x7000 (28 KiB) past the beginning + // of the TLV block to maximize the addressable range of load/store + // instructions with 16-bits signed immediates. It's not exactly 0x8000 + // (32 KiB) off because there's a small implementation-defined piece of + // data before the initial TLV block, and the runtime wants to access + // them efficiently too. return phdr->p_vaddr + 0x7000; } else { - // RISC-V just uses the beginning of the main executable's TLV block as - // TP. 
RISC-V load/store instructions usually take 12-bits signed - // immediates, so the beginning of the TLS block ± 2 KiB is accessible - // with a single load/store instruction. + // RISC-V and LoongArch just use the beginning of the main executable's + // TLV block as TP. Their load/store instructions usually take 12-bits + // signed immediates, so the beginning of the TLS block ± 2 KiB is + // accessible with a single load/store instruction. static_assert(is_riscv || is_loongarch); return phdr->p_vaddr; } @@ -185,12 +186,12 @@ u64 get_dtp_addr(Context &ctx) { if (!phdr) return 0; - if constexpr (is_ppc || is_m68k || is_mips) { - // On PPC64 and m68k, R_DTPOFF is resolved to the address 0x8000 + if constexpr (is_ppc || is_m68k) { + // On PowerPC and m68k, R_DTPOFF is resolved to the address 0x8000 // (32 KiB) past the start of the TLS block. The bias maximizes the - // accessible range for load/store instructions with 16-bits signed - // immediates. That is, if the offset were right at the beginning of - // the start of the TLS block, the half of addressible space (negative + // accessible range of load/store instructions with 16-bits signed + // immediates. That is, if the offset were right at the beginning of the + // start of the TLS block, the half of addressable space (negative // immediates) would have been wasted.
return phdr->p_vaddr + 0x8000; } else if constexpr (is_riscv) { diff --git a/install-build-deps.sh b/install-build-deps.sh index 2d820e5b..968e7613 100755 --- a/install-build-deps.sh +++ b/install-build-deps.sh @@ -12,42 +12,42 @@ set -x case "$ID-$VERSION_ID" in ubuntu-20.* | pop-20.*) apt-get update - apt-get install -y cmake libssl-dev zlib1g-dev gcc g++ g++-10 + apt-get install -y cmake gcc g++ g++-10 apt-get install -y file ;; ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*) apt-get update - apt-get install -y cmake libssl-dev zlib1g-dev gcc g++ + apt-get install -y cmake gcc g++ apt-get install -y file ;; fedora-*) - dnf install -y gcc-g++ cmake openssl-devel zlib-devel + dnf install -y gcc-g++ cmake dnf install -y glibc-static file libstdc++-static diffutils util-linux ;; opensuse-leap-*) - zypper install -y make cmake zlib-devel libopenssl-devel gcc-c++ gcc11-c++ + zypper install -y make cmake gcc-c++ gcc11-c++ zypper install -y glibc-devel-static tar diffutils util-linux ;; opensuse-tumbleweed-*) - zypper install -y make cmake zlib-devel libopenssl-devel gcc-c++ + zypper install -y make cmake gcc-c++ zypper install -y glibc-devel-static tar diffutils util-linux ;; gentoo-*) emerge-webrsync - emerge dev-util/cmake sys-libs/zlib + emerge dev-util/cmake ;; -arch-*) +arch-* | archarm-*) pacman -Sy - pacman -S --needed --noconfirm base-devel zlib openssl cmake util-linux + pacman -S --needed --noconfirm base-devel cmake util-linux ;; void-*) xbps-install -Sy xbps - xbps-install -Sy bash make cmake openssl-devel zlib-devel gcc + xbps-install -Sy bash make cmake gcc xbps-install -Sy tar diffutils util-linux ;; alpine-*) apk update - apk add bash make openssl-dev linux-headers cmake zlib-dev gcc g++ + apk add bash make linux-headers cmake gcc g++ ;; *) echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" diff --git a/install-cross-tools.sh b/install-cross-tools.sh new file mode 100755 index 00000000..86dc10dd --- /dev/null +++ 
b/install-cross-tools.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e +. /etc/os-release + +set -x + +# This script installs packages for -DMOLD_ENABLE_QEMU_TESTS=1 +# to enable cross-target tests. +# +# Feel free to send me a PR if your OS is not on this list. + +case "$ID-$VERSION_ID" in +ubuntu-* | pop-* | linuxmint-* | debian-* | raspbian-*) + apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4,alpha}-linux-gnu {gcc,g++}-arm-linux-gnueabihf + ;; +*) + echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" + exit 1 +esac diff --git a/test/elf/CMakeLists.txt b/test/elf/CMakeLists.txt index df56a67f..6b1d65b0 100644 --- a/test/elf/CMakeLists.txt +++ b/test/elf/CMakeLists.txt @@ -62,8 +62,6 @@ add_target(sparc64-linux-gnu) add_target(s390x-linux-gnu) add_target(sh4-linux-gnu) add_target(alpha-linux-gnu) -add_target(mips64-linux-gnuabi64) -add_target(mips64el-linux-gnuabi64) option(MOLD_ENABLE_QEMU_TESTS_RV32 "Enable tests for RV32" OFF) if(MOLD_ENABLE_QEMU_TESTS_RV32) diff --git a/test/elf/abs-error.sh b/test/elf/abs-error.sh index 595cf52b..ca1cc1d7 100755 --- a/test/elf/abs-error.sh +++ b/test/elf/abs-error.sh @@ -6,8 +6,6 @@ [ $MACHINE = ppc64le ] && skip [ $MACHINE = s390x ] && skip [ $MACHINE = alpha ] && skip -[ $MACHINE = mips64el ] && skip -[ $MACHINE = mips64 ] && skip [[ $MACHINE = loongarch* ]] && skip cat <:' > $t/log grep -Eq 'bx\s+pc' $t/log -grep -Eq 'add\s+ip, ip, pc' $t/log +grep -Eq 'add\s+pc, ip, pc' $t/log diff --git a/test/elf/as-needed-dso.sh b/test/elf/as-needed-dso.sh index d2bc87eb..40f0a46b 100755 --- a/test/elf/as-needed-dso.sh +++ b/test/elf/as-needed-dso.sh @@ -15,10 +15,7 @@ int fn2(); int main() { fn2(); } EOF -$CC -B. -o $t/exe1 $t/a.o -L$t -Wl,--as-needed -lbar -Wl,--allow-shlib-undefined -readelf -W --dynamic $t/exe1 > $t/log1 -! grep -q libfoo $t/log1 || false - -$CC -B.
-o $t/exe2 $t/a.o -L$t -Wl,--as-needed -lbar -lfoo -readelf -W --dynamic $t/exe2 > $t/log2 -grep -q libfoo $t/log2 +$CC -B. -o $t/exe $t/a.o -L$t -Wl,--as-needed -lbar -lfoo +readelf -W --dynamic $t/exe > $t/log2 +grep -q libbar $t/log2 +! grep -q libfoo $t/log2 || false diff --git a/test/elf/as-needed2.sh b/test/elf/as-needed2.sh deleted file mode 100755 index a5b7f20d..00000000 --- a/test/elf/as-needed2.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -. $(dirname $0)/common.inc - -cat < -int baz(); -int main() { - printf("%d\n", baz()); -} -EOF - -$CC -B. -o $t/exe $t/b.o -L$t -Wl,--as-needed -lbaz -lbar -lfoo - -readelf --dynamic $t/exe > $t/log -grep -q libbaz $t/log || false -! grep -q libbar $t/log || false -grep -q libfoo $t/log || false diff --git a/test/elf/common.inc b/test/elf/common.inc index 48b30767..78c37757 100644 --- a/test/elf/common.inc +++ b/test/elf/common.inc @@ -53,6 +53,12 @@ else QEMU="qemu-$MACHINE -L /usr/$TRIPLE" fi +if [ $MACHINE = x86_64 -o $MACHINE = i386 -o $MACHINE = arm ]; then + tlsdesc_opt=-mtls-dialect=gnu2 +elif [ $MACHINE = aarch64 ]; then + tlsdesc_opt=-mtls-dialect=desc +fi + # Common functions test_cflags() { echo 'int main() {}' | $CC "$@" -o /dev/null -xc - >& /dev/null @@ -63,6 +69,10 @@ supports_ifunc() { $CC -c -o /dev/null -xc - >& /dev/null } +supports_tlsdesc() { + [ -n "$tlsdesc_opt" ] +} + skip() { echo skipped trap - EXIT diff --git a/test/elf/compress-debug-sections-zstd.sh b/test/elf/compress-debug-sections-zstd.sh index 7c18efde..126997a3 100755 --- a/test/elf/compress-debug-sections-zstd.sh +++ b/test/elf/compress-debug-sections-zstd.sh @@ -4,7 +4,6 @@ # arm-linux-gnueabihf-objcopy crashes on x86-64 [ $MACHINE = arm ] && skip [ $MACHINE = riscv32 ] && skip -[[ $MACHINE = mips* ]] && skip command -v zstdcat >& /dev/null || skip diff --git a/test/elf/copyrel-alignment.sh b/test/elf/copyrel-alignment.sh index 961ccc58..4b265ac7 100755 --- a/test/elf/copyrel-alignment.sh +++ b/test/elf/copyrel-alignment.sh @@ -4,7 
+4,6 @@ [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip [ $MACHINE = alpha ] && skip -[[ $MACHINE = mips* ]] && skip [[ $MACHINE = loongarch* ]] && skip cat < +#include +#include + +extern char readonly[100]; +extern char readwrite[100]; + +static int segv = 0; +static jmp_buf buf; + +void handler(int sig) { + segv = 1; + longjmp(buf, 1); +} + +int main() { + signal(SIGSEGV, handler); + + readwrite[0] = 5; + int x = segv; + + if (setjmp(buf) == 0) + *(char *)readonly = 5; + int y = segv; + + printf("sigsegv %d %d\n", x, y); +} +EOF + +cat <& /dev/null || skip diff --git a/test/elf/lto-nostdlib.sh b/test/elf/lto-nostdlib.sh new file mode 100644 index 00000000..59f4c0d6 --- /dev/null +++ b/test/elf/lto-nostdlib.sh @@ -0,0 +1,9 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <&1 | grep BusyBox && skip diff --git a/test/elf/relocatable-c++.sh b/test/elf/relocatable-c++.sh new file mode 100755 index 00000000..cce3d033 --- /dev/null +++ b/test/elf/relocatable-c++.sh @@ -0,0 +1,46 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +# OneTBB isn't tsan-clean +nm mold | grep -q '__tsan_init' && skip + +# Ubuntu 22.04 SH4 GCC is broken +[ $MACHINE = sh4 ] && skip + +cat < +struct Foo { + Foo() { hello(); } +}; + +template +struct Bar { + Bar() { world(); } +}; + +void baz() { + Foo foo; + Bar bar; +} +EOF + +cat < + +void hello() { std::cout << "Hello "; } +void world() { std::cout << "world\n"; } +void baz(); + +int main() { + baz(); +} +EOF + +./mold --relocatable -o $t/c.o $t/a.o +./mold --relocatable -o $t/d.o $t/b.o + +$CXX -B. -o $t/exe $t/c.o $t/d.o +$QEMU $t/exe | grep -q 'Hello world' diff --git a/test/elf/riscv64_attributes.sh b/test/elf/riscv64_attributes.sh index c5062cf8..9ea4270d 100755 --- a/test/elf/riscv64_attributes.sh +++ b/test/elf/riscv64_attributes.sh @@ -1,14 +1,14 @@ #!/bin/bash . 
$(dirname $0)/common.inc -cat < + +int get_foo(); +int get_foo2(); +int get_bar(); +int get_baz(); + +int main() { + printf("%x %x %x %x\n", get_foo(), get_foo2(), get_bar(), get_baz()); +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax +$QEMU $t/exe1 | grep -q 'f00 10000f00 ba 11beef' + +$CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o +$QEMU $t/exe2 | grep -q 'f00 10000f00 ba 11beef' diff --git a/test/elf/riscv64_weak-undef.sh b/test/elf/riscv64_weak-undef.sh index 36267a6d..9d99db4c 100755 --- a/test/elf/riscv64_weak-undef.sh +++ b/test/elf/riscv64_weak-undef.sh @@ -1,6 +1,8 @@ #!/bin/bash . $(dirname $0)/common.inc +test_cflags -static || skip + cat < +#include +#include +#include + +int main(int argc, char **argv) { + void *handle = dlopen(argv[1], RTLD_LAZY); + if (!handle) { + fprintf(stderr, "dlopen failed: %s: %s: \n", argv[1], dlerror()); + exit(1); + } + + int (*get)(int) = dlsym(handle, "get_foo"); + assert(get); + + printf("%d %d %d\n", get(0), get(1), get(9999)); +} +EOF + +$CC -B. -o $t/exe $t/c.o -ldl +$QEMU $t/exe $t/b.so | grep -q '3 0 5' diff --git a/test/elf/tlsdesc-dlopen.sh b/test/elf/tlsdesc-dlopen.sh new file mode 100755 index 00000000..70bfa144 --- /dev/null +++ b/test/elf/tlsdesc-dlopen.sh @@ -0,0 +1,35 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +supports_tlsdesc || skip + +cat < +#include +#include +#include + +int main(int argc, char **argv) { + void *handle = dlopen(argv[1], RTLD_LAZY); + if (!handle) { + fprintf(stderr, "dlopen failed: %s: %s: \n", argv[1], dlerror()); + exit(1); + } + + int (*get)(int) = dlsym(handle, "get_foo"); + assert(get); + + printf("%d %d %d\n", get(0), get(1), get(9999)); +} +EOF + +$CC -B. -o $t/exe $t/c.o -ldl +$QEMU $t/exe $t/b.so | grep -q '3 0 5' diff --git a/test/elf/tlsdesc-import.sh b/test/elf/tlsdesc-import.sh index bef86951..e2872b27 100755 --- a/test/elf/tlsdesc-import.sh +++ b/test/elf/tlsdesc-import.sh @@ -1,15 +1,9 @@ #!/bin/bash . 
$(dirname $0)/common.inc -if [ $MACHINE = x86_64 -o $MACHINE = arm ]; then - dialect=gnu2 -elif [ $MACHINE = aarch64 ]; then - dialect=desc -else - skip -fi +supports_tlsdesc || skip -cat < extern _Thread_local int foo; @@ -21,7 +15,7 @@ int main() { } EOF -cat < + +extern _Thread_local int foo; +int get_foo1(); +int get_foo2() { return foo; } + +int main() { + printf("%d %d %d\n", foo, get_foo1(), get_foo2()); +} +EOF + +$CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so +$QEMU $t/exe1 | grep -q '^5 5 5$' + +readelf -Wr $t/exe1 > $t/log1 +! grep -Eq 'TLS.?DESC' $t/log1 || false + +$CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so -Wl,--no-relax +$QEMU $t/exe1 | grep -q '^5 5 5$' + +readelf -Wr $t/exe1 > $t/log2 +grep -Eq 'TLS.?DESC' $t/log2 diff --git a/test/elf/tlsdesc-local-dynamic.sh b/test/elf/tlsdesc-local-dynamic.sh new file mode 100755 index 00000000..01d2d0fb --- /dev/null +++ b/test/elf/tlsdesc-local-dynamic.sh @@ -0,0 +1,39 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +supports_tlsdesc || skip + +cat < + +_Thread_local int foo; + +int get_foo(); +int get_bar(); + +int main() { + foo = 42; + printf("%d %d\n", get_foo(), get_bar()); + return 0; +} +EOF + +$CC -B. -o $t/exe1 $t/a.o $t/b.o +$QEMU $t/exe1 | grep -q '42 5' + +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--no-relax +$QEMU $t/exe2 | grep -q '42 5' diff --git a/test/elf/tlsdesc-static.sh b/test/elf/tlsdesc-static.sh index f70ce0ba..41c89cb1 100755 --- a/test/elf/tlsdesc-static.sh +++ b/test/elf/tlsdesc-static.sh @@ -1,17 +1,10 @@ #!/bin/bash . 
$(dirname $0)/common.inc +supports_tlsdesc || skip test_cflags -static || skip -if [ $MACHINE = x86_64 -o $MACHINE = arm ]; then - dialect=gnu2 -elif [ $MACHINE = aarch64 ]; then - dialect=desc -else - skip -fi - -cat < extern _Thread_local int foo; @@ -22,12 +15,12 @@ int main() { } EOF -cat < _Thread_local int foo; +extern _Thread_local int bar; int get_foo(); -int get_bar(); +int get_baz(); int main() { foo = 42; - printf("%d %d\n", get_foo(), get_bar()); + printf("%d %d %d\n", get_foo(), bar, get_baz()); return 0; } EOF -$CC -B. -o $t/exe $t/a.o $t/b.o -$QEMU $t/exe | grep -q '42 5' +$CC -B. -o $t/exe1 $t/a.o $t/b.o +$QEMU $t/exe1 | grep -q '42 3 5' -$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-no-relax -$QEMU $t/exe | grep -q '42 5' +$CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax +$QEMU $t/exe2 | grep -q '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o -$CC -B. -o $t/exe $t/b.o $t/c.so -$QEMU $t/exe | grep -q '42 5' +$CC -B. -o $t/exe3 $t/b.o $t/c.so +$QEMU $t/exe3 | grep -q '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax -$CC -B. -o $t/exe $t/b.o $t/c.so -Wl,-no-relax -$QEMU $t/exe | grep -q '42 5' +$CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax +$QEMU $t/exe4 | grep -q '42 3 5' diff --git a/test/elf/version-script19.sh b/test/elf/version-script19.sh new file mode 100755 index 00000000..45da96f1 --- /dev/null +++ b/test/elf/version-script19.sh @@ -0,0 +1,16 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat <<'EOF' > $t/a.ver +{ local: extern "C++" { foo*; }; }; +EOF + +cat < $t/log +! grep -Eq foobar $t/log || false +grep -Eq 'GLOBAL.*baz' $t/log diff --git a/third-party/blake3/.github/workflows/build_b3sum.py b/third-party/blake3/.github/workflows/build_b3sum.py new file mode 100644 index 00000000..f0e1787c --- /dev/null +++ b/third-party/blake3/.github/workflows/build_b3sum.py @@ -0,0 +1,38 @@ +#! 
/usr/bin/env python3 + +from pathlib import Path +import platform +import shutil +import subprocess +import sys + +ROOT = Path(__file__).parent.parent.parent +RUST_TARGET = sys.argv[1] + +subprocess.run( + ["cargo", "build", "--target", sys.argv[1], "--release"], cwd=ROOT / "b3sum" +) + +if platform.system() == "Windows": + original_exe_name = "b3sum.exe" +else: + original_exe_name = "b3sum" + +if platform.system() == "Windows": + new_exe_name = "b3sum_windows_x64_bin.exe" +elif platform.system() == "Darwin": + new_exe_name = "b3sum_macos_x64_bin" +elif platform.system() == "Linux": + new_exe_name = "b3sum_linux_x64_bin" +else: + raise RuntimeError("Unexpected platform: " + platform.system()) + +# Copy the built binary so that it has the upload name we want. +out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" +original_exe_path = str(out_dir / original_exe_name) +new_exe_path = str(out_dir / new_exe_name) +print("copying", repr(original_exe_path), "to", repr(new_exe_path)) +shutil.copyfile(original_exe_path, new_exe_path) + +# This lets the subsequent upload step get the filepath. 
+print("::set-output name=bin_path::" + new_exe_path) diff --git a/third-party/blake3/.github/workflows/ci.yml b/third-party/blake3/.github/workflows/ci.yml new file mode 100644 index 00000000..c1a88aaf --- /dev/null +++ b/third-party/blake3/.github/workflows/ci.yml @@ -0,0 +1,330 @@ +name: tests + +on: + push: + branches: + - "*" + # not on tags + pull_request: + +env: + BLAKE3_CI: "1" + RUSTFLAGS: "-D warnings" + RUST_BACKTRACE: "1" + +jobs: + library_tests: + name: ${{ matrix.target.name }} ${{ matrix.channel }} + runs-on: ${{ matrix.target.os }} + strategy: + fail-fast: false + matrix: + target: [ + { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, + { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } + ] + channel: [ + "stable", + "beta", + "nightly", + # The current MSRV. This crate doesn't have an official MSRV policy, + # but in practice we'll probably do what libc does: + # https://github.com/rust-lang/libs-team/issues/72. + # This test target is here so that we notice if we accidentally bump + # the MSRV, but it's not a promise that we won't bump it. + "1.66.1", + ] + + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} + profile: minimal + override: true + # Print the compiler version, for debugging. + - name: print compiler version + run: cargo run --quiet + working-directory: ./tools/compiler_version + # Print out instruction set support, for debugging. + - name: print instruction set support + run: cargo run --quiet + working-directory: ./tools/instruction_set_support + # Default tests plus Rayon and RustCrypto trait implementations. 
+ - run: cargo test --features=rayon,traits-preview + # Same but with only one thread in the Rayon pool. This can find deadlocks. + - name: "again with RAYON_NUM_THREADS=1" + run: cargo test --features=rayon,traits-preview + env: + RAYON_NUM_THREADS: 1 + # no_std tests. + - run: cargo test --no-default-features + + # A matrix of different test settings: + # - debug vs release + # - assembly vs Rust+C intrinsics vs pure Rust intrinsics + # - different levels of SIMD support + # + # Full SIMD support. + - run: cargo test --features= + - run: cargo test --features=prefer_intrinsics + - run: cargo test --features=pure + - run: cargo test --features= --release + - run: cargo test --features=prefer_intrinsics --release + - run: cargo test --features=pure --release + # No AVX-512. + - run: cargo test --features=no_avx512 + - run: cargo test --features=no_avx512,prefer_intrinsics + - run: cargo test --features=no_avx512,pure + - run: cargo test --features=no_avx512 --release + - run: cargo test --features=no_avx512,prefer_intrinsics --release + - run: cargo test --features=no_avx512,pure --release + # No AVX2. 
+ - run: cargo test --features=no_avx512,no_avx2 + - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,pure + - run: cargo test --features=no_avx512,no_avx2 --release + - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,pure --release + # No SSE4.1 + - run: cargo test --features=no_avx512,no_avx2,no_sse41 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure + - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release + # No SSE2 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release + - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release + + # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. + - run: cargo test --benches --features=rayon + env: + RUSTC_BOOTSTRAP: 1 + # Test vectors. + - name: test vectors + run: cargo test + working-directory: ./test_vectors + - name: test vectors intrinsics + run: cargo test --features=prefer_intrinsics + working-directory: ./test_vectors + - name: test vectors pure + run: cargo test --features=pure + working-directory: ./test_vectors + # Test C code. 
+ - name: cargo test C bindings assembly + run: cargo test + working-directory: ./c/blake3_c_rust_bindings + - name: cargo test C bindings intrinsics + run: cargo test --features=prefer_intrinsics + working-directory: ./c/blake3_c_rust_bindings + # Reference impl doc test. + - name: reference impl doc test + run: cargo test + working-directory: ./reference_impl + + b3sum_tests: + name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} + runs-on: ${{ matrix.target.os }} + strategy: + fail-fast: false + matrix: + target: [ + { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, + { "os": "macOS-latest", "toolchain": "x86_64-apple-darwin", "name": "macOS" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, + { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } + ] + channel: [ + "stable", + "beta", + "nightly", + # The b3sum MSRV is sometimes higher than the blake3 crate's, because + # b3sum depends on Clap. We check in the b3sum Cargo.lock, so Clap + # update shouldn't randomly break us here. + "1.66.1", + ] + + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} + profile: minimal + override: true + # Test b3sum. 
+ - name: test b3sum + run: cargo test + working-directory: ./b3sum + - name: test b3sum --no-default-features + run: cargo test --no-default-features + working-directory: ./b3sum + + cross_tests: + name: cross ${{ matrix.arch }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + arch: + - i586-unknown-linux-musl + - i686-unknown-linux-musl + - armv7-unknown-linux-gnueabihf + - aarch64-unknown-linux-gnu + - mips-unknown-linux-gnu + + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - run: cargo install cross + # Test the portable implementation on everything. + - run: cross test --target ${{ matrix.arch }} + # Test building for ancient i386 processors without guaranteed SSE2 support. + - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 + if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') + # Test the NEON implementation on ARM targets. + - run: cross test --target ${{ matrix.arch }} --features=neon + if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') + # NEON is enabled by default on aarch64, disabling it through the no_neon feature. + - run: cross test --target ${{ matrix.arch }} --features=no_neon + if: startsWith(matrix.arch, 'aarch64-') + # Test vectors. Note that this uses a hacky script due to path dependency limitations. + - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} + # C code. Same issue with the hacky script. + - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} + - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon + if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') + + # Currently only on x86. + c_tests: + name: C Makefile tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + # Test the intrinsics-based implementations. 
+ - run: make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse2.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse41.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx2.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx512.c + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test + working-directory: ./c + # Test the assembly implementations. + - run: make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse2_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_sse41_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx2_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 make -f Makefile.testing test_asm + working-directory: ./c + - run: make -f Makefile.testing clean && rm blake3_avx512_x86-64_unix.S + working-directory: ./c + - run: BLAKE3_NO_SSE2=1 BLAKE3_NO_SSE41=1 BLAKE3_NO_AVX2=1 BLAKE3_NO_AVX512=1 make -f Makefile.testing test_asm + working-directory: ./c + # Restore the files we deleted above. + - run: git checkout . + # Build the example. 
+ - run: make -f Makefile.testing example + working-directory: ./c + + # Note that this jobs builds AArch64 binaries from an x86_64 host. + build_apple_silicon: + name: build for Apple Silicon + runs-on: macOS-latest + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + target: aarch64-apple-darwin + override: true + - name: build blake3 + run: cargo build --target aarch64-apple-darwin + - name: build b3sum + run: cargo build --target aarch64-apple-darwin + working-directory: ./b3sum + + build_tinycc: + name: build with the Tiny C Compiler + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: install TCC + run: sudo apt-get install -y tcc + - name: compile + run: > + tcc -shared -O3 -o libblake3.so \ + -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ + blake3.c blake3_dispatch.c blake3_portable.c + working-directory: ./c + + # See https://github.com/BLAKE3-team/BLAKE3/issues/271 for why we test this. + # Note that this isn't guaranteed to execute on an AVX-512-supporting server, + # but hopefully at least some of the time it will. + gcc54: + name: "compile and test with GCC 5.4" + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: addnab/docker-run-action@v3 + with: + image: gcc:5.4 + options: -v ${{ github.workspace }}:/work + run: | + cat /proc/cpuinfo + curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal + cd /work + ~/.cargo/bin/cargo test --features prefer_intrinsics + + # CMake build test (Library only), current macOS/Linux only. 
+ cmake_build: + name: CMake ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest", "macOS-latest", "windows-latest"] + compiler: [gcc, clang, cl] + exclude: + - os: windows-latest + compiler: gcc + - os: ubuntu-latest + compiler: msvc + - os: macOS-latest + compiler: msvc + steps: + - uses: actions/checkout@v3 + - name: CMake generation + run: cmake -S c -B c/build -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target + - name: CMake build / install + run: cmake --build c/build --target install diff --git a/third-party/blake3/.github/workflows/tag.yml b/third-party/blake3/.github/workflows/tag.yml new file mode 100644 index 00000000..3f7e886b --- /dev/null +++ b/third-party/blake3/.github/workflows/tag.yml @@ -0,0 +1,45 @@ +name: publish_b3sum_binaries + +on: + push: + tags: + - "*" + +env: + BLAKE3_CI: "1" + RUSTFLAGS: "-D warnings" + +jobs: + cargo_tests: + name: ${{ matrix.target.name }} + runs-on: ${{ matrix.target.os }} + strategy: + fail-fast: false + matrix: + target: [ + { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, + { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, + { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": "Windows" }, + ] + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.x" + - run: pip install PyGithub + - run: sudo apt-get install musl-tools + if: matrix.target.os == 'ubuntu-latest' + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + profile: minimal + - run: rustup target add ${{ matrix.target.rust-target }} + - name: build b3sum + id: build_b3sum + run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} + - name: upload release asset + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TAG: ${{ github.ref }} + run: python -u .github/workflows/upload_github_release_asset.py ${{ 
steps.build_b3sum.outputs.bin_path }} diff --git a/third-party/blake3/.github/workflows/upload_github_release_asset.py b/third-party/blake3/.github/workflows/upload_github_release_asset.py new file mode 100755 index 00000000..76340bee --- /dev/null +++ b/third-party/blake3/.github/workflows/upload_github_release_asset.py @@ -0,0 +1,73 @@ +#! /usr/bin/env python3 + +import github +import os +import sys +import time + +RETRIES = 10 + +g = github.Github(os.environ["GITHUB_TOKEN"]) +tag_name = os.environ["GITHUB_TAG"] +tag_prefix = "refs/tags/" +if tag_name.startswith(tag_prefix): + tag_name = tag_name[len(tag_prefix) :] +assert len(sys.argv) == 2 +asset_path = sys.argv[1] +asset_name = os.path.basename(asset_path) + +repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) + +tags = list(repo.get_tags()) + +for tag in tags: + if tag.name == tag_name: + break +else: + raise RuntimeError("no tag named " + repr(tag_name)) + +try: + print("Creating GitHub release for tag " + repr(tag_name) + "...") + repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) +except github.GithubException as github_error: + if github_error.data["errors"][0]["code"] == "already_exists": + print("Release for tag " + repr(tag_name) + " already exists.") + else: + raise + + +def get_release(): + for i in range(RETRIES): + releases = list(repo.get_releases()) + for release in releases: + if release.tag_name == tag_name: + return release + print(f"Release for tag {repr(tag_name)} not found. Retrying...") + time.sleep(1) + raise RuntimeError("no release for tag " + repr(tag_name)) + + +release = get_release() + +print("Uploading " + repr(asset_path) + "...") +for i in range(RETRIES): + try: + print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) + release.upload_asset(asset_path) + break + except github.GithubException as github_error: + # Unfortunately the asset upload API is flaky. 
Even worse, it often + # partially succeeds, returning an error to the caller but leaving the + # release in a state where subsequent uploads of the same asset will + # fail with an "already_exists" error. (Though the asset is not visible + # on github.com, so we can't just declare victory and move on.) If we + # detect this case, explicitly delete the asset and continue retrying. + print(github_error) + for asset in release.get_assets(): + if asset.name == asset_name: + print("Found uploaded asset after failure. Deleting...") + asset.delete_asset() +else: + raise RuntimeError("All upload attempts failed.") + +print("Success!") diff --git a/third-party/blake3/.gitignore b/third-party/blake3/.gitignore new file mode 100644 index 00000000..fa8d85ac --- /dev/null +++ b/third-party/blake3/.gitignore @@ -0,0 +1,2 @@ +Cargo.lock +target diff --git a/third-party/blake3/CONTRIBUTING.md b/third-party/blake3/CONTRIBUTING.md new file mode 100644 index 00000000..3a605f25 --- /dev/null +++ b/third-party/blake3/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing + +We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. + +## Bug reports + +Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). + +If you report a bug, please: + +* Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). +* Provide information to help us diagnose and ideally reproduce the bug. + +## Patches + +We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring to it in the PR. + +If you contribute code and submit a patch, please note the following: + +* We use Rust's stable branch for developing BLAKE3. +* Pull requests should target the `master` branch.
+* Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). + +Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: + +```bash +cargo test +``` + +All third-party contributions will be recognized in the list of contributors. + diff --git a/third-party/blake3/Cargo.toml b/third-party/blake3/Cargo.toml new file mode 100644 index 00000000..8df13874 --- /dev/null +++ b/third-party/blake3/Cargo.toml @@ -0,0 +1,101 @@ +[package] +name = "blake3" +version = "1.4.1" +authors = ["Jack O'Connor ", "Samuel Neves"] +description = "the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +documentation = "https://docs.rs/blake3" +readme = "README.md" +edition = "2021" + +[features] +default = ["std"] + +# The NEON implementation does not participate in dynamic feature detection, +# which is currently x86-only. If "neon" is on, NEON support is assumed. Note +# that AArch64 always supports NEON, but support on ARMv7 varies. The NEON +# implementation uses C intrinsics and requires a C compiler. +neon = [] + +# This crate uses libstd for std::io trait implementations, and also for +# runtime CPU feature detection. This feature is enabled by default. If you use +# --no-default-features, the only way to use the SIMD implementations in this +# crate is to enable the corresponding instruction sets statically for the +# entire build, with e.g. RUSTFLAGS="-C target-cpu=native". +std = ["digest/std"] + +# The "rayon" feature (defined below as an optional dependency) enables the +# `Hasher::update_rayon` method, for multithreaded hashing. However, even if +# this feature is enabled, all other APIs remain single-threaded. + +# This crate implements traits from the RustCrypto project, exposed here as the +# "traits-preview" feature. 
However, these traits aren't stable, and they're +# expected to change in incompatible ways before they reach 1.0. For that +# reason, this crate makes no SemVer guarantees for this feature, and callers +# who use it should expect breaking changes between patch versions of this +# crate. (The "*-preview" feature name follows the conventions of the RustCrypto +# "signature" crate.) +traits-preview = ["digest"] + +# ---------- Features below this line are undocumented and unstable. ---------- +# The following features are mainly intended for testing and benchmarking, and +# they might change or disappear at any time without a major version bump. + +# By default on x86_64, this crate uses Samuel Neves' hand-written assembly +# implementations for SSE4.1, AVX2, and AVX512. (These provide both the best +# runtime performance, and the fastest build times.) And by default on 32-bit +# x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and +# a C intrinsics implementation for AVX-512. In both cases, if a C compiler is +# not detected, or if AVX-512 support is missing from the detected compiler, +# build.rs automatically falls back to a pure Rust build. This feature forces +# that fallback, for testing purposes. (Note that in CI testing, we set the +# BLAKE3_CI environment variable, which instructs build.rs to error out rather +# than doing an automatic fallback.) +pure = [] + +# As described above, on x86_64 this crate uses assembly implementations by +# default. Enabling the "prefer_intrinsics" feature makes this crate use +# intrinsics implementations on both 32-bit and 64-bit x86, again for testing +# purposes. +prefer_intrinsics = [] + +# Disable individual instruction sets. CI testing uses these flags to simulate +# different levels of hardware SIMD support. Note that code for the +# corresponding instruction set is still compiled; only detection is disabled. +# +# As noted above, these flags are *for testing only* and are not stable.
It's +# possible that some users might find that their particular use case performs +# better if e.g. AVX-512 is disabled, because of issues like CPU downclocking. +# If that comes up, and if disabling the instruction set here at the feature +# level turns out to be the right approach, then we can design a stable +# feature. Until then, we reserve the right to break these features in a patch +# release. +no_sse2 = [] +no_sse41 = [] +no_avx2 = [] +no_avx512 = [] +no_neon = [] + +[package.metadata.docs.rs] +# Document Hasher::update_rayon on docs.rs. +features = ["rayon"] + +[dependencies] +arrayref = "0.3.5" +arrayvec = { version = "0.7.0", default-features = false } +constant_time_eq = "0.3.0" +rayon = { version = "1.2.1", optional = true } +cfg-if = "1.0.0" +digest = { version = "0.10.1", features = [ "mac" ], optional = true } + +[dev-dependencies] +hex = "0.4.2" +page_size = "0.5.0" +rand = "0.8.0" +rand_chacha = "0.3.0" +reference_impl = { path = "./reference_impl" } +hmac = "0.12.0" + +[build-dependencies] +cc = "1.0.4" diff --git a/third-party/blake3/LICENSE b/third-party/blake3/LICENSE new file mode 100644 index 00000000..f5892efc --- /dev/null +++ b/third-party/blake3/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. 
+ +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. 
publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. 
Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. 
Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third-party/blake3/README.md b/third-party/blake3/README.md new file mode 100644 index 00000000..a63d5f2c --- /dev/null +++ b/third-party/blake3/README.md @@ -0,0 +1,221 @@ +# BLAKE3 + +BLAKE3 is a cryptographic hash function that is: + +- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. +- **Secure**, unlike MD5 and SHA-1. And secure against length extension, + unlike SHA-2. +- **Highly parallelizable** across any number of threads and SIMD lanes, + because it's a Merkle tree on the inside. +- Capable of **verified streaming** and **incremental updates**, again + because it's a Merkle tree. +- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. +- **One algorithm with no variants**, which is fast on x86-64 and also + on smaller architectures. + +The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) +is an example benchmark of 16 KiB inputs on a Cascade Lake-SP 8275CL server CPU +from 2019. For more detailed benchmarks, see the +[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). + +

+performance graph +

+ +BLAKE3 is based on an optimized instance of the established hash +function [BLAKE2](https://blake2.net) and on the [original Bao tree +mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). +The specifications and design rationale are available in the [BLAKE3 +paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). +The default output size is 256 bits. The current version of +[Bao](https://github.com/oconnor663/bao) implements verified streaming +with BLAKE3. + +This repository is the official implementation of BLAKE3. It includes: + +* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which + includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, + and NEON, with automatic runtime CPU feature detection on x86. The + `rayon` feature provides multithreading. + +* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which + provides a command line interface. It uses multithreading by default, + making it an order of magnitude faster than e.g. `sha256sum` on + typical desktop hardware. + +* The [C implementation](c), which like the Rust implementation includes + SIMD code and runtime CPU feature detection on x86. Unlike the Rust + implementation, it's [not currently multithreaded](c#multithreading). See + [`c/README.md`](c/README.md). + +* The [Rust reference implementation](reference_impl/reference_impl.rs), + which is discussed in Section 5.1 of the [BLAKE3 + paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). + This implementation is much smaller and simpler than the optimized + ones above. If you want to see how BLAKE3 works, or you're writing a + port that doesn't need multithreading or SIMD optimizations, start + here. Ports of the reference implementation to other languages are + hosted in separate repositories + ([C](https://github.com/oconnor663/blake3_reference_impl_c), + [Python](https://github.com/oconnor663/pure_python_blake3)). 
+ +* A [set of test + vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) + that covers extended outputs, all three modes, and a variety of input + lengths. + +* [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions) + +BLAKE3 was designed by: + +* [@oconnor663 ](https://github.com/oconnor663) (Jack O'Connor) +* [@sneves](https://github.com/sneves) (Samuel Neves) +* [@veorq](https://github.com/veorq) (Jean-Philippe Aumasson) +* [@zookozcash](https://github.com/zookozcash) (Zooko) + +The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co). + +*NOTE: BLAKE3 is not a password hashing algorithm, because it's +designed to be fast, whereas password hashing should not be fast. If you +hash passwords to store the hashes or if you derive keys from passwords, +we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* + +## Usage + +### The `b3sum` utility + +The `b3sum` command line utility prints the BLAKE3 hashes of files or of +standard input. Prebuilt binaries are available for Linux, Windows, and +macOS (requiring the [unidentified developer +workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) +on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). +If you've [installed Rust and +Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), +you can also build `b3sum` yourself with: + +```bash +cargo install b3sum +``` + +If `rustup` didn't configure your `PATH` for you, you might need to go +looking for the installed binary in e.g. `~/.cargo/bin`. You can test +out how fast BLAKE3 is on your machine by creating a big file and +hashing it, for example: + +```bash +# Create a 1 GB file. +head -c 1000000000 /dev/zero > /tmp/bigfile +# Hash it with SHA-256. +time openssl sha256 /tmp/bigfile +# Hash it with BLAKE3. 
+time b3sum /tmp/bigfile +``` + +### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3) + +To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to +your `Cargo.toml`. Here's an example of hashing some input bytes: + +```rust +// Hash an input all at once. +let hash1 = blake3::hash(b"foobarbaz"); + +// Hash an input incrementally. +let mut hasher = blake3::Hasher::new(); +hasher.update(b"foo"); +hasher.update(b"bar"); +hasher.update(b"baz"); +let hash2 = hasher.finalize(); +assert_eq!(hash1, hash2); + +// Extended output. OutputReader also implements Read and Seek. +let mut output = [0; 1000]; +let mut output_reader = hasher.finalize_xof(); +output_reader.fill(&mut output); +assert_eq!(hash1, output[..32]); + +// Print a hash as hex. +println!("{}", hash1); +``` + +Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and +`derive_key`. The `keyed_hash` mode takes a 256-bit key: + +```rust +// MAC an input all at once. +let example_key = [42u8; 32]; +let mac1 = blake3::keyed_hash(&example_key, b"example input"); + +// MAC incrementally. +let mut hasher = blake3::Hasher::new_keyed(&example_key); +hasher.update(b"example input"); +let mac2 = hasher.finalize(); +assert_eq!(mac1, mac2); +``` + +The `derive_key` mode takes a context string and some key material (not a +password). The context string should be hardcoded, globally unique, and +application-specific. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`: + +```rust +// Derive a couple of subkeys for different purposes. 
+const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; +const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; +let input_key_material = b"usually at least 32 random bytes, not a password"; +let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material); +let api_key = blake3::derive_key(API_CONTEXT, input_key_material); +assert_ne!(email_key, api_key); +``` + +### The C implementation + +See [`c/README.md`](c/README.md). + +### Other implementations + +We post links to third-party bindings and implementations on the +[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever +we hear about them. Some highlights include [an optimized Go +implementation](https://github.com/zeebo/blake3), [Wasm bindings for +Node.js and browsers](https://github.com/connor4312/blake3), [binary +wheels for Python](https://github.com/oconnor663/blake3-py), [.NET +bindings](https://github.com/xoofx/Blake3.NET), and [JNI +bindings](https://github.com/sken77/BLAKE3jni). + +## Contributing + +Please see [CONTRIBUTING.md](CONTRIBUTING.md). + +## Intellectual property + +The Rust code is copyright Jack O'Connor, 2019-2020. The C code is +copyright Samuel Neves and Jack O'Connor, 2019-2020. The assembly code +is copyright Samuel Neves, 2019-2020. + +This work is released into the public domain with CC0 1.0. +Alternatively, it is licensed under the Apache License 2.0. 
+ +## Adoption & deployment + +Here's a (non-exhaustive) list of protocols and software that use BLAKE3: + +* [Alephium](https://github.com/alephium/alephium/blob/master/crypto/src/main/scala/org/alephium/crypto/Blake3.scala) +* [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) +* [IPFS](https://github.com/ipfs/go-verifcid/issues/13) +* [Farcaster](https://www.farcaster.xyz/) +* [LLVM](https://reviews.llvm.org/D121510) +* [Nym](https://github.com/nymtech/nym/blob/59056a22c5e6b01a38da2124662bd1fa3c8abef2/common/nymsphinx/params/src/lib.rs#L5) +* [OpenZFS](https://github.com/openzfs/zfs/) +* [Redox](https://www.redox-os.org/news/pkgar-introduction/) +* [Saito](https://saito.tech/) +* [Skale](https://github.com/skalenetwork/skale-consensus/pull/284) +* [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) +* [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) + + +## Miscellany + +- [@veorq](https://github.com/veorq) and + [@oconnor663](https://github.com/oconnor663) did [a podcast + interview](https://www.cryptography.fm/3) about designing BLAKE3. diff --git a/third-party/blake3/b3sum/.gitignore b/third-party/blake3/b3sum/.gitignore new file mode 100644 index 00000000..9da4a887 --- /dev/null +++ b/third-party/blake3/b3sum/.gitignore @@ -0,0 +1 @@ +!Cargo.lock diff --git a/third-party/blake3/b3sum/Cargo.lock b/third-party/blake3/b3sum/Cargo.lock new file mode 100644 index 00000000..2a599a85 --- /dev/null +++ b/third-party/blake3/b3sum/Cargo.lock @@ -0,0 +1,690 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "anstream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" + +[[package]] +name = "anstyle-parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "b3sum" +version = "1.4.1" +dependencies = [ + "anyhow", + "blake3", + "clap", + "duct", + "hex", + "memmap2", + "rayon", + "tempfile", + "wild", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + +[[package]] +name = "blake3" +version = "1.4.1" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "digest", + "rayon", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" +dependencies = [ + "clap_builder", + "clap_derive", + "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", + 
"terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +dependencies = [ + "cfg-if", +] + +[[package]] +name 
= "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "duct" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ae3fc31835f74c2a7ceda3aeede378b0ae2e74c8f1c36559fcc9ae2a4e7d3e" +dependencies = [ + "libc", + "once_cell", + "os_pipe", + "shared_child", +] + +[[package]] +name = "either" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "glob" 
+version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi", + "rustix 0.38.3", + "windows-sys", +] + +[[package]] +name = "libc" +version = "0.2.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "linux-raw-sys" +version = "0.4.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" + +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "os_pipe" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "proc-macro2" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + 
"rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "rustix" +version = "0.37.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d69718bf81c6127a49dc64e44a742e8bb9213c0ff8869a22c308f84c1d4ab06" +dependencies = [ + "bitflags 1.3.2", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys 0.3.8", + "windows-sys", +] + +[[package]] +name = "rustix" +version = "0.38.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" +dependencies = [ + "bitflags 2.3.3", + "errno", + "libc", + "linux-raw-sys 0.4.3", + "windows-sys", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "shared_child" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0d94659ad3c2137fef23ae75b03d5241d633f8acded53d672decfa0e6e0caef" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "2.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +dependencies = [ + "autocfg", + "cfg-if", + "fastrand", + "redox_syscall", + "rustix 0.37.23", + "windows-sys", +] + +[[package]] +name = "terminal_size" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237" +dependencies = [ + "rustix 0.37.23", + "windows-sys", +] + +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "unicode-ident" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wild" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b116685a6be0c52f5a103334cbff26db643826c7b3735fc0a3ba9871310a74" +dependencies = [ + "glob", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" diff --git a/third-party/blake3/b3sum/Cargo.toml b/third-party/blake3/b3sum/Cargo.toml new file mode 100644 index 00000000..02c9405f --- /dev/null +++ b/third-party/blake3/b3sum/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "b3sum" +version = "1.4.1" +authors = ["Jack O'Connor "] +description = "a command line implementation of the BLAKE3 hash function" +repository = "https://github.com/BLAKE3-team/BLAKE3" +license = "CC0-1.0 OR Apache-2.0" +readme = "README.md" +edition = "2021" + +[features] +neon = ["blake3/neon"] +prefer_intrinsics = ["blake3/prefer_intrinsics"] +pure = ["blake3/pure"] + +[dependencies] +anyhow = "1.0.25" +blake3 = { version = "1", path = "..", features = ["rayon"] } +clap = { version = "4.0.8", features = ["derive", "wrap_help"] } +hex = "0.4.0" +memmap2 = "0.7.0" +rayon = "1.2.1" +wild = "2.0.3" + +[dev-dependencies] +duct = "0.13.3" +tempfile = "3.1.0" diff --git a/third-party/blake3/b3sum/README.md b/third-party/blake3/b3sum/README.md new file mode 100644 index 00000000..d1fbc213 --- /dev/null +++ b/third-party/blake3/b3sum/README.md @@ -0,0 +1,71 @@ +# b3sum + +A command line utility for calculating +[BLAKE3](https://github.com/BLAKE3-team/BLAKE3) hashes, similar to +Coreutils 
tools like `b2sum` or `md5sum`. + +``` +Usage: b3sum [OPTIONS] [FILE]... + +Arguments: + [FILE]... Files to hash, or checkfiles to check + +Options: + --keyed Use the keyed mode, reading the 32-byte key from stdin + --derive-key <CONTEXT> Use the key derivation mode, with the given context string + -l, --length <LEN> The number of output bytes, before hex encoding [default: 32] + --seek <SEEK> The starting output byte offset, before hex encoding [default: 0] + --num-threads <NUM> The maximum number of threads to use + --no-mmap Disable memory mapping + --no-names Omit filenames in the output + --raw Write raw output bytes to stdout, rather than hex + -c, --check Read BLAKE3 sums from the [FILE]s and check them + --quiet Skip printing OK for each checked file + -h, --help Print help (see more with '--help') + -V, --version Print version +``` + +See also [this document about how the `--check` flag +works](https://github.com/BLAKE3-team/BLAKE3/blob/master/b3sum/what_does_check_do.md). + +# Example + +Hash the file `foo.txt`: + +```bash +b3sum foo.txt +``` + +Time hashing a gigabyte of data, to see how fast it is: + +```bash +# Create a 1 GB file. +head -c 1000000000 /dev/zero > /tmp/bigfile +# Hash it with SHA-256. +time openssl sha256 /tmp/bigfile +# Hash it with BLAKE3. +time b3sum /tmp/bigfile +``` + + +# Installation + +Prebuilt binaries are available for Linux, Windows, and macOS (requiring +the [unidentified developer +workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) +on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). +If you've [installed Rust and +Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), +you can also build `b3sum` yourself with: + +``` +cargo install b3sum +``` + +On Linux for example, Cargo will put the compiled binary in +`~/.cargo/bin`. You might want to add that directory to your `$PATH`, or +`rustup` might have done it for you when you installed Cargo. 
+ +If you want to install directly from this directory, you can run `cargo +install --path .`. Or you can just build with `cargo build --release`, +which puts the binary at `./target/release/b3sum`. diff --git a/third-party/blake3/b3sum/src/main.rs b/third-party/blake3/b3sum/src/main.rs new file mode 100644 index 00000000..fd35f686 --- /dev/null +++ b/third-party/blake3/b3sum/src/main.rs @@ -0,0 +1,617 @@ +use anyhow::{bail, ensure, Result}; +use clap::Parser; +use std::cmp; +use std::fs::File; +use std::io; +use std::io::prelude::*; +use std::path::{Path, PathBuf}; + +#[cfg(test)] +mod unit_tests; + +const NAME: &str = "b3sum"; + +const DERIVE_KEY_ARG: &str = "derive_key"; +const KEYED_ARG: &str = "keyed"; +const LENGTH_ARG: &str = "length"; +const NO_NAMES_ARG: &str = "no_names"; +const RAW_ARG: &str = "raw"; +const CHECK_ARG: &str = "check"; + +#[derive(Parser)] +#[command(version, max_term_width(100))] +struct Inner { + /// Files to hash, or checkfiles to check + /// + /// When no file is given, or when - is given, read standard input. + file: Vec, + + /// Use the keyed mode, reading the 32-byte key from stdin + #[arg(long, requires("file"))] + keyed: bool, + + /// Use the key derivation mode, with the given context string + /// + /// Cannot be used with --keyed. + #[arg(long, value_name("CONTEXT"), conflicts_with(KEYED_ARG))] + derive_key: Option, + + /// The number of output bytes, before hex encoding + #[arg( + short, + long, + default_value_t = blake3::OUT_LEN as u64, + value_name("LEN") + )] + length: u64, + + /// The starting output byte offset, before hex encoding + #[arg(long, default_value_t = 0, value_name("SEEK"))] + seek: u64, + + /// The maximum number of threads to use + /// + /// By default, this is the number of logical cores. If this flag is + /// omitted, or if its value is 0, RAYON_NUM_THREADS is also respected. 
+ #[arg(long, value_name("NUM"))] + num_threads: Option, + + /// Disable memory mapping + /// + /// Currently this also disables multithreading. + #[arg(long)] + no_mmap: bool, + + /// Omit filenames in the output + #[arg(long)] + no_names: bool, + + /// Write raw output bytes to stdout, rather than hex + /// + /// --no-names is implied. In this case, only a single input is allowed. + #[arg(long)] + raw: bool, + + /// Read BLAKE3 sums from the [FILE]s and check them + #[arg( + short, + long, + conflicts_with(DERIVE_KEY_ARG), + conflicts_with(KEYED_ARG), + conflicts_with(LENGTH_ARG), + conflicts_with(RAW_ARG), + conflicts_with(NO_NAMES_ARG) + )] + check: bool, + + /// Skip printing OK for each checked file + /// + /// Must be used with --check. + #[arg(long, requires(CHECK_ARG))] + quiet: bool, +} + +struct Args { + inner: Inner, + file_args: Vec, + base_hasher: blake3::Hasher, +} + +impl Args { + fn parse() -> Result { + // wild::args_os() is equivalent to std::env::args_os() on Unix, + // but on Windows it adds support for globbing. + let inner = Inner::parse_from(wild::args_os()); + let file_args = if !inner.file.is_empty() { + inner.file.clone() + } else { + vec!["-".into()] + }; + if inner.raw && file_args.len() > 1 { + bail!("Only one filename can be provided when using --raw"); + } + let base_hasher = if inner.keyed { + // In keyed mode, since stdin is used for the key, we can't handle + // `-` arguments. Input::open handles that case below. + blake3::Hasher::new_keyed(&read_key_from_stdin()?) 
+ } else if let Some(ref context) = inner.derive_key { + blake3::Hasher::new_derive_key(context) + } else { + blake3::Hasher::new() + }; + Ok(Self { + inner, + file_args, + base_hasher, + }) + } + + fn num_threads(&self) -> Option { + self.inner.num_threads + } + + fn check(&self) -> bool { + self.inner.check + } + + fn raw(&self) -> bool { + self.inner.raw + } + + fn no_mmap(&self) -> bool { + self.inner.no_mmap + } + + fn no_names(&self) -> bool { + self.inner.no_names + } + + fn len(&self) -> u64 { + self.inner.length + } + + fn seek(&self) -> u64 { + self.inner.seek + } + + fn keyed(&self) -> bool { + self.inner.keyed + } + + fn quiet(&self) -> bool { + self.inner.quiet + } +} + +enum Input { + Mmap(io::Cursor), + File(File), + Stdin, +} + +impl Input { + // Open an input file, using mmap if appropriate. "-" means stdin. Note + // that this convention applies both to command line arguments, and to + // filepaths that appear in a checkfile. + fn open(path: &Path, args: &Args) -> Result { + if path == Path::new("-") { + if args.keyed() { + bail!("Cannot open `-` in keyed mode"); + } + return Ok(Self::Stdin); + } + let file = File::open(path)?; + if !args.no_mmap() { + if let Some(mmap) = maybe_memmap_file(&file)? { + return Ok(Self::Mmap(io::Cursor::new(mmap))); + } + } + Ok(Self::File(file)) + } + + fn hash(&mut self, args: &Args) -> Result { + let mut hasher = args.base_hasher.clone(); + match self { + // The fast path: If we mmapped the file successfully, hash using + // multiple threads. This doesn't work on stdin, or on some files, + // and it can also be disabled with --no-mmap. + Self::Mmap(cursor) => { + hasher.update_rayon(cursor.get_ref()); + } + // The slower paths, for stdin or files we didn't/couldn't mmap. + // This is currently all single-threaded. Doing multi-threaded + // hashing without memory mapping is tricky, since all your worker + // threads have to stop every time you refill the buffer, and that + // ends up being a lot of overhead. 
To solve that, we need a more + // complicated double-buffering strategy where a background thread + // fills one buffer while the worker threads are hashing the other + // one. We might implement that in the future, but since this is + // the slow path anyway, it's not high priority. + Self::File(file) => { + copy_wide(file, &mut hasher)?; + } + Self::Stdin => { + let stdin = io::stdin(); + let lock = stdin.lock(); + copy_wide(lock, &mut hasher)?; + } + } + let mut output_reader = hasher.finalize_xof(); + output_reader.set_position(args.seek()); + Ok(output_reader) + } +} + +impl Read for Input { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + Self::Mmap(cursor) => cursor.read(buf), + Self::File(file) => file.read(buf), + Self::Stdin => io::stdin().read(buf), + } + } +} + +// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets +// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms +// can support at least 64 KiB, and there's some performance benefit to using +// bigger reads, so that's what we use here. +fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result { + let mut buffer = [0; 65536]; + let mut total = 0; + loop { + match reader.read(&mut buffer) { + Ok(0) => return Ok(total), + Ok(n) => { + hasher.update(&buffer[..n]); + total += n as u64; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } +} + +// Mmap a file, if it looks like a good idea. Return None in cases where we +// know mmap will fail, or if the file is short enough that mmapping isn't +// worth it. However, if we do try to mmap and it fails, return the error. +fn maybe_memmap_file(file: &File) -> Result> { + let metadata = file.metadata()?; + let file_size = metadata.len(); + Ok(if !metadata.is_file() { + // Not a real file. + None + } else if file_size > isize::max_value() as u64 { + // Too long to safely map. 
+ // https://github.com/danburkert/memmap-rs/issues/69 + None + } else if file_size == 0 { + // Mapping an empty file currently fails. + // https://github.com/danburkert/memmap-rs/issues/72 + None + } else if file_size < 16 * 1024 { + // Mapping small files is not worth it. + None + } else { + // Explicitly set the length of the memory map, so that filesystem + // changes can't race to violate the invariants we just checked. + let map = unsafe { + memmap2::MmapOptions::new() + .len(file_size as usize) + .map(file)? + }; + Some(map) + }) +} + +fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> { + // Encoding multiples of the 64 bytes is most efficient. + // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64. + // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF. + let mut len = args.len(); + let mut block = [0; blake3::guts::BLOCK_LEN]; + while len > 0 { + output.fill(&mut block); + let hex_str = hex::encode(&block[..]); + let take_bytes = cmp::min(len, block.len() as u64); + print!("{}", &hex_str[..2 * take_bytes as usize]); + len -= take_bytes; + } + Ok(()) +} + +fn write_raw_output(output: blake3::OutputReader, args: &Args) -> Result<()> { + let mut output = output.take(args.len()); + let stdout = std::io::stdout(); + let mut handler = stdout.lock(); + std::io::copy(&mut output, &mut handler)?; + + Ok(()) +} + +fn read_key_from_stdin() -> Result<[u8; blake3::KEY_LEN]> { + let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1); + let n = std::io::stdin() + .lock() + .take(blake3::KEY_LEN as u64 + 1) + .read_to_end(&mut bytes)?; + if n < blake3::KEY_LEN { + bail!( + "expected {} key bytes from stdin, found {}", + blake3::KEY_LEN, + n, + ) + } else if n > blake3::KEY_LEN { + bail!("read more than {} key bytes from stdin", blake3::KEY_LEN) + } else { + Ok(bytes[..blake3::KEY_LEN].try_into().unwrap()) + } +} + +struct FilepathString { + filepath_string: 
String, + is_escaped: bool, +} + +// returns (string, did_escape) +fn filepath_to_string(filepath: &Path) -> FilepathString { + let unicode_cow = filepath.to_string_lossy(); + let mut filepath_string = unicode_cow.to_string(); + // If we're on Windows, normalize backslashes to forward slashes. This + // avoids a lot of ugly escaping in the common case, and it makes + // checkfiles created on Windows more likely to be portable to Unix. It + // also allows us to set a blanket "no backslashes allowed in checkfiles on + // Windows" rule, rather than allowing a Unix backslash to potentially get + // interpreted as a directory separator on Windows. + if cfg!(windows) { + filepath_string = filepath_string.replace('\\', "/"); + } + let mut is_escaped = false; + if filepath_string.contains('\\') || filepath_string.contains('\n') { + filepath_string = filepath_string.replace('\\', "\\\\").replace('\n', "\\n"); + is_escaped = true; + } + FilepathString { + filepath_string, + is_escaped, + } +} + +fn hex_half_byte(c: char) -> Result { + // The hex characters in the hash must be lowercase for now, though we + // could support uppercase too if we wanted to. + if '0' <= c && c <= '9' { + return Ok(c as u8 - '0' as u8); + } + if 'a' <= c && c <= 'f' { + return Ok(c as u8 - 'a' as u8 + 10); + } + bail!("Invalid hex"); +} + +// The `check` command is a security tool. That means it's much better for a +// check to fail more often than it should (a false negative), than for a check +// to ever succeed when it shouldn't (a false positive). By forbidding certain +// characters in checked filepaths, we avoid a class of false positives where +// two different filepaths can get confused with each other. +fn check_for_invalid_characters(utf8_path: &str) -> Result<()> { + // Null characters in paths should never happen, but they can result in a + // path getting silently truncated on Unix. 
+ if utf8_path.contains('\0') { + bail!("Null character in path"); + } + // Because we convert invalid UTF-8 sequences in paths to the Unicode + // replacement character, multiple different invalid paths can map to the + // same UTF-8 string. + if utf8_path.contains('�') { + bail!("Unicode replacement character in path"); + } + // We normalize all Windows backslashes to forward slashes in our output, + // so the only natural way to get a backslash in a checkfile on Windows is + // to construct it on Unix and copy it over. (Or of course you could just + // doctor it by hand.) To avoid confusing this with a directory separator, + // we forbid backslashes entirely on Windows. Note that this check comes + // after unescaping has been done. + if cfg!(windows) && utf8_path.contains('\\') { + bail!("Backslash in path"); + } + Ok(()) +} + +fn unescape(mut path: &str) -> Result { + let mut unescaped = String::with_capacity(2 * path.len()); + while let Some(i) = path.find('\\') { + ensure!(i < path.len() - 1, "Invalid backslash escape"); + unescaped.push_str(&path[..i]); + match path[i + 1..].chars().next().unwrap() { + // Anything other than a recognized escape sequence is an error. + 'n' => unescaped.push_str("\n"), + '\\' => unescaped.push_str("\\"), + _ => bail!("Invalid backslash escape"), + } + path = &path[i + 2..]; + } + unescaped.push_str(path); + Ok(unescaped) +} + +#[derive(Debug)] +struct ParsedCheckLine { + file_string: String, + is_escaped: bool, + file_path: PathBuf, + expected_hash: blake3::Hash, +} + +fn parse_check_line(mut line: &str) -> Result { + // Trim off the trailing newline, if any. + line = line.trim_end_matches('\n'); + // If there's a backslash at the front of the line, that means we need to + // unescape the path below. This matches the behavior of e.g. md5sum. 
+ let first = if let Some(c) = line.chars().next() { + c + } else { + bail!("Empty line"); + }; + let mut is_escaped = false; + if first == '\\' { + is_escaped = true; + line = &line[1..]; + } + // The front of the line must be a hash of the usual length, followed by + // two spaces. The hex characters in the hash must be lowercase for now, + // though we could support uppercase too if we wanted to. + let hash_hex_len = 2 * blake3::OUT_LEN; + let num_spaces = 2; + let prefix_len = hash_hex_len + num_spaces; + ensure!(line.len() > prefix_len, "Short line"); + ensure!( + line.chars().take(prefix_len).all(|c| c.is_ascii()), + "Non-ASCII prefix" + ); + ensure!(&line[hash_hex_len..][..2] == " ", "Invalid space"); + // Decode the hash hex. + let mut hash_bytes = [0; blake3::OUT_LEN]; + let mut hex_chars = line[..hash_hex_len].chars(); + for byte in &mut hash_bytes { + let high_char = hex_chars.next().unwrap(); + let low_char = hex_chars.next().unwrap(); + *byte = 16 * hex_half_byte(high_char)? + hex_half_byte(low_char)?; + } + let expected_hash: blake3::Hash = hash_bytes.into(); + let file_string = line[prefix_len..].to_string(); + let file_path_string = if is_escaped { + // If we detected a backslash at the start of the line earlier, now we + // need to unescape backslashes and newlines. + unescape(&file_string)? 
+ } else { + file_string.clone().into() + }; + check_for_invalid_characters(&file_path_string)?; + Ok(ParsedCheckLine { + file_string, + is_escaped, + file_path: file_path_string.into(), + expected_hash, + }) +} + +fn hash_one_input(path: &Path, args: &Args) -> Result<()> { + let mut input = Input::open(path, args)?; + let output = input.hash(args)?; + if args.raw() { + write_raw_output(output, args)?; + return Ok(()); + } + if args.no_names() { + write_hex_output(output, args)?; + println!(); + return Ok(()); + } + let FilepathString { + filepath_string, + is_escaped, + } = filepath_to_string(path); + if is_escaped { + print!("\\"); + } + write_hex_output(output, args)?; + println!(" {}", filepath_string); + Ok(()) +} + +// Returns true for success. Having a boolean return value here, instead of +// passing down the files_failed reference, makes it less likely that we might +// forget to set it in some error condition. +fn check_one_line(line: &str, args: &Args) -> bool { + let parse_result = parse_check_line(&line); + let ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = match parse_result { + Ok(parsed) => parsed, + Err(e) => { + eprintln!("{}: {}", NAME, e); + return false; + } + }; + let file_string = if is_escaped { + "\\".to_string() + &file_string + } else { + file_string + }; + let hash_result: Result = Input::open(&file_path, args) + .and_then(|mut input| input.hash(args)) + .map(|mut hash_output| { + let mut found_hash_bytes = [0; blake3::OUT_LEN]; + hash_output.fill(&mut found_hash_bytes); + found_hash_bytes.into() + }); + let found_hash: blake3::Hash = match hash_result { + Ok(hash) => hash, + Err(e) => { + println!("{}: FAILED ({})", file_string, e); + return false; + } + }; + // This is a constant-time comparison. 
+ if expected_hash == found_hash { + if !args.quiet() { + println!("{}: OK", file_string); + } + true + } else { + println!("{}: FAILED", file_string); + false + } +} + +fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> Result<()> { + let checkfile_input = Input::open(path, args)?; + let mut bufreader = io::BufReader::new(checkfile_input); + let mut line = String::new(); + loop { + line.clear(); + let n = bufreader.read_line(&mut line)?; + if n == 0 { + return Ok(()); + } + // check_one_line() prints errors and turns them into a success=false + // return, so it doesn't return a Result. + let success = check_one_line(&line, args); + if !success { + // We use `files_failed > 0` to indicate a mismatch, so it's important for correctness + // that it's impossible for this counter to overflow. + *files_failed = files_failed.saturating_add(1); + } + } +} + +fn main() -> Result<()> { + let args = Args::parse()?; + let mut thread_pool_builder = rayon::ThreadPoolBuilder::new(); + if let Some(num_threads) = args.num_threads() { + thread_pool_builder = thread_pool_builder.num_threads(num_threads); + } + let thread_pool = thread_pool_builder.build()?; + thread_pool.install(|| { + let mut files_failed = 0u64; + // Note that file_args automatically includes `-` if nothing is given. + for path in &args.file_args { + if args.check() { + check_one_checkfile(path, &args, &mut files_failed)?; + } else { + // Errors encountered in hashing are tolerated and printed to + // stderr. This allows e.g. `b3sum *` to print errors for + // non-files and keep going. However, if we encounter any + // errors we'll still return non-zero at the end. 
+ let result = hash_one_input(path, &args); + if let Err(e) = result { + files_failed = files_failed.saturating_add(1); + eprintln!("{}: {}: {}", NAME, path.to_string_lossy(), e); + } + } + } + if args.check() && files_failed > 0 { + eprintln!( + "{}: WARNING: {} computed checksum{} did NOT match", + NAME, + files_failed, + if files_failed == 1 { "" } else { "s" }, + ); + } + std::process::exit(if files_failed > 0 { 1 } else { 0 }); + }) +} + +#[cfg(test)] +mod test { + use clap::CommandFactory; + + #[test] + fn test_args() { + crate::Inner::command().debug_assert(); + } +} diff --git a/third-party/blake3/b3sum/src/unit_tests.rs b/third-party/blake3/b3sum/src/unit_tests.rs new file mode 100644 index 00000000..1fa1a17d --- /dev/null +++ b/third-party/blake3/b3sum/src/unit_tests.rs @@ -0,0 +1,189 @@ +use std::path::Path; + +#[test] +fn test_parse_check_line() { + // ========================= + // ===== Success Cases ===== + // ========================= + + // the basic case + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "0909090909090909090909090909090909090909090909090909090909090909 foo", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x09; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "foo"); + assert_eq!(file_path, Path::new("foo")); + + // regular whitespace + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa fo \to\n\n\n", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "fo \to"); + assert_eq!(file_path, Path::new("fo \to")); + + // path is one space + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4242424242424242424242424242424242424242424242424242424242424242 ", + ) + 
.unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x42; 32])); + assert!(!is_escaped); + assert_eq!(file_string, " "); + assert_eq!(file_path, Path::new(" ")); + + // *Unescaped* backslashes. Note that this line does *not* start with a + // backslash, so something like "\" + "n" is interpreted as *two* + // characters. We forbid all backslashes on Windows, so this test is + // Unix-only. + if cfg!(not(windows)) { + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4343434343434343434343434343434343434343434343434343434343434343 fo\\a\\no", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x43; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "fo\\a\\no"); + assert_eq!(file_path, Path::new("fo\\a\\no")); + } + + // escaped newline + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\n\\no", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x44; 32])); + assert!(is_escaped); + assert_eq!(file_string, "fo\\n\\no"); + assert_eq!(file_path, Path::new("fo\n\no")); + + // Escaped newline and backslash. Again because backslash is not allowed on + // Windows, this test is Unix-only. 
+ if cfg!(not(windows)) { + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "\\4545454545454545454545454545454545454545454545454545454545454545 fo\\n\\\\o", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x45; 32])); + assert!(is_escaped); + assert_eq!(file_string, "fo\\n\\\\o"); + assert_eq!(file_path, Path::new("fo\n\\o")); + } + + // non-ASCII path + let crate::ParsedCheckLine { + file_string, + is_escaped, + file_path, + expected_hash, + } = crate::parse_check_line( + "4646464646464646464646464646464646464646464646464646464646464646 å¦è®¤", + ) + .unwrap(); + assert_eq!(expected_hash, blake3::Hash::from([0x46; 32])); + assert!(!is_escaped); + assert_eq!(file_string, "å¦è®¤"); + assert_eq!(file_path, Path::new("å¦è®¤")); + + // ========================= + // ===== Failure Cases ===== + // ========================= + + // too short + crate::parse_check_line("").unwrap_err(); + crate::parse_check_line("0").unwrap_err(); + crate::parse_check_line("00").unwrap_err(); + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000") + .unwrap_err(); + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 ") + .unwrap_err(); + + // not enough spaces + crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 foo") + .unwrap_err(); + + // capital letter hex + crate::parse_check_line( + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA foo", + ) + .unwrap_err(); + + // non-hex hex + crate::parse_check_line( + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx foo", + ) + .unwrap_err(); + + // non-ASCII hex + crate::parse_check_line("你好, 我å«æ°å…‹. 认识你很高兴. è¦ä¸è¦åƒä¸ªé¦™è•‰? 
foo").unwrap_err(); + + // invalid escape sequence + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\o", + ) + .unwrap_err(); + + // truncated escape sequence + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 foo\\", + ) + .unwrap_err(); + + // null char + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo\0o", + ) + .unwrap_err(); + + // Unicode replacement char + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo�o", + ) + .unwrap_err(); + + // On Windows only, backslashes are not allowed, escaped or otherwise. + if cfg!(windows) { + crate::parse_check_line( + "0000000000000000000000000000000000000000000000000000000000000000 fo\\o", + ) + .unwrap_err(); + crate::parse_check_line( + "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\\\o", + ) + .unwrap_err(); + } +} diff --git a/third-party/blake3/b3sum/tests/cli_tests.rs b/third-party/blake3/b3sum/tests/cli_tests.rs new file mode 100644 index 00000000..d5d4efa3 --- /dev/null +++ b/third-party/blake3/b3sum/tests/cli_tests.rs @@ -0,0 +1,613 @@ +use duct::cmd; +use std::ffi::OsString; +use std::fs; +use std::io::prelude::*; +use std::path::PathBuf; + +pub fn b3sum_exe() -> PathBuf { + env!("CARGO_BIN_EXE_b3sum").into() +} + +#[test] +fn test_hash_one() { + let expected = format!("{} -", blake3::hash(b"foo").to_hex()); + let output = cmd!(b3sum_exe()).stdin_bytes("foo").read().unwrap(); + assert_eq!(&*expected, output); +} + +#[test] +fn test_hash_one_raw() { + let expected = blake3::hash(b"foo").as_bytes().to_owned(); + let output = cmd!(b3sum_exe(), "--raw") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected, output.as_slice()); +} + +#[test] +fn test_hash_many() { + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, 
b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let output = cmd!(b3sum_exe(), &file1, &file2).read().unwrap(); + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + let expected = format!( + "{} {}\n{} {}", + foo_hash.to_hex(), + // account for slash normalization on Windows + file1.to_string_lossy().replace("\\", "/"), + bar_hash.to_hex(), + file2.to_string_lossy().replace("\\", "/"), + ); + assert_eq!(expected, output); + + let output_no_names = cmd!(b3sum_exe(), "--no-names", &file1, &file2) + .read() + .unwrap(); + let expected_no_names = format!("{}\n{}", foo_hash.to_hex(), bar_hash.to_hex(),); + assert_eq!(expected_no_names, output_no_names); +} + +#[test] +fn test_missing_files() { + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let output = cmd!(b3sum_exe(), "file1", "missing_file", "file2") + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + assert!(!output.status.success()); + + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + let expected_stdout = format!( + "{} file1\n{} file2\n", + foo_hash.to_hex(), + bar_hash.to_hex(), + ); + assert_eq!(expected_stdout.as_bytes(), &output.stdout[..]); + + let bing_error = fs::File::open(dir.path().join("missing_file")).unwrap_err(); + let expected_stderr = format!("b3sum: missing_file: {}\n", bing_error.to_string()); + assert_eq!(expected_stderr.as_bytes(), &output.stderr[..]); +} + +#[test] +fn test_hash_length_and_seek() { + let mut expected = [0; 100]; + blake3::Hasher::new() + .update(b"foo") + .finalize_xof() + .fill(&mut expected); + let output = cmd!(b3sum_exe(), "--raw", "--length=100") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected[..], output); + + let 
short_output = cmd!(b3sum_exe(), "--raw", "--length=99") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected[..99], short_output); + + let seek1_output = cmd!(b3sum_exe(), "--raw", "--length=99", "--seek=1") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected[1..], seek1_output); + + let seek99_output = cmd!(b3sum_exe(), "--raw", "--length=1", "--seek=99") + .stdin_bytes("foo") + .stdout_capture() + .run() + .unwrap() + .stdout; + assert_eq!(expected[99..], seek99_output); +} + +#[test] +fn test_keyed() { + let key = [42; blake3::KEY_LEN]; + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"foo").unwrap(); + f.as_file().flush().unwrap(); + let expected = blake3::keyed_hash(&key, b"foo").to_hex(); + let output = cmd!(b3sum_exe(), "--keyed", "--no-names", f.path()) + .stdin_bytes(&key[..]) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); + + // Make sure that keys of the wrong length lead to errors. + for bad_length in [0, 1, blake3::KEY_LEN - 1, blake3::KEY_LEN + 1] { + dbg!(bad_length); + let output = cmd!(b3sum_exe(), "--keyed", f.path()) + .stdin_bytes(vec![0; bad_length]) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + assert!(!output.status.success()); + assert!(output.stdout.is_empty()); + // Make sure the error message is relevant. 
+ let stderr = std::str::from_utf8(&output.stderr).unwrap(); + assert!(stderr.contains("key bytes")); + } +} + +#[test] +fn test_derive_key() { + let context = "BLAKE3 2019-12-28 10:28:41 example context"; + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"key material").unwrap(); + f.as_file().flush().unwrap(); + let expected = hex::encode(blake3::derive_key(context, b"key material")); + let output = cmd!(b3sum_exe(), "--derive-key", context, "--no-names", f.path()) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + +#[test] +fn test_no_mmap() { + let f = tempfile::NamedTempFile::new().unwrap(); + f.as_file().write_all(b"foo").unwrap(); + f.as_file().flush().unwrap(); + + let expected = blake3::hash(b"foo").to_hex(); + let output = cmd!(b3sum_exe(), "--no-mmap", "--no-names", f.path()) + .read() + .unwrap(); + assert_eq!(&*expected, &*output); +} + +#[test] +fn test_length_without_value_is_an_error() { + let result = cmd!(b3sum_exe(), "--length") + .stdin_bytes("foo") + .stderr_capture() + .run(); + assert!(result.is_err()); +} + +#[test] +fn test_raw_with_multi_files_is_an_error() { + let f1 = tempfile::NamedTempFile::new().unwrap(); + let f2 = tempfile::NamedTempFile::new().unwrap(); + + // Make sure it doesn't error with just one file + let result = cmd!(b3sum_exe(), "--raw", f1.path()).stdout_capture().run(); + assert!(result.is_ok()); + + // Make sure it errors when both file are passed + let result = cmd!(b3sum_exe(), "--raw", f1.path(), f2.path()) + .stderr_capture() + .run(); + assert!(result.is_err()); +} + +#[test] +#[cfg(unix)] +fn test_newline_and_backslash_escaping_on_unix() { + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::create_dir(dir.path().join("subdir")).unwrap(); + let names = [ + "abcdef", + "abc\ndef", + "abc\\def", + "abc\rdef", + "abc\r\ndef", + "subdir/foo", + ]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + 
println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +\\{0} abc\\ndef +\\{0} abc\\\\def +{0} abc\rdef +\\{0} abc\r\\ndef +{0} subdir/foo", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(windows)] +fn test_slash_normalization_on_windows() { + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::create_dir(dir.path().join("subdir")).unwrap(); + // Note that filenames can't contain newlines or backslashes on Windows, so + // we don't test escaping here. We only test forward slash and backslash as + // directory separators. + let names = ["abcdef", "subdir/foo", "subdir\\bar"]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} subdir/foo +{0} subdir/bar", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(unix)] +fn test_invalid_unicode_on_unix() { + use std::os::unix::ffi::OsStringExt; + + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + let names = ["abcdef".into(), OsString::from_vec(b"abc\xffdef".to_vec())]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + // Note: Some operating systems, macOS in 
particular, simply don't + // allow invalid Unicode in filenames. On those systems, this write + // will fail. That's fine, we'll just short-circuit this test in that + // case. But assert that at least Linux allows this. + let write_result = fs::write(&path, b""); + if cfg!(target_os = "linux") { + write_result.expect("Linux should allow invalid Unicode"); + } else if write_result.is_err() { + return; + } + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} abc�def", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +#[cfg(windows)] +fn test_invalid_unicode_on_windows() { + use std::os::windows::ffi::OsStringExt; + + let empty_hash = blake3::hash(b"").to_hex(); + let dir = tempfile::tempdir().unwrap(); + let surrogate_char = 0xDC00; + let bad_unicode_wchars = [ + 'a' as u16, + 'b' as u16, + 'c' as u16, + surrogate_char, + 'd' as u16, + 'e' as u16, + 'f' as u16, + ]; + let bad_osstring = OsString::from_wide(&bad_unicode_wchars); + let names = ["abcdef".into(), bad_osstring]; + let mut paths = Vec::new(); + for name in &names { + let path = dir.path().join(name); + println!("creating file at {:?}", path); + fs::write(&path, b"").unwrap(); + paths.push(path); + } + let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); + let expected = format!( + "\ +{0} abcdef +{0} abc�def", + empty_hash, + ); + println!("output"); + println!("======"); + println!("{}", output); + println!(); + println!("expected"); + println!("========"); + println!("{}", expected); + println!(); + assert_eq!(expected, output); +} + +#[test] +fn test_check() { + // Make a directory full of files, and make sure the b3sum output in that + // directory is what we expect. 
+ let a_hash = blake3::hash(b"a").to_hex(); + let b_hash = blake3::hash(b"b").to_hex(); + let cd_hash = blake3::hash(b"cd").to_hex(); + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("a"), b"a").unwrap(); + fs::write(dir.path().join("b"), b"b").unwrap(); + fs::create_dir(dir.path().join("c")).unwrap(); + fs::write(dir.path().join("c/d"), b"cd").unwrap(); + let output = cmd!(b3sum_exe(), "a", "b", "c/d") + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_checkfile = format!( + "{} a\n\ + {} b\n\ + {} c/d\n", + a_hash, b_hash, cd_hash, + ); + assert_eq!(expected_checkfile, stdout); + assert_eq!("", stderr); + + // Now use the output we just validated as a checkfile, passed to stdin. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes(expected_checkfile.as_bytes()) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_output = "\ + a: OK\n\ + b: OK\n\ + c/d: OK\n"; + assert_eq!(expected_check_output, stdout); + assert_eq!("", stderr); + + // Now pass the same checkfile twice on the command line just for fun. 
+ let checkfile_path = dir.path().join("checkfile"); + fs::write(&checkfile_path, &expected_checkfile).unwrap(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path, &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let mut double_check_output = String::new(); + double_check_output.push_str(&expected_check_output); + double_check_output.push_str(&expected_check_output); + assert_eq!(double_check_output, stdout); + assert_eq!("", stderr); + + // Corrupt one of the files and check again. + fs::write(dir.path().join("b"), b"CORRUPTION").unwrap(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = "\ + a: OK\n\ + b: FAILED\n\ + c/d: OK\n"; + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!( + "b3sum: WARNING: 1 computed checksum did NOT match\n", + stderr, + ); + + // Delete one of the files and check again. 
+ fs::remove_file(dir.path().join("b")).unwrap(); + let open_file_error = fs::File::open(dir.path().join("b")).unwrap_err(); + let output = cmd!(b3sum_exe(), "--check", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = format!( + "a: OK\n\ + b: FAILED ({})\n\ + c/d: OK\n", + open_file_error, + ); + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!( + "b3sum: WARNING: 1 computed checksum did NOT match\n", + stderr, + ); + + // Confirm that --quiet suppresses the OKs but not the FAILEDs. + let output = cmd!(b3sum_exe(), "--check", "--quiet", &checkfile_path) + .dir(dir.path()) + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_check_failure = format!("b: FAILED ({})\n", open_file_error); + assert!(!output.status.success()); + assert_eq!(expected_check_failure, stdout); + assert_eq!( + "b3sum: WARNING: 1 computed checksum did NOT match\n", + stderr, + ); +} + +#[test] +fn test_check_invalid_characters() { + // Check that a null character in the path fails. 
+ let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \0") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_stderr = "\ + b3sum: Null character in path\n\ + b3sum: WARNING: 1 computed checksum did NOT match\n"; + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!(expected_stderr, stderr); + + // Check that a Unicode replacement character in the path fails. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 �") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_stderr = "\ + b3sum: Unicode replacement character in path\n\ + b3sum: WARNING: 1 computed checksum did NOT match\n"; + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!(expected_stderr, stderr); + + // Check that an invalid escape sequence in the path fails. + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("\\0000000000000000000000000000000000000000000000000000000000000000 \\a") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_stderr = "\ + b3sum: Invalid backslash escape\n\ + b3sum: WARNING: 1 computed checksum did NOT match\n"; + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!(expected_stderr, stderr); + + // Windows also forbids literal backslashes. Check for that if and only if + // we're on Windows. 
+ if cfg!(windows) { + let output = cmd!(b3sum_exe(), "--check") + .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \\") + .stdout_capture() + .stderr_capture() + .unchecked() + .run() + .unwrap(); + let stdout = std::str::from_utf8(&output.stdout).unwrap(); + let stderr = std::str::from_utf8(&output.stderr).unwrap(); + let expected_stderr = "\ + b3sum: Backslash in path\n\ + b3sum: WARNING: 1 computed checksum did NOT match\n"; + assert!(!output.status.success()); + assert_eq!("", stdout); + assert_eq!(expected_stderr, stderr); + } +} + +#[test] +fn test_globbing() { + // On Unix, globbing is provided by the shell. On Windows, globbing is + // provided by us, using the `wild` crate. + let dir = tempfile::tempdir().unwrap(); + let file1 = dir.path().join("file1"); + fs::write(&file1, b"foo").unwrap(); + let file2 = dir.path().join("file2"); + fs::write(&file2, b"bar").unwrap(); + + let foo_hash = blake3::hash(b"foo"); + let bar_hash = blake3::hash(b"bar"); + // NOTE: This assumes that the glob will be expanded in alphabetical order, + // to "file1 file2" rather than "file2 file1". So far, this seems to + // be true (guaranteed?) of Unix shell behavior, and true in practice + // with the `wild` crate on Windows. It's possible that this could + // start failing in the future, though, or on some unknown platform. + // If that ever happens, we'll need to relax this test somehow, + // probably by just testing for both possible outputs. I'm not + // handling that case in advance, though, because I'd prefer to hear + // about it if it comes up. 
+ let expected = format!("{} file1\n{} file2", foo_hash.to_hex(), bar_hash.to_hex()); + + let star_command = format!("{} *", b3sum_exe().to_str().unwrap()); + let (exe, c_flag) = if cfg!(windows) { + ("cmd.exe", "/C") + } else { + ("/bin/sh", "-c") + }; + let output = cmd!(exe, c_flag, star_command) + .dir(dir.path()) + .read() + .unwrap(); + assert_eq!(expected, output); +} diff --git a/third-party/blake3/b3sum/what_does_check_do.md b/third-party/blake3/b3sum/what_does_check_do.md new file mode 100644 index 00000000..3af0e53b --- /dev/null +++ b/third-party/blake3/b3sum/what_does_check_do.md @@ -0,0 +1,174 @@ +# How does `b3sum --check` behave exactly?
or: Are filepaths...text? + +Most of the time, `b3sum --check` is a drop-in replacement for `md5sum --check` +and other Coreutils hashing tools. It consumes a checkfile (the output of a +regular `b3sum` command), re-hashes all the files listed there, and returns +success if all of those hashes are still correct. What makes this more +complicated than it might seem, is that representing filepaths as text means we +need to consider many possible edge cases of unrepresentable filepaths. This +document describes all of these edge cases in detail. + +## The simple case + +Here's the result of running `b3sum a b c/d` in a directory that contains +those three files: + +```bash +$ echo hi > a +$ echo lo > b +$ mkdir c +$ echo stuff > c/d +$ b3sum a b c/d +0b8b60248fad7ac6dfac221b7e01a8b91c772421a15b387dd1fb2d6a94aee438 a +6ae4a57bbba24f79c461d30bcb4db973b9427d9207877e34d2d74528daa84115 b +2d477356c962e54784f1c5dc5297718d92087006f6ee96b08aeaf7f3cd252377 c/d +``` + +If we pipe that output into `b3sum --check`, it will exit with status zero +(success) and print: + +```bash +$ b3sum a b c/d | b3sum --check +a: OK +b: OK +c/d: OK +``` + +If we delete `b` and change the contents of `c/d`, and then use the same +checkfile as above, `b3sum --check` will exit with a non-zero status (failure) +and print: + +```bash +$ b3sum a b c/d > checkfile +$ rm b +$ echo more stuff >> c/d +$ b3sum --check checkfile +a: OK +b: FAILED (No such file or directory (os error 2)) +c/d: FAILED +``` + +In these typical cases, `b3sum` and `md5sum` have identical output for success +and very similar output for failure. + +## Escaping newlines and backslashes + +Since the checkfile format (the regular output format of `b3sum`) is +newline-separated text, we need to worry about what happens when a filepath +contains a newline, or worse. Suppose we create a file named `x[newline]x` +(3 characters). 
One way to create such a file is with a Python one-liner like +this: + +```python +>>> open("x\nx", "w") +``` + +Here's what happens when we hash that file with `b3sum`: + +```bash +$ b3sum x* +\af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 x\nx +``` + +Notice two things. First, `b3sum` puts a single `\` character at the front of +the line. This indicates that the filepath contains escape sequences that +`b3sum --check` will need to unescape. Then, `b3sum` replaces the newline +character in the filepath with the two-character escape sequence `\n`. +Similarly, if the filepath contained a backslash, `b3sum` would escape it as +`\\` in the output. So far, all of this behavior is still identical to +`md5sum`. + +## Invalid Unicode + +This is where `b3sum` and `md5sum` diverge. Apart from the newline and +backslash escapes described above, `md5sum` copies all other filepath bytes +verbatim to its output. That means its output encoding is "ASCII plus whatever +bytes we got from the command line". This creates two problems: + +1. Printing something that isn't UTF-8 is kind of gross. +2. Windows support. + +What's the deal with Windows? To start with, there's a fundamental difference +in how Unix and Windows represent filepaths. Unix filepaths are "usually UTF-8" +and Windows filepaths are "usually UTF-16". That means that a file named `abc` +is typically represented as the bytes `[97, 98, 99]` on Unix and as the bytes +`[97, 0, 98, 0, 99, 0]` on Windows. The `md5sum` approach won't work if we plan +on creating a checkfile on Unix and checking it on Windows, or vice versa. + +A more portable approach is to convert platform-specific bytes into some +consistent Unicode encoding. (In practice this is going to be UTF-8, but in +theory it could be anything.) Then when `--check` needs to open a file, we +convert the Unicode representation back into platform-specific bytes. 
This +makes important common cases like `abc`, and in fact even `abc[newline]def`, +work as expected. Great! + +But...what did we mean above when we said *usually* UTF-8 and *usually* UTF-16? +It turns out that not every possible sequence of bytes is valid UTF-8, and not +every possible sequence of 16-bit wide chars is valid UTF-16. For example, the +byte 0xFF (255) can never appear in any UTF-8 string. If we ask Python to +decode it, it yells at us: + +```python +>>> b"\xFF".decode("UTF-8") +UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte +``` + +However, tragically, we *can* create a file with that byte in its name (on +Linux at least, though not usually on macOS): + +```python +>>> open(b"y\xFFy", "w") +``` + +So some filepaths aren't representable in Unicode at all. Our plan to "convert +platform-specific bytes into some consistent Unicode encoding" isn't going to +work for everything. What does `b3sum` do with the file above? + +```bash +$ b3sum y* +af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 y�y +``` + +That � in there is a "Unicode replacement character". When we run into +filepaths that we can't represent in Unicode, we replace the unrepresentable +parts with these characters. On the checking side, to avoid any possible +confusion between two different invalid filepaths, we automatically fail if we +see a replacement character. Together with a few more details covered in the +next section, this gives us an important set of properties: + +1. Any file can be hashed locally. +2. Any file with a valid Unicode name not containing the � character can be + checked. +3. Checking ambiguous or unrepresentable filepaths always fails. +4. Checkfiles are always valid UTF-8. +5. Checkfiles are portable between Unix and Windows. + +## Formal Rules + +1. When hashing, filepaths are represented in a platform-specific encoding, + which can accommodate any filepath on the current platform. 
In Rust, this is + `OsStr`/`OsString`. +2. In output, filepaths are first converted to UTF-8. Any non-Unicode segments + are replaced with Unicode replacement characters (U+FFFD). In Rust, this is + `OsStr::to_string_lossy`. +3. Then, if a filepath contains any backslashes (U+005C) or newlines (U+000A), + these characters are escaped as `\\` and `\n` respectively. +4. Finally, any output line containing an escape sequence is prefixed with a + single backslash. +5. When checking, each line is parsed as UTF-8, separated by a newline + (U+000A). Invalid UTF-8 is an error. +6. Then, if a line begins with a backslash, the filepath component is + unescaped. Any escape sequence other than `\\` or `\n` is an error. If a + line does not begin with a backslash, unescaping is not performed, and any + backslashes in the filepath component are interpreted literally. (`b3sum` + output never contains unescaped backslashes, but they can occur in + checkfiles assembled by hand.) +7. Finally, if a filepath contains a Unicode replacement character (U+FFFD) or + a null character (U+0000), it is an error. + + **Additionally, on Windows only:** + +8. In output, all backslashes (U+005C) are replaced with forward slashes + (U+002F). +9. When checking, after unescaping, if a filepath contains a backslash, it is + an error. diff --git a/third-party/blake3/benches/bench.rs b/third-party/blake3/benches/bench.rs new file mode 100644 index 00000000..5efb9e6f --- /dev/null +++ b/third-party/blake3/benches/bench.rs @@ -0,0 +1,517 @@ +#![feature(test)] + +extern crate test; + +use arrayref::array_ref; +use arrayvec::ArrayVec; +use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; +use blake3::platform::{Platform, MAX_SIMD_DEGREE}; +use blake3::OUT_LEN; +use rand::prelude::*; +use test::Bencher; + +const KIB: usize = 1024; + +// This struct randomizes two things: +// 1. The actual bytes of input. +// 2. The page offset the input starts at. 
+pub struct RandomInput { + buf: Vec, + len: usize, + offsets: Vec, + offset_index: usize, +} + +impl RandomInput { + pub fn new(b: &mut Bencher, len: usize) -> Self { + b.bytes += len as u64; + let page_size: usize = page_size::get(); + let mut buf = vec![0u8; len + page_size]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut buf); + let mut offsets: Vec = (0..page_size).collect(); + offsets.shuffle(&mut rng); + Self { + buf, + len, + offsets, + offset_index: 0, + } + } + + pub fn get(&mut self) -> &[u8] { + let offset = self.offsets[self.offset_index]; + self.offset_index += 1; + if self.offset_index >= self.offsets.len() { + self.offset_index = 0; + } + &self.buf[offset..][..self.len] + } +} + +fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { + let mut state = [1u32; 8]; + let mut r = RandomInput::new(b, 64); + let input = array_ref!(r.get(), 0, 64); + b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); +} + +#[bench] +fn bench_single_compression_portable(b: &mut Bencher) { + bench_single_compression_fn(b, Platform::portable()); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_single_compression_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_single_compression_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_single_compression_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_single_compression_fn(b, platform); + } +} + +fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, CHUNK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; 
CHUNK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::Yes, + 0, + 0, + 0, + &mut out, + ); + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_avx2(b: &mut Bencher) { + if let Some(platform) = Platform::avx2() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_many_chunks_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_many_chunks_fn(b, platform); + } +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_chunks_neon(b: &mut Bencher) { + if let Some(platform) = Platform::neon() { + bench_many_chunks_fn(b, platform); + } +} + +// TODO: When we get const generics we can unify this with the chunks code. 
+fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { + let degree = platform.simd_degree(); + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, BLOCK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + platform.hash_many( + &input_arrays[..], + &[0; 8], + 0, + blake3::IncrementCounter::No, + 0, + 0, + 0, + &mut out, + ); + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse2(b: &mut Bencher) { + if let Some(platform) = Platform::sse2() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse41(b: &mut Bencher) { + if let Some(platform) = Platform::sse41() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_avx2(b: &mut Bencher) { + if let Some(platform) = Platform::avx2() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(blake3_avx512_ffi)] +fn bench_many_parents_avx512(b: &mut Bencher) { + if let Some(platform) = Platform::avx512() { + bench_many_parents_fn(b, platform); + } +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_parents_neon(b: &mut Bencher) { + if let Some(platform) = Platform::neon() { + bench_many_parents_fn(b, platform); + } +} + +fn bench_atonce(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| blake3::hash(input.get())); +} + +#[bench] +fn bench_atonce_0001_block(b: &mut Bencher) { + bench_atonce(b, BLOCK_LEN); +} + +#[bench] +fn bench_atonce_0001_kib(b: &mut Bencher) { + bench_atonce(b, 1 * KIB); +} + +#[bench] +fn bench_atonce_0002_kib(b: &mut Bencher) { + bench_atonce(b, 2 * KIB); +} + +#[bench] +fn bench_atonce_0004_kib(b: &mut 
Bencher) { + bench_atonce(b, 4 * KIB); +} + +#[bench] +fn bench_atonce_0008_kib(b: &mut Bencher) { + bench_atonce(b, 8 * KIB); +} + +#[bench] +fn bench_atonce_0016_kib(b: &mut Bencher) { + bench_atonce(b, 16 * KIB); +} + +#[bench] +fn bench_atonce_0032_kib(b: &mut Bencher) { + bench_atonce(b, 32 * KIB); +} + +#[bench] +fn bench_atonce_0064_kib(b: &mut Bencher) { + bench_atonce(b, 64 * KIB); +} + +#[bench] +fn bench_atonce_0128_kib(b: &mut Bencher) { + bench_atonce(b, 128 * KIB); +} + +#[bench] +fn bench_atonce_0256_kib(b: &mut Bencher) { + bench_atonce(b, 256 * KIB); +} + +#[bench] +fn bench_atonce_0512_kib(b: &mut Bencher) { + bench_atonce(b, 512 * KIB); +} + +#[bench] +fn bench_atonce_1024_kib(b: &mut Bencher) { + bench_atonce(b, 1024 * KIB); +} + +fn bench_incremental(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); +} + +#[bench] +fn bench_incremental_0001_block(b: &mut Bencher) { + bench_incremental(b, BLOCK_LEN); +} + +#[bench] +fn bench_incremental_0001_kib(b: &mut Bencher) { + bench_incremental(b, 1 * KIB); +} + +#[bench] +fn bench_incremental_0002_kib(b: &mut Bencher) { + bench_incremental(b, 2 * KIB); +} + +#[bench] +fn bench_incremental_0004_kib(b: &mut Bencher) { + bench_incremental(b, 4 * KIB); +} + +#[bench] +fn bench_incremental_0008_kib(b: &mut Bencher) { + bench_incremental(b, 8 * KIB); +} + +#[bench] +fn bench_incremental_0016_kib(b: &mut Bencher) { + bench_incremental(b, 16 * KIB); +} + +#[bench] +fn bench_incremental_0032_kib(b: &mut Bencher) { + bench_incremental(b, 32 * KIB); +} + +#[bench] +fn bench_incremental_0064_kib(b: &mut Bencher) { + bench_incremental(b, 64 * KIB); +} + +#[bench] +fn bench_incremental_0128_kib(b: &mut Bencher) { + bench_incremental(b, 128 * KIB); +} + +#[bench] +fn bench_incremental_0256_kib(b: &mut Bencher) { + bench_incremental(b, 256 * KIB); +} + +#[bench] +fn bench_incremental_0512_kib(b: &mut Bencher) { + 
bench_incremental(b, 512 * KIB); +} + +#[bench] +fn bench_incremental_1024_kib(b: &mut Bencher) { + bench_incremental(b, 1024 * KIB); +} + +fn bench_reference(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +fn bench_reference_0001_block(b: &mut Bencher) { + bench_reference(b, BLOCK_LEN); +} + +#[bench] +fn bench_reference_0001_kib(b: &mut Bencher) { + bench_reference(b, 1 * KIB); +} + +#[bench] +fn bench_reference_0002_kib(b: &mut Bencher) { + bench_reference(b, 2 * KIB); +} + +#[bench] +fn bench_reference_0004_kib(b: &mut Bencher) { + bench_reference(b, 4 * KIB); +} + +#[bench] +fn bench_reference_0008_kib(b: &mut Bencher) { + bench_reference(b, 8 * KIB); +} + +#[bench] +fn bench_reference_0016_kib(b: &mut Bencher) { + bench_reference(b, 16 * KIB); +} + +#[bench] +fn bench_reference_0032_kib(b: &mut Bencher) { + bench_reference(b, 32 * KIB); +} + +#[bench] +fn bench_reference_0064_kib(b: &mut Bencher) { + bench_reference(b, 64 * KIB); +} + +#[bench] +fn bench_reference_0128_kib(b: &mut Bencher) { + bench_reference(b, 128 * KIB); +} + +#[bench] +fn bench_reference_0256_kib(b: &mut Bencher) { + bench_reference(b, 256 * KIB); +} + +#[bench] +fn bench_reference_0512_kib(b: &mut Bencher) { + bench_reference(b, 512 * KIB); +} + +#[bench] +fn bench_reference_1024_kib(b: &mut Bencher) { + bench_reference(b, 1024 * KIB); +} + +#[cfg(feature = "rayon")] +fn bench_rayon(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| blake3::Hasher::new().update_rayon(input.get()).finalize()); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0001_block(b: &mut Bencher) { + bench_rayon(b, BLOCK_LEN); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0001_kib(b: &mut Bencher) { + bench_rayon(b, 1 * KIB); +} + +#[bench] +#[cfg(feature = 
"rayon")] +fn bench_rayon_0002_kib(b: &mut Bencher) { + bench_rayon(b, 2 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0004_kib(b: &mut Bencher) { + bench_rayon(b, 4 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0008_kib(b: &mut Bencher) { + bench_rayon(b, 8 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0016_kib(b: &mut Bencher) { + bench_rayon(b, 16 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0032_kib(b: &mut Bencher) { + bench_rayon(b, 32 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0064_kib(b: &mut Bencher) { + bench_rayon(b, 64 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0128_kib(b: &mut Bencher) { + bench_rayon(b, 128 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0256_kib(b: &mut Bencher) { + bench_rayon(b, 256 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_0512_kib(b: &mut Bencher) { + bench_rayon(b, 512 * KIB); +} + +#[bench] +#[cfg(feature = "rayon")] +fn bench_rayon_1024_kib(b: &mut Bencher) { + bench_rayon(b, 1024 * KIB); +} + +// This checks that update() splits up its input in increasing powers of 2, so +// that it can recover a high degree of parallelism when the number of bytes +// hashed so far is uneven. The performance of this benchmark should be +// reasonably close to bench_incremental_0064_kib, within 80% or so. When we +// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), +// performance was less than half. 
+#[bench] +fn bench_two_updates(b: &mut Bencher) { + let len = 65536; + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3::Hasher::new(); + let input = input.get(); + hasher.update(&input[..1]); + hasher.update(&input[1..]); + hasher.finalize() + }); +} diff --git a/third-party/blake3/build.rs b/third-party/blake3/build.rs new file mode 100644 index 00000000..ac1d6a64 --- /dev/null +++ b/third-party/blake3/build.rs @@ -0,0 +1,277 @@ +use std::env; + +fn defined(var: &str) -> bool { + println!("cargo:rerun-if-env-changed={}", var); + env::var_os(var).is_some() +} + +fn is_pure() -> bool { + defined("CARGO_FEATURE_PURE") +} + +fn should_prefer_intrinsics() -> bool { + defined("CARGO_FEATURE_PREFER_INTRINSICS") +} + +fn is_neon() -> bool { + defined("CARGO_FEATURE_NEON") +} + +fn is_no_neon() -> bool { + defined("CARGO_FEATURE_NO_NEON") +} + +fn is_ci() -> bool { + defined("BLAKE3_CI") +} + +fn warn(warning: &str) { + assert!(!warning.contains("\n")); + println!("cargo:warning={}", warning); + if is_ci() { + println!("cargo:warning=Warnings in CI are treated as errors. Build failed."); + std::process::exit(1); + } +} + +fn target_components() -> Vec { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_x86_64() -> bool { + target_components()[0] == "x86_64" +} + +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + +fn is_arm() -> bool { + is_armv7() || is_aarch64() || target_components()[0] == "arm" +} + +fn is_aarch64() -> bool { + target_components()[0] == "aarch64" +} + +fn is_armv7() -> bool { + target_components()[0] == "armv7" +} + +// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// right compiler flags to use depend on the toolchain. (And we don't want to +// use flag_if_supported, because we don't want features to be silently +// disabled by old compilers.) 
+fn is_windows_msvc() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "msvc" +} + +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows_msvc() { + build.flag("-std=c11"); + } + build +} + +#[derive(PartialEq)] +enum CCompilerSupport { + NoCompiler, + NoAVX512, + YesAVX512, +} +use CCompilerSupport::*; + +fn c_compiler_support() -> CCompilerSupport { + let build = new_build(); + let flags_checked; + let support_result: Result = if is_windows_msvc() { + flags_checked = "/arch:AVX512"; + build.is_flag_supported("/arch:AVX512") + } else { + // Check for both of the flags we use. If -mavx512f works, then -mavx512vl + // will probably always work too, but we might as well be thorough. + flags_checked = "-mavx512f and -mavx512vl"; + match build.is_flag_supported("-mavx512f") { + Ok(true) => build.is_flag_supported("-mavx512vl"), + false_or_error => false_or_error, + } + }; + match support_result { + Ok(true) => YesAVX512, + Ok(false) => { + warn(&format!( + "The C compiler {:?} does not support {}.", + build.get_compiler().path(), + flags_checked, + )); + NoAVX512 + } + Err(e) => { + println!("{:?}", e); + warn(&format!( + "No C compiler {:?} detected.", + build.get_compiler().path() + )); + NoCompiler + } + } +} + +fn build_sse2_sse41_avx2_rust_intrinsics() { + // No C code to compile here. Set the cfg flags that enable the Rust SSE2, + // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile + // them. 
+ println!("cargo:rustc-cfg=blake3_sse2_rust"); + println!("cargo:rustc-cfg=blake3_sse41_rust"); + println!("cargo:rustc-cfg=blake3_avx2_rust"); +} + +fn build_sse2_sse41_avx2_assembly() { + // Build the assembly implementations for SSE4.1 and AVX2. This is + // preferred, but it only supports x86_64. + assert!(is_x86_64()); + println!("cargo:rustc-cfg=blake3_sse2_ffi"); + println!("cargo:rustc-cfg=blake3_sse41_ffi"); + println!("cargo:rustc-cfg=blake3_avx2_ffi"); + let mut build = new_build(); + if is_windows_msvc() { + build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); + build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); + build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); + } else if is_windows_gnu() { + build.file("c/blake3_sse2_x86-64_windows_gnu.S"); + build.file("c/blake3_sse41_x86-64_windows_gnu.S"); + build.file("c/blake3_avx2_x86-64_windows_gnu.S"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. + build.file("c/blake3_sse2_x86-64_unix.S"); + build.file("c/blake3_sse41_x86-64_unix.S"); + build.file("c/blake3_avx2_x86-64_unix.S"); + } + build.compile("blake3_sse2_sse41_avx2_assembly"); +} + +fn build_avx512_c_intrinsics() { + // This is required on 32-bit x86 targets, since the assembly + // implementation doesn't support those. + println!("cargo:rustc-cfg=blake3_avx512_ffi"); + let mut build = new_build(); + build.file("c/blake3_avx512.c"); + if is_windows_msvc() { + build.flag("/arch:AVX512"); + } else { + build.flag("-mavx512f"); + build.flag("-mavx512vl"); + } + if is_windows_gnu() { + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. + build.flag("-fno-asynchronous-unwind-tables"); + } + build.compile("blake3_avx512_intrinsics"); +} + +fn build_avx512_assembly() { + // Build the assembly implementation for AVX-512. This is preferred, but it + // only supports x86_64. 
+ assert!(is_x86_64()); + println!("cargo:rustc-cfg=blake3_avx512_ffi"); + let mut build = new_build(); + if is_windows_msvc() { + build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); + } else { + if is_windows_gnu() { + build.file("c/blake3_avx512_x86-64_windows_gnu.S"); + } else { + // All non-Windows implementations are assumed to support Linux-style + // assembly. These files do contain a small explicit workaround for + // macOS also. + build.file("c/blake3_avx512_x86-64_unix.S"); + } + // Older versions of Clang require these flags, even for assembly. See + // https://github.com/BLAKE3-team/BLAKE3/issues/79. + build.flag("-mavx512f"); + build.flag("-mavx512vl"); + } + build.compile("blake3_avx512_assembly"); +} + +fn build_neon_c_intrinsics() { + let mut build = new_build(); + // Note that blake3_neon.c normally depends on the blake3_portable.c + // for the single-instance compression function, but we expose + // portable.rs over FFI instead. See ffi_neon.rs. + build.file("c/blake3_neon.c"); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mpfu. + if is_armv7() { + build.flag("-mfpu=neon-vfpv4"); + build.flag("-mfloat-abi=hard"); + } + build.compile("blake3_neon"); +} + +fn main() -> Result<(), Box> { + if is_pure() && is_neon() { + panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); + } + + if is_no_neon() && is_neon() { + panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\"."); + } + + if is_x86_64() || is_x86_32() { + let support = c_compiler_support(); + if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { + build_sse2_sse41_avx2_rust_intrinsics(); + } else { + // We assume that all C compilers can assemble SSE4.1 and AVX2. We + // don't explicitly check for support. 
+ build_sse2_sse41_avx2_assembly(); + } + + if is_pure() || support == NoCompiler || support == NoAVX512 { + // The binary will not include any AVX-512 code. + } else if is_x86_32() || should_prefer_intrinsics() { + build_avx512_c_intrinsics(); + } else { + build_avx512_assembly(); + } + } + + if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64()) { + println!("cargo:rustc-cfg=blake3_neon"); + build_neon_c_intrinsics(); + } + + // The `cc` crate doesn't automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. + for file in std::fs::read_dir("c")? { + println!( + "cargo:rerun-if-changed={}", + file?.path().to_str().expect("utf-8") + ); + } + + Ok(()) +} diff --git a/third-party/blake3/c/.gitignore b/third-party/blake3/c/.gitignore new file mode 100644 index 00000000..3d4b7041 --- /dev/null +++ b/third-party/blake3/c/.gitignore @@ -0,0 +1,4 @@ +blake3 +example +build/ +*.o diff --git a/third-party/blake3/c/CMakeLists.txt b/third-party/blake3/c/CMakeLists.txt new file mode 100644 index 00000000..3aa6c154 --- /dev/null +++ b/third-party/blake3/c/CMakeLists.txt @@ -0,0 +1,177 @@ +cmake_minimum_required(VERSION 3.9) + +project(libblake3 + VERSION 1.4.1 + DESCRIPTION "BLAKE3 C implementation" + LANGUAGES C ASM +) + +include(FeatureSummary) +include(GNUInstallDirs) + +# default SIMD compiler flag configuration (can be overriden by toolchains or CLI) +if(MSVC) + set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2") + # MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170) + set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" 
CACHE STRING "the compiler flags to enable SSE4.1") + set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2") + set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512") + +elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" + OR CMAKE_C_COMPILER_ID STREQUAL "Clang" + OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") + set(BLAKE3_CFLAGS_SSE2 "-msse2" CACHE STRING "the compiler flags to enable SSE2") + set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1") + set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2") + set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512") +endif() +# architecture lists for which to enable assembly / SIMD sources +set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64) +set(BLAKE3_X86_NAMES i686 x86 X86) +set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a) + +# library target +add_library(blake3 + blake3.c + blake3_dispatch.c + blake3_portable.c +) +add_library(BLAKE3::blake3 ALIAS blake3) + +# library configuration +set(BLAKE3_PKGCONFIG_CFLAGS) +if (BUILD_SHARED_LIBS) + target_compile_definitions(blake3 + PUBLIC BLAKE3_DLL + PRIVATE BLAKE3_DLL_EXPORTS + ) + list(APPEND BLAKE3_PKGCONFIG_CFLAGS -DBLAKE3_DLL) +endif() +target_include_directories(blake3 PUBLIC $) +set_target_properties(blake3 PROPERTIES + VERSION ${PROJECT_VERSION} + SOVERSION 0 + C_VISIBILITY_PRESET hidden +) + +# optional SIMD sources +macro(BLAKE3_DISABLE_SIMD) + set(BLAKE3_SIMD_AMD64_ASM OFF) + set(BLAKE3_SIMD_X86_INTRINSICS OFF) + set(BLAKE3_SIMD_NEON_INTRINSICS OFF) + set_source_files_properties(blake3_dispatch.c PROPERTIES + COMPILE_DEFINITIONS BLAKE3_USE_NEON=0;BLAKE3_NO_SSE2;BLAKE3_NO_SSE41;BLAKE3_NO_AVX2;BLAKE3_NO_AVX512 + ) +endmacro() + +if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES OR BLAKE3_USE_AMD64_ASM) + set(BLAKE3_SIMD_AMD64_ASM ON) + + if(MSVC) + enable_language(ASM_MASM) + target_sources(blake3 
PRIVATE + blake3_avx2_x86-64_windows_msvc.asm + blake3_avx512_x86-64_windows_msvc.asm + blake3_sse2_x86-64_windows_msvc.asm + blake3_sse41_x86-64_windows_msvc.asm + ) + + elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU" + OR CMAKE_C_COMPILER_ID STREQUAL "Clang" + OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang") + if (WIN32) + target_sources(blake3 PRIVATE + blake3_avx2_x86-64_windows_gnu.S + blake3_avx512_x86-64_windows_gnu.S + blake3_sse2_x86-64_windows_gnu.S + blake3_sse41_x86-64_windows_gnu.S + ) + + elseif(UNIX) + target_sources(blake3 PRIVATE + blake3_avx2_x86-64_unix.S + blake3_avx512_x86-64_unix.S + blake3_sse2_x86-64_unix.S + blake3_sse41_x86-64_unix.S + ) + + else() + BLAKE3_DISABLE_SIMD() + endif() + + else() + BLAKE3_DISABLE_SIMD() + endif() + +elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES OR BLAKE3_USE_X86_INTRINSICS) + AND DEFINED BLAKE3_CFLAGS_SSE2 + AND DEFINED BLAKE3_CFLAGS_SSE4.1 + AND DEFINED BLAKE3_CFLAGS_AVX2 + AND DEFINED BLAKE3_CFLAGS_AVX512) + set(BLAKE3_SIMD_X86_INTRINSICS ON) + + target_sources(blake3 PRIVATE + blake3_avx2.c + blake3_avx512.c + blake3_sse2.c + blake3_sse41.c + ) + set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}") + set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}") + set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}") + set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}") + +elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES + OR ((ANDROID_ABI STREQUAL "armeabi-v7a" + OR BLAKE3_USE_NEON_INTRINSICS) + AND (DEFINED BLAKE3_CFLAGS_NEON + OR CMAKE_SIZEOF_VOID_P EQUAL 8))) + set(BLAKE3_SIMD_NEON_INTRINSICS ON) + + target_sources(blake3 PRIVATE + blake3_neon.c + ) + set_source_files_properties(blake3_dispatch.c PROPERTIES COMPILE_DEFINITIONS BLAKE3_USE_NEON=1) + + if (DEFINED BLAKE3_CFLAGS_NEON) + set_source_files_properties(blake3_neon.c PROPERTIES 
COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}") + endif() + +else() + BLAKE3_DISABLE_SIMD() +endif() + +# cmake install support +install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") +install(TARGETS blake3 EXPORT blake3-targets) +install(EXPORT blake3-targets + NAMESPACE BLAKE3:: + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" +) + +include(CMakePackageConfigHelpers) +configure_package_config_file(blake3-config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake" + + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" +) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake" + VERSION ${libblake3_VERSION} + COMPATIBILITY SameMajorVersion +) +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3" +) + +configure_file(libblake3.pc.in libblake3.pc @ONLY) +install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") + +# print feature summary +add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.") +add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.") +add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.") +feature_summary(WHAT ENABLED_FEATURES) diff --git a/third-party/blake3/c/Makefile.testing b/third-party/blake3/c/Makefile.testing new file mode 100644 index 00000000..b540528d --- /dev/null +++ b/third-party/blake3/c/Makefile.testing @@ -0,0 +1,82 @@ +# This Makefile is only for testing. C callers should follow the instructions +# in ./README.md to incorporate these C files into their existing build. 
+ +NAME=blake3 +CC=gcc +CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden +LDFLAGS=-pie -Wl,-z,relro,-z,now +TARGETS= +ASM_TARGETS= +EXTRAFLAGS=-Wa,--noexecstack + +ifdef BLAKE3_NO_SSE2 +EXTRAFLAGS += -DBLAKE3_NO_SSE2 +else +TARGETS += blake3_sse2.o +ASM_TARGETS += blake3_sse2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_SSE41 +EXTRAFLAGS += -DBLAKE3_NO_SSE41 +else +TARGETS += blake3_sse41.o +ASM_TARGETS += blake3_sse41_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX2 +EXTRAFLAGS += -DBLAKE3_NO_AVX2 +else +TARGETS += blake3_avx2.o +ASM_TARGETS += blake3_avx2_x86-64_unix.S +endif + +ifdef BLAKE3_NO_AVX512 +EXTRAFLAGS += -DBLAKE3_NO_AVX512 +else +TARGETS += blake3_avx512.o +ASM_TARGETS += blake3_avx512_x86-64_unix.S +endif + +ifdef BLAKE3_USE_NEON +EXTRAFLAGS += -DBLAKE3_USE_NEON=1 +TARGETS += blake3_neon.o +endif + +ifdef BLAKE3_NO_NEON +EXTRAFLAGS += -DBLAKE3_USE_NEON=0 +endif + +all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +blake3_sse2.o: blake3_sse2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2 + +blake3_sse41.o: blake3_sse41.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1 + +blake3_avx2.o: blake3_avx2.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2 + +blake3_avx512.o: blake3_avx512.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl + +blake3_neon.o: blake3_neon.c + $(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ + +test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test: all + ./test.py + +asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS) + +test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined +test_asm: asm + ./test.py + +example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS) + $(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS) + +clean: + rm -f $(NAME) *.o diff --git 
a/third-party/blake3/c/README.md b/third-party/blake3/c/README.md new file mode 100644 index 00000000..965d8c74 --- /dev/null +++ b/third-party/blake3/c/README.md @@ -0,0 +1,321 @@ +The official C implementation of BLAKE3. + +# Example + +An example program that hashes bytes from standard input and prints the +result: + +```c +#include "blake3.h" +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +int main(void) { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + blake3_hasher_update(&hasher, buf, n); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + exit(1); + } + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} +``` + +The code above is included in this directory as `example.c`. If you're +on x86\_64 with a Unix-like OS, you can compile a working binary like +this: + +```bash +gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +# API + +## The Struct + +```c +typedef struct { + // private fields +} blake3_hasher; +``` + +An incremental BLAKE3 hashing state, which can accept any number of +updates. This implementation doesn't allocate any heap memory, but +`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes +on x86-64. 
This size can be reduced by restricting the maximum input +length, as described in Section 5.4 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf), +but this implementation doesn't currently support that strategy. + +## Common API Functions + +```c +void blake3_hasher_init( + blake3_hasher *self); +``` + +Initialize a `blake3_hasher` in the default hashing mode. + +--- + +```c +void blake3_hasher_update( + blake3_hasher *self, + const void *input, + size_t input_len); +``` + +Add input to the hasher. This can be called any number of times. + +--- + +```c +void blake3_hasher_finalize( + const blake3_hasher *self, + uint8_t *out, + size_t out_len); +``` + +Finalize the hasher and return an output of any length, given in bytes. +This doesn't modify the hasher itself, and it's possible to finalize +again after adding more input. The constant `BLAKE3_OUT_LEN` provides +the default output length, 32 bytes, which is recommended for most +callers. See the [Security Notes](#security-notes) below. + +## Less Common API Functions + +```c +void blake3_hasher_init_keyed( + blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +``` + +Initialize a `blake3_hasher` in the keyed hashing mode. The key must be +exactly 32 bytes. + +--- + +```c +void blake3_hasher_init_derive_key( + blake3_hasher *self, + const char *context); +``` + +Initialize a `blake3_hasher` in the key derivation mode. The context +string is given as an initialization parameter, and afterwards input key +material should be given with `blake3_hasher_update`. The context string +is a null-terminated C string which should be **hardcoded, globally +unique, and application-specific**. The context string should not +include any dynamic input like salts, nonces, or identifiers read from a +database at runtime. A good default format for the context string is +`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com +2019-12-25 16:18:03 session tokens v1"`. 
+ +This function is intended for application code written in C. For +language bindings, see `blake3_hasher_init_derive_key_raw` below. + +--- + +```c +void blake3_hasher_init_derive_key_raw( + blake3_hasher *self, + const void *context, + size_t context_len); +``` + +As `blake3_hasher_init_derive_key` above, except that the context string +is given as a pointer to an array of arbitrary bytes with a provided +length. This is intended for writing language bindings, where C string +conversion would add unnecessary overhead and new error cases. Unicode +strings should be encoded as UTF-8. + +Application code in C should prefer `blake3_hasher_init_derive_key`, +which takes the context as a C string. If you need to use arbitrary +bytes as a context string in application code, consider whether you're +violating the requirement that context strings should be hardcoded. + +--- + +```c +void blake3_hasher_finalize_seek( + const blake3_hasher *self, + uint64_t seek, + uint8_t *out, + size_t out_len); +``` + +The same as `blake3_hasher_finalize`, but with an additional `seek` +parameter for the starting byte position in the output stream. To +efficiently stream a large output without allocating memory, call this +function in a loop, incrementing `seek` by the output length each time. + +--- + +```c +void blake3_hasher_reset( + blake3_hasher *self); +``` + +Reset the hasher to its initial state, prior to any calls to +`blake3_hasher_update`. Currently this is no different from calling +`blake3_hasher_init` or similar again. However, if this implementation gains +multithreading support in the future, and if `blake3_hasher` holds (optional) +threading resources, this function will reuse those resources. Until then, this +is mainly for feature compatibility with the Rust implementation. + +# Security Notes + +Outputs shorter than the default length of 32 bytes (256 bits) provide less security. 
An N-bit +BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 +bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional +security. + +Avoid relying on the secrecy of the output offset, that is, the `seek` argument of +`blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo +Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message +and the key (if any) can easily determine the offset of an extended output. For comparison, +AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown +position in the output stream to recover its block index. Callers with strong secret keys +aren't affected in practice, but secret offsets are a [design +smell](https://en.wikipedia.org/wiki/Design_smell) in any case. + +# Building + +This implementation is just C and assembly files. It doesn't include a +public-facing build system. (The `Makefile` in this directory is only +for testing.) Instead, the intention is that you can include these files +in whatever build system you're already using. This section describes +the commands your build system should execute, or which you can execute +by hand. Note that these steps may change in future versions. + +## x86 + +Dynamic dispatch is enabled by default on x86. The implementation will +query the CPU at runtime to detect SIMD support, and it will use the +widest instruction set available. By default, `blake3_dispatch.c` +expects to be linked with code for five different instruction sets: +portable C, SSE2, SSE4.1, AVX2, and AVX-512. + +For each of the x86 SIMD instruction sets, four versions are available: +three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one +version using C intrinsics. The assembly versions are generally +preferred. They perform better, they perform more consistently across +different compilers, and they build more quickly. 
On the other hand, the +assembly versions are x86\_64-only, and you need to select the right +flavor for your target platform. + +Here's an example of building a shared library on x86\_64 Linux using +the assembly implementations: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \ + blake3_avx512_x86-64_unix.S +``` + +When building the intrinsics-based implementations, you need to build +each implementation separately, with the corresponding instruction set +explicitly enabled in the compiler. Here's the same shared library using +the intrinsics-based implementations: + +```bash +gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o +gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o +gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o +gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \ + blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o +``` + +Note above that building `blake3_avx512.c` requires both `-mavx512f` and +`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512` +flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`. +MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a +corresponding flag. + +If you want to omit SIMD code entirely, you need to explicitly disable +each instruction set. Here's an example of building a shared library on +x86 with only portable code: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \ + -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c +``` + +## ARM NEON + +The NEON implementation is enabled by default on AArch64, but not on +other ARM targets, since not all of them support it. To enable it, set +`BLAKE3_USE_NEON=1`. 
Here's an example of building a shared library on +ARM Linux with NEON support: + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \ + blake3_portable.c blake3_neon.c +``` + +To explicitly disable using NEON instructions on AArch64, set +`BLAKE3_USE_NEON=0`. + +```bash +gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \ + blake3_portable.c +``` + +Note that on some targets (ARMv7 in particular), extra flags may be +required to activate NEON support in the compiler. If you see an error +like... + +``` +/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed +in call to always_inline ‘vaddq_u32’: target specific option mismatch +``` + +...then you may need to add something like `-mfpu=neon-vfpv4 +-mfloat-abi=hard`. + +## Other Platforms + +The portable implementation should work on most other architectures. For +example: + +```bash +gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c +``` + +# Multithreading + +Unlike the Rust implementation, the C implementation doesn't currently support +multithreading. A future version of this library could add support by taking an +optional dependency on OpenMP or similar. Alternatively, we could expose a +lower-level API to allow callers to implement concurrency themselves. The +former would be more convenient and less error-prone, but the latter would give +callers the maximum possible amount of control. The best choice here depends on +the specific use case, so if you have a use case for multithreaded hashing in +C, please file a GitHub issue and let us know. 
diff --git a/third-party/blake3/c/blake3-config.cmake.in b/third-party/blake3/c/blake3-config.cmake.in new file mode 100644 index 00000000..071552be --- /dev/null +++ b/third-party/blake3/c/blake3-config.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake") + +check_required_components(blake3) \ No newline at end of file diff --git a/third-party/blake3/c/blake3.c b/third-party/blake3/c/blake3.c new file mode 100644 index 00000000..692f4b02 --- /dev/null +++ b/third-party/blake3/c/blake3.c @@ -0,0 +1,616 @@ +#include +#include +#include + +#include "blake3.h" +#include "blake3_impl.h" + +const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + 
uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, + output_block_counter, self->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + 
size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) { + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. 
+INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. 
+// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide(const uint8_t *input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, uint8_t *out) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) 
+ degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, + chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide( + right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. 
+INLINE void compress_subtree_to_parent_node( + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; /* scratch output for each parent-compression pass; copied back into cv_array */ + // The second half of this loop condition is always true, and we just + // asserted it above. But GCC can't tell that it's always true, and if NDEBUG + // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious + // warnings here. GCC 8.5 is particularly sensitive, so if you're changing + // this code, test it against that version.
+ while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); /* hand back exactly two CVs */ +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); /* stage 1: hash the context bytes */ + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); /* stage 2: use that hash as the key for self */ +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); /* context must be NUL-terminated; the terminator is not hashed */ +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained.
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); /* the merged parent CV overwrites the lower of the two slots */ + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging".
That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root.
+ if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much.
For + // example, if count_so_far is 1 chunk, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input (at most one full chunk), add it to the chunk + // state.
In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector<uint8_t> v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case.
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); /* left child: the next CV down the stack */ + output_chaining_value(&output, &parent_block[32]); /* right child: the CV of everything merged so far */ + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher *self) { + chunk_state_reset(&self->chunk, self->key, 0); /* self->key is left untouched, so the hasher restarts with the same key */ + self->cv_stack_len = 0; +} diff --git a/third-party/blake3/c/blake3.h b/third-party/blake3/c/blake3.h new file mode 100644 index 00000000..21e0d7b9 --- /dev/null +++ b/third-party/blake3/c/blake3.h @@ -0,0 +1,82 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include <stddef.h> +#include <stdint.h> + +#if !defined(BLAKE3_API) +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(BLAKE3_DLL) +# if defined(BLAKE3_DLL_EXPORTS) +# define BLAKE3_API __declspec(dllexport) +# else +# define BLAKE3_API __declspec(dllimport) +# endif +# define BLAKE3_PRIVATE +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +# elif __GNUC__ >= 4 +# define BLAKE3_API __attribute__((visibility("default"))) +# define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.4.1" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 /* 2^54 chunks of 1024 bytes = 2^64 bytes */ + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below.
+typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +BLAKE3_API const char *blake3_version(void); +BLAKE3_API void blake3_hasher_init(blake3_hasher *self); +BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/third-party/blake3/c/blake3_avx2.c b/third-party/blake3/c/blake3_avx2.c new file mode 100644 index 00000000..381e7c42 --- /dev/null +++ b/third-party/blake3/c/blake3_avx2.c @@ -0,0 +1,326 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 8 + +INLINE __m256i loadu(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE void storeu(__m256i src, uint8_t dest[32]) { /* unaligned store of all 32 bytes of src, mirroring loadu */ +
_mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason. +INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m256i rot16(__m256i x) { /* rotate every 32-bit lane right by 16 using a byte shuffle */ + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} + +INLINE __m256i rot12(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); +} + +INLINE __m256i rot8(__m256i x) { /* rotate every 32-bit lane right by 8 using a byte shuffle */ + return _mm256_shuffle_epi8( + x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, + 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} + +INLINE __m256i rot7(__m256i x) { + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); +} + +INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1],
m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); /* second half: the same mixing on the diagonals (v0,v5,v10,v15), (v1,v6,v11,v12), (v2,v7,v8,v13), (v3,v4,v9,v14) */ + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] =
xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m256i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. Selector 0x20 combines the low halves of both operands, 0x31 the high halves.
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); /* prefetch 256 bytes (four 64-byte blocks) past this block's offset */ + } + transpose_vecs(&out[0]); + transpose_vecs(&out[8]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask =
_mm256_set1_epi32(-(int32_t)increment_counter); /* all-ones lanes when increment_counter is true, zero otherwise */ + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); /* unsigned add1 > l (sign bits flipped for the signed compare): lanes where the 32-bit add wrapped become all-ones */ + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry); /* subtracting -1 adds the carry into the high word */ + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; /* last block of each input */ + } + __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] =
xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; /* flags_start is only OR-ed into the first block */ + } + + transpose_vecs(h_vecs); + storeu(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +#if !defined(BLAKE3_NO_SSE41) +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#else +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + +void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { /* hash DEGREE inputs per pass, advancing inputs, out and (optionally) counter */ + blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } /* fewer than DEGREE inputs remain; the narrower implementation below handles them */ +#if !defined(BLAKE3_NO_SSE41) + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); +#else + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +#endif +} diff --git a/third-party/blake3/c/blake3_avx2_x86-64_unix.S b/third-party/blake3/c/blake3_avx2_x86-64_unix.S new file mode 100644 index 00000000..812bb856 ---
/dev/null +++ b/third-party/blake3/c/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, 
ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 
ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, 
ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + 
vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + 
vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, 
ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor 
ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor 
ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 
+ vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, 
ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor 
ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, 
ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, 
ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, 
ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor 
ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, 
ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, 
ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr 
[rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, 
ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, 
ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps 
xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, 
ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword 
ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 
214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/third-party/blake3/c/blake3_avx2_x86-64_windows_gnu.S b/third-party/blake3/c/blake3_avx2_x86-64_windows_gnu.S new file mode 100644 index 00000000..3d4be4a7 --- /dev/null +++ b/third-party/blake3/c/blake3_avx2_x86-64_windows_gnu.S @@ -0,0 +1,1817 @@ +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 
+.section .text + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x2D0], xmm6 + vmovdqa xmmword ptr [rsp+0x2E0], xmm7 + vmovdqa xmmword ptr [rsp+0x2F0], xmm8 + vmovdqa xmmword ptr [rsp+0x300], xmm9 + vmovdqa xmmword ptr [rsp+0x310], xmm10 + vmovdqa xmmword ptr [rsp+0x320], xmm11 + vmovdqa xmmword ptr [rsp+0x330], xmm12 + vmovdqa xmmword ptr [rsp+0x340], xmm13 + vmovdqa xmmword ptr [rsp+0x350], xmm14 + vmovdqa xmmword ptr [rsp+0x360], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x260], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x2A0], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2C0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, 
byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2C0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 
0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword 
ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x220] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x240] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld 
ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, 
ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, 
ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + 
vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd 
ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 
+ vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, 
ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, 
ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + 
vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 
+ vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, 
ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, 
ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + 
vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd 
ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + 
vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x2A0] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x220] + vmovdqa ymmword ptr [rsp+0x220], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x240] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x2D0] + vmovdqa xmm7, xmmword ptr [rsp+0x2E0] + vmovdqa xmm8, xmmword ptr [rsp+0x2F0] + vmovdqa xmm9, xmmword ptr [rsp+0x300] + vmovdqa xmm10, xmmword ptr [rsp+0x310] + vmovdqa xmm11, xmmword ptr [rsp+0x320] + vmovdqa xmm12, xmmword ptr [rsp+0x330] + vmovdqa xmm13, xmmword ptr [rsp+0x340] + vmovdqa xmm14, xmmword ptr [rsp+0x350] + vmovdqa xmm15, xmmword ptr [rsp+0x360] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x2C0] + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x220] + vbroadcasti128 ymm13, xmmword ptr 
[rsp+0x240] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 
0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + 
vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr 
[rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x260] + vmovaps xmm0, xmmword ptr [rsp+0x220] + vmovaps xmm1, xmmword ptr [rsp+0x230] + vmovaps xmm2, xmmword ptr [rsp+0x240] + vmovaps xmm3, xmmword ptr [rsp+0x250] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x220], xmm0 + vmovaps xmmword ptr [rsp+0x240], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x220] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x224] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x244], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, 
xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x260] + vmovaps ymm0, ymmword ptr [rsp+0x220] + vmovups ymm1, 
ymmword ptr [rsp+0x228] + vmovaps ymm2, ymmword ptr [rsp+0x240] + vmovups ymm3, ymmword ptr [rsp+0x248] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x220], ymm0 + vmovaps ymmword ptr [rsp+0x240], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x220] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x240], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, 
xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.section .rdata +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git 
a/third-party/blake3/c/blake3_avx2_x86-64_windows_msvc.asm b/third-party/blake3/c/blake3_avx2_x86-64_windows_msvc.asm new file mode 100644 index 00000000..352298ed --- /dev/null +++ b/third-party/blake3/c/blake3_avx2_x86-64_windows_msvc.asm @@ -0,0 +1,1828 @@ +public _blake3_hash_many_avx2 +public blake3_hash_many_avx2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx2 PROC +_blake3_hash_many_avx2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 880 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+2D0H], xmm6 + vmovdqa xmmword ptr [rsp+2E0H], xmm7 + vmovdqa xmmword ptr [rsp+2F0H], xmm8 + vmovdqa xmmword ptr [rsp+300H], xmm9 + vmovdqa xmmword ptr [rsp+310H], xmm10 + vmovdqa xmmword ptr [rsp+320H], xmm11 + vmovdqa xmmword ptr [rsp+330H], xmm12 + vmovdqa xmmword ptr [rsp+340H], xmm13 + vmovdqa xmmword ptr [rsp+350H], xmm14 + vmovdqa xmmword ptr [rsp+360H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+260H], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0] + vpand ymm2, ymm0, ymmword ptr [ADD1] + vmovdqa ymmword ptr [rsp+2A0H], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+220H], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm3 + shl rdx, 6 + mov qword ptr [rsp+2C0H], rdx + cmp rsi, 8 + jc final7blocks +outerloop8: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, 
dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+2C0H] + cmove eax, ebx + mov dword ptr [rsp+200H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+20H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+40H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+60H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+80H], ymm8 + vshufps ymm9, ymm12, ymm14, 
221 + vmovaps ymmword ptr [rsp+0A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0E0H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+100H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+120H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+140H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+160H], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+180H], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+1A0H], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+1C0H], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+1E0H], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+200H] + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte 
ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+220H] + vpxor ymm13, ymm1, ymmword ptr [rsp+240H] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 
+ vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+100H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, 
xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd 
ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+20H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + 
vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld 
ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+160H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, 
ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 
+ vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+80H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0A0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + 
vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+180H] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+120H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+1E0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+140H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld 
ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+40H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+60H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, 
ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+120H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+160H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+100H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1E0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+180H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+20H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, 
ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1A0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+40H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+80H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+60H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+140H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0C0H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 
+ vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+160H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0A0H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+20H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+100H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1E0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+120H] + vpaddd ymm3, ymm3, 
ymmword ptr [rsp+0C0H] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+1C0H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+40H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+60H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0E0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+200H], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, 
ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+140H] + vpaddd ymm1, ymm1, ymmword ptr [rsp+180H] + vpaddd ymm2, ymm2, ymmword ptr [rsp+80H] + vpaddd ymm3, ymm3, ymmword ptr [rsp+1A0H] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+200H] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps 
ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+2A0H] + vpaddd ymm1, ymm0, ymmword ptr [rsp+220H] + vmovdqa ymmword ptr [rsp+220H], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+240H] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+240H], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + sub rsi, 8 + cmp rsi, 8 + jnc outerloop8 + test rsi, rsi + jnz final7blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+2D0H] + vmovdqa xmm7, xmmword ptr [rsp+2E0H] + vmovdqa xmm8, xmmword ptr [rsp+2F0H] + vmovdqa xmm9, xmmword ptr [rsp+300H] + vmovdqa xmm10, xmmword ptr [rsp+310H] + vmovdqa xmm11, xmmword ptr [rsp+320H] + vmovdqa xmm12, xmmword ptr [rsp+330H] + vmovdqa xmm13, xmmword ptr [rsp+340H] + vmovdqa xmm14, xmmword ptr [rsp+350H] + vmovdqa xmm15, xmmword ptr [rsp+360H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+2C0H] + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + test rsi, 4H + je final3blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 
ymm12, xmmword ptr [rsp+220H] + vbroadcasti128 ymm13, xmmword ptr [rsp+240H] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 50H + vpermq ymm15, ymm15, 50H + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN] + vpblendd ymm14, ymm14, ymm12, 44H + vpblendd ymm15, ymm15, ymm12, 44H + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+20H], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vmovups ymm2, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm3, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + vmovups ymm10, ymmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-40H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-30H], 01H + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-20H], 01H + vmovups ymm11, ymmword ptr [r10+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-10H], 01H + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 93H + vpshufd ymm15, ymm15, 93H + vpbroadcastd ymm2, dword ptr [rsp+200H] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+20H] + 
vpblendd ymm3, ymm3, ymm2, 88H + vpblendd ymm11, ymm11, ymm2, 88H + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vmovdqa ymm10, ymm2 + mov al, 7 +roundloop4: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+40H], ymm4 + nop + vmovdqa ymmword ptr [rsp+60H], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+80H], ymm5 + vmovdqa ymmword ptr [rsp+0A0H], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 93H + vpshufd ymm8, ymm8, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 39H + vpshufd ymm10, ymm10, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, 
ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 39H + vpshufd ymm8, ymm8, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm11, ymm11, 4EH + vpshufd ymm2, ymm2, 93H + vpshufd ymm10, ymm10, 93H + dec al + je endroundloop4 + vmovdqa ymm4, ymmword ptr [rsp+40H] + vmovdqa ymm5, ymmword ptr [rsp+80H] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0FH + vpshufd ymm4, ymm12, 39H + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0AAH + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 88H + vpshufd ymm12, ymm12, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymmword ptr [rsp+40H], ymm13 + vmovdqa ymmword ptr [rsp+80H], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+60H] + vmovdqa ymm13, ymmword ptr [rsp+0A0H] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0FH + vpshufd ymm12, ymm5, 39H + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0AAH + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 88H + vpshufd ymm5, ymm5, 78H + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 1EH + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+40H] + vmovdqa ymm6, ymmword ptr [rsp+80H] + jmp roundloop4 +endroundloop4: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 
xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqu xmmword ptr [rbx+40H], xmm8 + vmovdqu xmmword ptr [rbx+50H], xmm9 + vextracti128 xmmword ptr [rbx+60H], ymm8, 01H + vextracti128 xmmword ptr [rbx+70H], ymm9, 01H + vmovaps xmm8, xmmword ptr [rsp+260H] + vmovaps xmm0, xmmword ptr [rsp+220H] + vmovaps xmm1, xmmword ptr [rsp+230H] + vmovaps xmm2, xmmword ptr [rsp+240H] + vmovaps xmm3, xmmword ptr [rsp+250H] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+220H], xmm0 + vmovaps xmmword ptr [rsp+240H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test rsi, 2H + je final1blocks + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp+220H] + vpinsrd xmm13, xmm13, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+224H] + vpinsrd xmm14, xmm14, dword ptr [rsp+244H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + vbroadcasti128 ymm14, xmmword ptr [ROT16] + vbroadcasti128 ymm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+200H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+200H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 
01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovaps ymm8, ymmword ptr [rsp+260H] + vmovaps ymm0, ymmword ptr [rsp+220H] + vmovups ymm1, 
ymmword ptr [rsp+228H] + vmovaps ymm2, ymmword ptr [rsp+240H] + vmovups ymm3, ymmword ptr [rsp+248H] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+220H], ymm0 + vmovaps ymmword ptr [rsp+240H], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1blocks: + test rsi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm3, dword ptr [rsp+220H] + vpinsrd xmm3, xmm3, dword ptr [rsp+240H], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm14, xmmword ptr [ROT16] + vmovdqa xmm15, xmmword ptr [ROT8] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 
+ vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx2 ENDP +blake3_hash_many_avx2 ENDP +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + +ADD1: + dd 8 dup (8) + +BLAKE3_IV_0: + dd 8 dup (6A09E667H) + +BLAKE3_IV_1: + dd 8 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 8 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 8 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 8 dup (64) + +ROT16: + db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 + +ROT8: + db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +_RDATA ENDS +END diff --git a/third-party/blake3/c/blake3_avx512.c b/third-party/blake3/c/blake3_avx512.c new file mode 100644 index 00000000..d6b1ae9b --- /dev/null +++ b/third-party/blake3/c/blake3_avx512.c @@ -0,0 +1,1220 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu_128(const
uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE __m256i loadu_256(const uint8_t src[32]) { + return _mm256_loadu_si256((const __m256i *)src); +} + +INLINE __m512i loadu_512(const uint8_t src[64]) { + return _mm512_loadu_si512((const __m512i *)src); +} + +INLINE void storeu_128(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE void storeu_256(__m256i src, uint8_t dest[16]) { + _mm256_storeu_si256((__m256i *)dest, src); +} + +INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } + +INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); } + +INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } + +INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); } + +INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } + +INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); } + +INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); } + +INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); } + +INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); } + +INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); } + +INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); } + +INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); } + +INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); } + +INLINE __m512i rot8_512(__m512i 
x) { return _mm512_ror_epi32(x, 8); } + +INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); } + +INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); } + +INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); } + +/* + * ---------------------------------------------------------------------------- + * compress_avx512 + * ---------------------------------------------------------------------------- + */ +/* g1: first half of a mixing step over four 4-lane rows, updated in place. row0 gains m and row1; row3 is xored with row0 and rotated right 16; row2 gains row3; row1 is xored with row2 and rotated right 12. */ +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot16_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot12_128(*row1); +} +/* g2: second half of the mixing step; same sequence as g1 but with right-rotations of 8 (row3) and 7 (row1). */ +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = add_128(add_128(*row0, m), *row1); + *row3 = xor_128(*row3, *row0); + *row3 = rot8_128(*row3); + *row2 = add_128(*row2, *row3); + *row1 = xor_128(*row1, *row2); + *row1 = rot7_128(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} +/* Inverse of diagonalize: row0 and row2 get the opposite lane rotations, row3 gets the same two-lane swap again. */ +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} +/* compress_pre: fills rows[0..3] with cv[0..3], cv[4..7], IV[0..3] and (counter low, counter high, block_len, flags), loads the block as four 16-byte vectors m0..m3, then runs seven rounds of g1/g2 with diagonalize/undiagonalize between halves. The mixed state is left in rows[]; it is not yet combined with cv. */ +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu_128((uint8_t *)&cv[0]); + rows[1] = loadu_128((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + 
undiagonalize(&rows[0], &rows[2], &rows[3]); +} +/* Compresses one block and writes all 64 output bytes: rows[0]^rows[2] and rows[1]^rows[3] to out[0..31], then rows[2]^cv[0..3] and rows[3]^cv[4..7] to out[32..63]. cv is read only. */ +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), &out[0]); + storeu_128(xor_128(rows[1], rows[3]), &out[16]); + storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]); + storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); +} +/* Compresses one block and overwrites cv with rows[0]^rows[2] (words 0-3) and rows[1]^rows[3] (words 4-7). */ +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +/* + * ---------------------------------------------------------------------------- + * hash4_avx512 + * ---------------------------------------------------------------------------- + */ +/* round_fn4: one round over sixteen state vectors v[0..15], updated in place. Message vectors are picked from m[] through MSG_SCHEDULE[r]. The first half mixes the groups (v[i], v[i+4], v[i+8], v[i+12]); the second half mixes the diagonal groups such as (v[0], v[5], v[10], v[15]). */ +INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4]
= rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], 
m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(__m128i vecs[4]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + uint64_t mask = (increment_counter ? 
~0 : 0); + __m256i mask_vec = _mm256_set1_epi64x(mask); + __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); + deltas = _mm256_and_si256(mask_vec, deltas); + __m256i counters = + _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas); + *out_lo = _mm256_cvtepi64_epi32(counters); + *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32)); +} + +static +void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1_128(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + 
block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash8_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[15] = rot16_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot12_256(v[4]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_256(v[2], 
m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_256(v[0], v[4]); + v[1] = add_256(v[1], v[5]); + v[2] = add_256(v[2], v[6]); + v[3] = add_256(v[3], v[7]); + v[12] = xor_256(v[12], v[0]); + v[13] = xor_256(v[13], v[1]); + v[14] = xor_256(v[14], v[2]); + v[15] = xor_256(v[15], v[3]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[15] = rot8_256(v[15]); + v[8] = add_256(v[8], v[12]); + v[9] = add_256(v[9], v[13]); + v[10] = add_256(v[10], v[14]); + v[11] = add_256(v[11], v[15]); + v[4] = xor_256(v[4], v[8]); + v[5] = xor_256(v[5], v[9]); + v[6] = xor_256(v[6], v[10]); + v[7] = xor_256(v[7], v[11]); + v[4] = rot7_256(v[4]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_256(v[0], v[5]); + v[1] = add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot16_256(v[15]); + v[12] = rot16_256(v[12]); + v[13] = rot16_256(v[13]); + v[14] = rot16_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot12_256(v[5]); + v[6] = rot12_256(v[6]); + v[7] = rot12_256(v[7]); + v[4] = rot12_256(v[4]); + v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_256(v[0], v[5]); + v[1] = 
add_256(v[1], v[6]); + v[2] = add_256(v[2], v[7]); + v[3] = add_256(v[3], v[4]); + v[15] = xor_256(v[15], v[0]); + v[12] = xor_256(v[12], v[1]); + v[13] = xor_256(v[13], v[2]); + v[14] = xor_256(v[14], v[3]); + v[15] = rot8_256(v[15]); + v[12] = rot8_256(v[12]); + v[13] = rot8_256(v[13]); + v[14] = rot8_256(v[14]); + v[10] = add_256(v[10], v[15]); + v[11] = add_256(v[11], v[12]); + v[8] = add_256(v[8], v[13]); + v[9] = add_256(v[9], v[14]); + v[5] = xor_256(v[5], v[10]); + v[6] = xor_256(v[6], v[11]); + v[7] = xor_256(v[7], v[8]); + v[4] = xor_256(v[4], v[9]); + v[5] = rot7_256(v[5]); + v[6] = rot7_256(v[6]); + v[7] = rot7_256(v[7]); + v[4] = rot7_256(v[4]); +} + +INLINE void transpose_vecs_256(__m256i vecs[8]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high + // is 22/33/66/77. + __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is + // 11/33. + __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); + vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20); + vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); + vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); + vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); + vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31); + vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31); + vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31); +} + +INLINE void transpose_msg_vecs8(const uint8_t *const *inputs, + size_t block_offset, __m256i out[16]) { + out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]); + out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]); + out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]); + out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); + out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]); + out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]); + out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]); + out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]); + out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]); + out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]); + out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]); + out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); + out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]); + out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]); + out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]); + out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]); + for (size_t i = 0; i < 8; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_256(&out[0]); + transpose_vecs_256(&out[8]); +} + +INLINE void load_counters8(uint64_t counter, bool increment_counter, + 
__m256i *out_lo, __m256i *out_hi) { + uint64_t mask = (increment_counter ? ~0 : 0); + __m512i mask_vec = _mm512_set1_epi64(mask); + __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + deltas = _mm512_and_si512(mask_vec, deltas); + __m512i counters = + _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas); + *out_lo = _mm512_cvtepi64_epi32(counters); + *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32)); +} + +static +void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m256i h_vecs[8] = { + set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), + set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]), + }; + __m256i counter_low_vec, counter_high_vec; + load_counters8(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN); + __m256i block_flags_vec = set1_256(block_flags); + __m256i msg_vecs[16]; + transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m256i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn8(v, msg_vecs, 0); + round_fn8(v, msg_vecs, 1); + round_fn8(v, msg_vecs, 2); + round_fn8(v, msg_vecs, 3); + round_fn8(v, msg_vecs, 4); + round_fn8(v, msg_vecs, 5); + round_fn8(v, msg_vecs, 6); + h_vecs[0] = xor_256(v[0], v[8]); + h_vecs[1] = xor_256(v[1], v[9]); + h_vecs[2] = xor_256(v[2], v[10]); + h_vecs[3] = xor_256(v[3], v[11]); + h_vecs[4] = xor_256(v[4], v[12]); + h_vecs[5] = xor_256(v[5], 
v[13]); + h_vecs[6] = xor_256(v[6], v[14]); + h_vecs[7] = xor_256(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_256(h_vecs); + storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]); + storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]); + storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]); + storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); + storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]); + storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]); + storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]); + storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash16_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_512(v[0], v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[15] = rot16_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot12_512(v[4]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_512(v[0], 
v[4]); + v[1] = add_512(v[1], v[5]); + v[2] = add_512(v[2], v[6]); + v[3] = add_512(v[3], v[7]); + v[12] = xor_512(v[12], v[0]); + v[13] = xor_512(v[13], v[1]); + v[14] = xor_512(v[14], v[2]); + v[15] = xor_512(v[15], v[3]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[15] = rot8_512(v[15]); + v[8] = add_512(v[8], v[12]); + v[9] = add_512(v[9], v[13]); + v[10] = add_512(v[10], v[14]); + v[11] = add_512(v[11], v[15]); + v[4] = xor_512(v[4], v[8]); + v[5] = xor_512(v[5], v[9]); + v[6] = xor_512(v[6], v[10]); + v[7] = xor_512(v[7], v[11]); + v[4] = rot7_512(v[4]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot16_512(v[15]); + v[12] = rot16_512(v[12]); + v[13] = rot16_512(v[13]); + v[14] = rot16_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot12_512(v[5]); + v[6] = rot12_512(v[6]); + v[7] = rot12_512(v[7]); + v[4] = rot12_512(v[4]); + v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_512(v[0], v[5]); + v[1] = add_512(v[1], v[6]); + v[2] = add_512(v[2], v[7]); + v[3] = add_512(v[3], v[4]); + v[15] = xor_512(v[15], v[0]); + 
v[12] = xor_512(v[12], v[1]); + v[13] = xor_512(v[13], v[2]); + v[14] = xor_512(v[14], v[3]); + v[15] = rot8_512(v[15]); + v[12] = rot8_512(v[12]); + v[13] = rot8_512(v[13]); + v[14] = rot8_512(v[14]); + v[10] = add_512(v[10], v[15]); + v[11] = add_512(v[11], v[12]); + v[8] = add_512(v[8], v[13]); + v[9] = add_512(v[9], v[14]); + v[5] = xor_512(v[5], v[10]); + v[6] = xor_512(v[6], v[11]); + v[7] = xor_512(v[7], v[8]); + v[4] = xor_512(v[4], v[9]); + v[5] = rot7_512(v[5]); + v[6] = rot7_512(v[6]); + v[7] = rot7_512(v[7]); + v[4] = rot7_512(v[4]); +} + +// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order +#define LO_IMM8 0x88 + +INLINE __m512i unpack_lo_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, LO_IMM8); +} + +// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order +#define HI_IMM8 0xdd + +INLINE __m512i unpack_hi_128(__m512i a, __m512i b) { + return _mm512_shuffle_i32x4(a, b, HI_IMM8); +} + +INLINE void transpose_vecs_512(__m512i vecs[16]) { + // Interleave 32-bit lanes. The _0 unpack is lanes + // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes + // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. 
+ __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]); + __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]); + __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); + __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); + __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]); + __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]); + __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]); + __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]); + __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]); + __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]); + __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]); + __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]); + __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]); + __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]); + __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]); + __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]); + + // Interleave 64-bit lanes. The _0 unpack is lanes + // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes + // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes + // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes + // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. 
+ __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0); + __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0); + __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2); + __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2); + __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0); + __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0); + __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2); + __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2); + __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0); + __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0); + __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2); + __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2); + __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0); + __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0); + __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2); + __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2); + + // Interleave 128-bit lanes. The _0 unpack is + // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is + // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on. + __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0); + __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1); + __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2); + __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3); + __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0); + __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1); + __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2); + __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3); + __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0); + __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1); + __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2); + __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3); + __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0); + __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1); + __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2); + __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3); + + // Interleave 128-bit lanes again for the final outputs. 
+ vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0); + vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1); + vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2); + vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); + vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4); + vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5); + vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6); + vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7); + vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0); + vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1); + vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2); + vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3); + vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4); + vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5); + vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6); + vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7); +} + +INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, + size_t block_offset, __m512i out[16]) { + out[0] = loadu_512(&inputs[0][block_offset]); + out[1] = loadu_512(&inputs[1][block_offset]); + out[2] = loadu_512(&inputs[2][block_offset]); + out[3] = loadu_512(&inputs[3][block_offset]); + out[4] = loadu_512(&inputs[4][block_offset]); + out[5] = loadu_512(&inputs[5][block_offset]); + out[6] = loadu_512(&inputs[6][block_offset]); + out[7] = loadu_512(&inputs[7][block_offset]); + out[8] = loadu_512(&inputs[8][block_offset]); + out[9] = loadu_512(&inputs[9][block_offset]); + out[10] = loadu_512(&inputs[10][block_offset]); + out[11] = loadu_512(&inputs[11][block_offset]); + out[12] = loadu_512(&inputs[12][block_offset]); + out[13] = loadu_512(&inputs[13][block_offset]); + out[14] = loadu_512(&inputs[14][block_offset]); + out[15] = loadu_512(&inputs[15][block_offset]); + for (size_t i = 0; i < 16; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs_512(out); +} + +INLINE void load_counters16(uint64_t counter, bool increment_counter, + __m512i *out_lo, __m512i *out_hi) { + const __m512i mask = 
_mm512_set1_epi32(-(int32_t)increment_counter); + const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i masked_deltas = _mm512_and_si512(deltas, mask); + const __m512i low_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)counter), + masked_deltas); + // The carry bit is 1 if the high bit of the word was 1 before addition and is + // 0 after. + // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to + // compute the carry bits here, and originally we did, but that intrinsic is + // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271. + const __m512i carries = _mm512_srli_epi32( + _mm512_andnot_si512( + low_words, // 0 after (gets inverted by andnot) + _mm512_set1_epi32((int32_t)counter)), // and 1 before + 31); + const __m512i high_words = _mm512_add_epi32( + _mm512_set1_epi32((int32_t)(counter >> 32)), + carries); + *out_lo = low_words; + *out_hi = high_words; +} + +static +void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + __m512i h_vecs[8] = { + set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), + set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]), + }; + __m512i counter_low_vec, counter_high_vec; + load_counters16(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN); + __m512i block_flags_vec = set1_512(block_flags); + __m512i msg_vecs[16]; + transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m512i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + 
set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn16(v, msg_vecs, 0); + round_fn16(v, msg_vecs, 1); + round_fn16(v, msg_vecs, 2); + round_fn16(v, msg_vecs, 3); + round_fn16(v, msg_vecs, 4); + round_fn16(v, msg_vecs, 5); + round_fn16(v, msg_vecs, 6); + h_vecs[0] = xor_512(v[0], v[8]); + h_vecs[1] = xor_512(v[1], v[9]); + h_vecs[2] = xor_512(v[2], v[10]); + h_vecs[3] = xor_512(v[3], v[11]); + h_vecs[4] = xor_512(v[4], v[12]); + h_vecs[5] = xor_512(v[5], v[13]); + h_vecs[6] = xor_512(v[6], v[14]); + h_vecs[7] = xor_512(v[7], v[15]); + + block_flags = flags; + } + + // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8 + // state vectors. Pad the matrix with zeros. After transposition, store the + // lower half of each vector. + __m512i padded[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + set1_512(0), set1_512(0), set1_512(0), set1_512(0), + }; + transpose_vecs_512(padded); + _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0])); + _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1])); + _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2])); + _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); + _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4])); + _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5])); + _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6])); + _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7])); + _mm256_mask_storeu_epi32(&out[8 * 
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8])); + _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9])); + _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10])); + _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11])); + _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12])); + _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13])); + _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14])); + _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15])); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_avx512 + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_avx512(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { /* Serial path for one input: compresses `blocks` consecutive BLAKE3_BLOCK_LEN-byte blocks into a chaining value, passing the same counter to every block, then writes BLAKE3_OUT_LEN bytes of that value to out. */ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); /* chaining value starts as a copy of key */ + uint8_t block_flags = flags | flags_start; /* only the first block carries flags_start */ + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; /* last block; it is also the first one when blocks == 1 */ + } + blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; /* later blocks drop flags_start */ + } + memcpy(out, cv, BLAKE3_OUT_LEN); /* cv is copied out unchanged when blocks == 0 */ +} + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { /* Hashes num_inputs inputs of `blocks` blocks each and writes BLAKE3_OUT_LEN bytes per input to out, in input order. Inputs are consumed in the widest batch that still fits (16, then 8, then 4, then 1). When increment_counter is set, the counter handed to the next batch is advanced by the size of the batch just processed. */ + while (num_inputs >= 16) { + blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { +
counter += 16; + } + inputs += 16; + num_inputs -= 16; + out = &out[16 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 8) { + blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 8; + } + inputs += 8; + num_inputs -= 8; + out = &out[8 * BLAKE3_OUT_LEN]; + } + while (num_inputs >= 4) { + blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/third-party/blake3/c/blake3_avx512_x86-64_unix.S b/third-party/blake3/c/blake3_avx512_x86-64_unix.S new file mode 100644 index 00000000..a06aede0 --- /dev/null +++ b/third-party/blake3/c/blake3_avx512_x86-64_unix.S @@ -0,0 +1,2585 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR /* fallback: expands to nothing when no header defined it */ +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text +#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 +
vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, 
ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword 
ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + 
vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, 
zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, 
zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd 
zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, 
zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 
16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, 
zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord 
zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd 
zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + 
vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + 
vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, 
zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword 
ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, 
xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, 
ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, 
ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + 
vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + 
vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 
+ vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, 
ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, 
ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord 
ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd 
ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, 
ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord 
ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, 
ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr 
[r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr 
[rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + 
vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + 
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + 
vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + 
vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, 
xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/third-party/blake3/c/blake3_avx512_x86-64_windows_gnu.S b/third-party/blake3/c/blake3_avx512_x86-64_windows_gnu.S new file mode 100644 index 00000000..ba4fc5fa --- /dev/null +++ b/third-party/blake3/c/blake3_avx512_x86-64_windows_gnu.S @@ -0,0 +1,2615 @@ +.intel_syntax noprefix + +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 + +.section .text +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0xFFFFFFFFFFFFFFC0 + vmovdqa xmmword ptr [rsp+0x90], xmm6 + vmovdqa xmmword ptr [rsp+0xA0], xmm7 + vmovdqa xmmword ptr [rsp+0xB0], xmm8 + vmovdqa xmmword ptr [rsp+0xC0], xmm9 + vmovdqa xmmword ptr [rsp+0xD0], xmm10 + 
vmovdqa xmmword ptr [rsp+0xE0], xmm11 + vmovdqa xmmword ptr [rsp+0xF0], xmm12 + vmovdqa xmmword ptr [rsp+0x100], xmm13 + vmovdqa xmmword ptr [rsp+0x110], xmm14 + vmovdqa xmmword ptr [rsp+0x120], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + vmovdqa ymmword ptr [rsp+0x60], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + 
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, 
zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + 
vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 
+ vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, 
zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, 
zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd 
zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, 
zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 
16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, 
zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, 
zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + 
vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 
+ vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + 
vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x78] + jne 9b + mov rbx, qword ptr [rbp+0x90] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add 
rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x90], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jne 3f +4: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+0x90] + vmovdqa xmm7, xmmword ptr [rsp+0xA0] + vmovdqa xmm8, xmmword ptr [rsp+0xB0] + vmovdqa xmm9, xmmword ptr [rsp+0xC0] + vmovdqa xmm10, xmmword ptr [rsp+0xD0] + vmovdqa xmm11, xmmword ptr [rsp+0xE0] + vmovdqa xmm12, xmmword ptr [rsp+0xF0] + vmovdqa xmm13, xmmword ptr [rsp+0x100] + vmovdqa xmm14, xmmword ptr [rsp+0x110] + vmovdqa xmm15, xmmword ptr [rsp+0x120] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x78] + movzx ebx, byte ptr [rbp+0x80] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x88] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + 
vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + 
vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, 
ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, 
ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord 
ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd 
ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, 
ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord 
ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, 
ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 
+ vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + 
vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + 
vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x78] + jne 2b + mov rbx, qword ptr [rbp+0x90] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + 
vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x40] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x40], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x90], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x90] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x78] + movzx r12, byte ptr [rbp+0x88] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x40] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, 
byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, 
zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or 
eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, 
ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd 
xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd 
xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+0x10], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr [rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+0x10], xmm7 + vmovdqa xmmword ptr [rsp+0x20], xmm8 + vmovdqa xmmword ptr [rsp+0x30], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + movzx eax, byte ptr [rsp+0x70] + movzx r8d, r8b + mov r10, qword ptr [rsp+0x78] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups 
xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+0x20] + vmovups xmm9, xmmword ptr [rdx+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+0x10] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+0x10], xmm1 + vmovdqu xmmword ptr [r10+0x20], xmm2 + vmovdqu xmmword ptr [r10+0x30], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+0x10] + vmovdqa xmm8, xmmword ptr 
[rsp+0x20] + vmovdqa xmm9, xmmword ptr [rsp+0x30] + add rsp, 72 + ret + +.section .rdata +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/third-party/blake3/c/blake3_avx512_x86-64_windows_msvc.asm b/third-party/blake3/c/blake3_avx512_x86-64_windows_msvc.asm new file mode 100644 index 00000000..b19efbaa --- /dev/null +++ b/third-party/blake3/c/blake3_avx512_x86-64_windows_msvc.asm @@ -0,0 +1,2634 @@ +public _blake3_hash_many_avx512 +public blake3_hash_many_avx512 +public blake3_compress_in_place_avx512 +public _blake3_compress_in_place_avx512 +public blake3_compress_xof_avx512 +public _blake3_compress_xof_avx512 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_avx512 PROC +_blake3_hash_many_avx512 PROC + push r15 + push r14 + push r13 + push r12 + push rdi + push rsi + push rbx + push rbp + mov rbp, rsp + sub rsp, 304 + and rsp, 0FFFFFFFFFFFFFFC0H + vmovdqa xmmword ptr [rsp+90H], xmm6 + vmovdqa xmmword ptr [rsp+0A0H], xmm7 + vmovdqa xmmword ptr [rsp+0B0H], xmm8 + vmovdqa xmmword ptr [rsp+0C0H], xmm9 + vmovdqa xmmword ptr [rsp+0D0H], xmm10 + vmovdqa xmmword ptr [rsp+0E0H], xmm11 + vmovdqa xmmword ptr [rsp+0F0H], xmm12 + vmovdqa xmmword ptr [rsp+100H], xmm13 + vmovdqa xmmword ptr [rsp+110H], xmm14 + vmovdqa xmmword ptr [rsp+120H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd 
ymm2, ymm0, ymmword ptr [ADD0] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32] + vpcmpud k2, ymm2, ymm0, 1 + vpcmpud k3, ymm3, ymm0, 1 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. + vpbroadcastd ymm6, dword ptr [ADD1] + vpaddd ymm4 {k2}, ymm4, ymm6 + vpaddd ymm5 {k3}, ymm5, ymm6 + ; vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1] {1to8} + ; vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+20H], ymm3 + vmovdqa ymmword ptr [rsp+40H], ymm4 + vmovdqa ymmword ptr [rsp+60H], ymm5 + shl rdx, 6 + mov qword ptr [rsp+80H], rdx + cmp rsi, 16 + jc final15blocks +outerloop16: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+1H*4H] + vpbroadcastd zmm2, dword ptr [rcx+2H*4H] + vpbroadcastd zmm3, dword ptr [rcx+3H*4H] + vpbroadcastd zmm4, dword ptr [rcx+4H*4H] + vpbroadcastd zmm5, dword ptr [rcx+5H*4H] + vpbroadcastd zmm6, dword ptr [rcx+6H*4H] + vpbroadcastd zmm7, dword ptr [rcx+7H*4H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +ALIGN 16 +innerloop16: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr 
[rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H + vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H + vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0] + vmovdqa32 zmm31, zmmword ptr [INDEX1] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+40H] + mov r13, qword ptr [rdi+48H] + mov r14, qword ptr [rdi+50H] + mov r15, qword ptr [rdi+58H] + vmovdqu32 ymm24, ymmword ptr 
[r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + mov r8, qword ptr [rdi+20H] + mov r9, qword ptr [rdi+28H] + mov r10, qword ptr [rdi+30H] + mov r11, qword ptr [rdi+38H] + mov r12, qword ptr [rdi+60H] + mov r13, qword ptr [rdi+68H] + mov r14, qword ptr [rdi+70H] + mov r15, qword ptr [rdi+78H] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H + vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r12+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r13+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r14+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + prefetcht0 byte ptr [r15+rdx+80H] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, 
zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+1H*40H] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd zmm15, dword ptr [rsp+22H*4H] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord 
zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd 
zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 
12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, 
zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, 
zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, 
zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + 
vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + 
vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 
+ vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, 
zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop16 + mov rbx, qword ptr [rbp+90H] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 
+ vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 88H + vshufi32x4 zmm17, zmm1, zmm5, 88H + vshufi32x4 zmm18, zmm2, zmm6, 88H + vshufi32x4 zmm19, zmm3, zmm7, 88H + vshufi32x4 zmm20, zmm0, zmm4, 0DDH + vshufi32x4 zmm21, zmm1, zmm5, 0DDH + vshufi32x4 zmm22, zmm2, zmm6, 0DDH + vshufi32x4 zmm23, zmm3, zmm7, 0DDH + vshufi32x4 zmm0, zmm16, zmm17, 88H + vshufi32x4 zmm1, zmm18, zmm19, 88H + vshufi32x4 zmm2, zmm20, zmm21, 88H + vshufi32x4 zmm3, zmm22, zmm23, 88H + vshufi32x4 zmm4, zmm16, zmm17, 0DDH + vshufi32x4 zmm5, zmm18, zmm19, 0DDH + vshufi32x4 zmm6, zmm20, zmm21, 0DDH + vshufi32x4 zmm7, zmm22, zmm23, 0DDH + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+1H*40H], zmm1 + vmovdqu32 zmmword ptr [rbx+2H*40H], zmm2 + vmovdqu32 zmmword ptr [rbx+3H*40H], zmm3 + vmovdqu32 zmmword ptr [rbx+4H*40H], zmm4 + vmovdqu32 zmmword ptr [rbx+5H*40H], zmm5 + vmovdqu32 zmmword ptr [rbx+6H*40H], zmm6 + vmovdqu32 zmmword ptr [rbx+7H*40H], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+1H*40H] + vmovdqa32 zmm2, zmm0 + ; XXX: ml64.exe does not currently understand the syntax. We use a workaround. 
+ vpbroadcastd zmm4, dword ptr [ADD16] + vpbroadcastd zmm5, dword ptr [ADD1] + vpaddd zmm2{k1}, zmm0, zmm4 + ; vpaddd zmm2{k1}, zmm0, dword ptr [ADD16] ; {1to16} + vpcmpud k2, zmm2, zmm0, 1 + vpaddd zmm1 {k2}, zmm1, zmm5 + ; vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1] ; {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+1H*40H], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+90H], rbx + sub rsi, 16 + cmp rsi, 16 + jnc outerloop16 + test rsi, rsi + jne final15blocks +unwind: + vzeroupper + vmovdqa xmm6, xmmword ptr [rsp+90H] + vmovdqa xmm7, xmmword ptr [rsp+0A0H] + vmovdqa xmm8, xmmword ptr [rsp+0B0H] + vmovdqa xmm9, xmmword ptr [rsp+0C0H] + vmovdqa xmm10, xmmword ptr [rsp+0D0H] + vmovdqa xmm11, xmmword ptr [rsp+0E0H] + vmovdqa xmm12, xmmword ptr [rsp+0F0H] + vmovdqa xmm13, xmmword ptr [rsp+100H] + vmovdqa xmm14, xmmword ptr [rsp+110H] + vmovdqa xmm15, xmmword ptr [rsp+120H] + mov rsp, rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final15blocks: + test esi, 8H + je final7blocks + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+4H] + vpbroadcastd ymm2, dword ptr [rcx+8H] + vpbroadcastd ymm3, dword ptr [rcx+0CH] + vpbroadcastd ymm4, dword ptr [rcx+10H] + vpbroadcastd ymm5, dword ptr [rcx+14H] + vpbroadcastd ymm6, dword ptr [rcx+18H] + vpbroadcastd ymm7, dword ptr [rcx+1CH] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov r12, qword ptr [rdi+20H] + mov r13, qword ptr [rdi+28H] + mov r14, qword ptr [rdi+30H] + mov r15, qword ptr [rdi+38H] + movzx eax, byte ptr [rbp+78H] + movzx ebx, byte ptr [rbp+80H] + or eax, ebx + xor edx, edx +innerloop8: + movzx ebx, byte ptr [rbp+88H] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+80H] + cmove eax, ebx + mov dword ptr [rsp+88H], eax + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-40H], 01H + vmovups xmm9, 
xmmword ptr [r9+rdx-40H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-40H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-40H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-40H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-40H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-40H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-30H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-30H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-30H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-30H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-30H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-30H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-30H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-20H], 01H + vmovups xmm9, xmmword ptr [r9+rdx-20H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-20H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-20H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-20H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-20H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-20H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-10H] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-10H], 01H + vmovups xmm9, xmmword 
ptr [r9+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-10H], 01H + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-10H] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-10H], 01H + vmovups xmm11, xmmword ptr [r11+rdx-10H] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-10H], 01H + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+40H] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN] + vpbroadcastd ymm15, dword ptr [rsp+88H] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, 
ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, 
ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord 
ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd 
ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord 
ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + 
vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, 
ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, 
ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd 
ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, 
ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+78H] + jne innerloop8 + mov rbx, qword ptr [rbp+90H] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, 
ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0CCH + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0CCH + vblendps ymm3, ymm12, ymm9, 0CCH + vperm2f128 ymm12, ymm1, ymm2, 20H + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0CCH + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 20H + vmovups ymmword ptr [rbx+20H], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0CCH + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0CCH + vblendps ymm14, ymm14, ymm13, 0CCH + vperm2f128 ymm8, ymm10, ymm14, 20H + vmovups ymmword ptr [rbx+40H], ymm8 + vblendps ymm15, ymm13, ymm15, 0CCH + vperm2f128 ymm13, ymm6, ymm15, 20H + vmovups ymmword ptr [rbx+60H], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 31H + vperm2f128 ymm11, ymm3, ymm4, 31H + vmovups ymmword ptr [rbx+80H], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 31H + vperm2f128 ymm15, ymm6, ymm15, 31H + vmovups ymmword ptr [rbx+0A0H], ymm11 + vmovups ymmword ptr [rbx+0C0H], ymm14 + vmovups ymmword ptr [rbx+0E0H], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+40H] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+1H*20H] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+3H*20H] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+40H], ymm2 + add rbx, 256 + mov qword ptr [rbp+90H], rbx + add rdi, 64 + sub rsi, 8 +final7blocks: + mov rbx, qword ptr [rbp+90H] + mov r15, qword ptr [rsp+80H] + movzx r13, byte ptr [rbp+78H] + movzx r12, byte ptr [rbp+88H] + test esi, 4H + je final3blocks + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+1H*10H] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+40H] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0DCH + vpermq ymm15, ymm15, 0DCH + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN] + vinserti64x4 zmm13, zmm14, ymm15, 01H + mov eax, 17476 + 
kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+22H*4H] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-1H*40H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-4H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-4H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-4H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-30H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-3H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-3H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-3H*10H], 03H + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-20H] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-2H*10H], 01H + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-2H*10H], 02H + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-2H*10H], 03H + vmovups zmm9, zmmword ptr [r8+rdx-10H] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-1H*10H], 01H + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-1H*10H], 02H + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-1H*10H], 03H + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 93H + vpshufd zmm7, zmm7, 93H + mov al, 7 +roundloop4: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord 
zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 93H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 39H + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 39H + vpshufd zmm3, zmm3, 4EH + vpshufd zmm2, zmm2, 93H + dec al + jz endroundloop4 + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0FH + vpshufd zmm4, zmm8, 39H + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 78H + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 1EH + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp roundloop4 +endroundloop4: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop4 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vextracti32x4 xmmword ptr [rbx+4H*10H], zmm0, 02H + vextracti32x4 xmmword ptr [rbx+5H*10H], zmm1, 02H + vextracti32x4 xmmword ptr [rbx+6H*10H], zmm0, 03H + vextracti32x4 xmmword ptr [rbx+7H*10H], zmm1, 03H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+1H*10H] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+5H*10H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +final3blocks: + test esi, 2H + je final1block + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+10H] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+40H], 1 + vpinsrd xmm13, xmm13, dword ptr 
[BLAKE3_BLOCK_LEN], 2 + vmovd xmm14, dword ptr [rsp+4H] + vpinsrd xmm14, xmm14, dword ptr [rsp+44H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vinserti128 ymm13, ymm13, xmm14, 01H + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+88H], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV] + vpbroadcastd ymm8, dword ptr [rsp+88H] + vpblendd ymm3, ymm13, ymm8, 88H + vmovups ymm8, ymmword ptr [r8+rdx-40H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-40H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-30H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-30H], 01H + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-20H] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-20H], 01H + vmovups ymm9, ymmword ptr [r8+rdx-10H] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-10H], 01H + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 93H + vpshufd ymm7, ymm7, 93H + mov al, 7 +roundloop2: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 93H + vpshufd ymm3, ymm3, 4EH + vpshufd ymm2, ymm2, 39H + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 39H + vpshufd ymm3, ymm3, 
4EH + vpshufd ymm2, ymm2, 93H + dec al + jz endroundloop2 + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0FH + vpshufd ymm4, ymm8, 39H + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0AAH + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 88H + vpshufd ymm8, ymm8, 78H + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 1EH + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp roundloop2 +endroundloop2: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + vextracti128 xmmword ptr [rbx+20H], ymm0, 01H + vextracti128 xmmword ptr [rbx+30H], ymm1, 01H + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+40H] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+8H] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+48H] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+40H], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+40H], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +ALIGN 16 +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-40H] + vmovups xmm9, xmmword ptr [r8+rdx-30H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-20H] + vmovups xmm9, xmmword ptr [r8+rdx-10H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +roundloop1: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 
+ vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+10H], xmm1 + jmp unwind + +_blake3_hash_many_avx512 ENDP +blake3_hash_many_avx512 ENDP + +ALIGN 16 +blake3_compress_in_place_avx512 PROC +_blake3_compress_in_place_avx512 PROC + sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword 
ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rcx], xmm0 + vmovdqu xmmword ptr [rcx+10H], xmm1 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_in_place_avx512 ENDP +blake3_compress_in_place_avx512 ENDP + +ALIGN 16 +blake3_compress_xof_avx512 PROC +_blake3_compress_xof_avx512 PROC + 
sub rsp, 72 + vmovdqa xmmword ptr [rsp], xmm6 + vmovdqa xmmword ptr [rsp+10H], xmm7 + vmovdqa xmmword ptr [rsp+20H], xmm8 + vmovdqa xmmword ptr [rsp+30H], xmm9 + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+10H] + movzx eax, byte ptr [rsp+70H] + movzx r8d, r8b + mov r10, qword ptr [rsp+78H] + shl rax, 32 + add r8, rax + vmovq xmm3, r9 + vmovq xmm4, r8 + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV] + vmovups xmm8, xmmword ptr [rdx] + vmovups xmm9, xmmword ptr [rdx+10H] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rdx+20H] + vmovups xmm9, xmmword ptr [rdx+30H] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 93H + vpshufd xmm7, xmm7, 93H + mov al, 7 +@@: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 93H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 39H + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 39H + vpshufd xmm3, xmm3, 4EH + vpshufd xmm2, xmm2, 93H + dec al + jz @F + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0FH + vpshufd xmm4, xmm8, 39H + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0AAH + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 88H + vpshufd xmm8, xmm8, 78H + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 1EH + 
vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp @B +@@: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, xmmword ptr [rcx] + vpxor xmm3, xmm3, xmmword ptr [rcx+10H] + vmovdqu xmmword ptr [r10], xmm0 + vmovdqu xmmword ptr [r10+10H], xmm1 + vmovdqu xmmword ptr [r10+20H], xmm2 + vmovdqu xmmword ptr [r10+30H], xmm3 + vmovdqa xmm6, xmmword ptr [rsp] + vmovdqa xmm7, xmmword ptr [rsp+10H] + vmovdqa xmm8, xmmword ptr [rsp+20H] + vmovdqa xmm9, xmmword ptr [rsp+30H] + add rsp, 72 + ret +_blake3_compress_xof_avx512 ENDP +blake3_compress_xof_avx512 ENDP + +_TEXT ENDS + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +INDEX0: + dd 0, 1, 2, 3, 16, 17, 18, 19 + dd 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + dd 4, 5, 6, 7, 20, 21, 22, 23 + dd 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: + dd 1 +ADD16: + dd 16 +BLAKE3_BLOCK_LEN: + dd 64 +ALIGN 64 +BLAKE3_IV: +BLAKE3_IV_0: + dd 06A09E667H +BLAKE3_IV_1: + dd 0BB67AE85H +BLAKE3_IV_2: + dd 03C6EF372H +BLAKE3_IV_3: + dd 0A54FF53AH + +_RDATA ENDS +END diff --git a/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml new file mode 100644 index 00000000..fff9f416 --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/Cargo.toml @@ -0,0 +1,29 @@ +# These are Rust bindings for the C implementation of BLAKE3. As there is a +# native (and faster) Rust implementation of BLAKE3 provided in this same repo, +# these bindings are not expected to be used in production. They're intended +# for testing and benchmarking. + +[package] +name = "blake3_c_rust_bindings" +version = "0.0.0" +description = "TESTING ONLY Rust bindings for the BLAKE3 C implementation" +edition = "2021" + +[features] +# By default the x86-64 build uses assembly implementations. This feature makes +# the build use the C intrinsics implementations instead. +prefer_intrinsics = [] +# Activate NEON bindings. 
We don't currently do any CPU feature detection for +# this. If this Cargo feature is on, the NEON gets used. +neon = [] + +[dev-dependencies] +arrayref = "0.3.5" +arrayvec = { version = "0.7.0", default-features = false } +page_size = "0.4.1" +rand = "0.7.2" +rand_chacha = "0.2.1" +reference_impl = { path = "../../reference_impl" } + +[build-dependencies] +cc = "1.0.48" diff --git a/third-party/blake3/c/blake3_c_rust_bindings/README.md b/third-party/blake3/c/blake3_c_rust_bindings/README.md new file mode 100644 index 00000000..c44726b9 --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/README.md @@ -0,0 +1,4 @@ +These are Rust bindings for the C implementation of BLAKE3. As there is +a native Rust implementation of BLAKE3 provided in this same repo, these +bindings are not expected to be used in production. They're intended for +testing and benchmarking. diff --git a/third-party/blake3/c/blake3_c_rust_bindings/benches/bench.rs b/third-party/blake3/c/blake3_c_rust_bindings/benches/bench.rs new file mode 100644 index 00000000..6e75351f --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/benches/bench.rs @@ -0,0 +1,393 @@ +#![feature(test)] + +extern crate test; + +use arrayref::array_ref; +use arrayvec::ArrayVec; +use rand::prelude::*; +use test::Bencher; + +const KIB: usize = 1024; +const MAX_SIMD_DEGREE: usize = 16; + +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; +const OUT_LEN: usize = 32; + +// This struct randomizes two things: +// 1. The actual bytes of input. +// 2. The page offset the input starts at. 
+pub struct RandomInput { + buf: Vec<u8>, + len: usize, + offsets: Vec<usize>, + offset_index: usize, +} + +impl RandomInput { + pub fn new(b: &mut Bencher, len: usize) -> Self { + b.bytes += len as u64; + let page_size: usize = page_size::get(); + let mut buf = vec![0u8; len + page_size]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut buf); + let mut offsets: Vec<usize> = (0..page_size).collect(); + offsets.shuffle(&mut rng); + Self { + buf, + len, + offsets, + offset_index: 0, + } + } + + pub fn get(&mut self) -> &[u8] { + let offset = self.offsets[self.offset_index]; + self.offset_index += 1; + if self.offset_index >= self.offsets.len() { + self.offset_index = 0; + } + &self.buf[offset..][..self.len] + } +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +fn bench_single_compression_fn(b: &mut Bencher, f: CompressInPlaceFn) { + let mut state = [1u32; 8]; + let mut r = RandomInput::new(b, 64); + let input = array_ref!(r.get(), 0, 64); + b.iter(|| unsafe { f(state.as_mut_ptr(), input.as_ptr(), 64, 0, 0) }); +} + +#[bench] +fn bench_single_compression_portable(b: &mut Bencher) { + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::blake3_compress_in_place_portable, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse2, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_single_compression_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_sse41, + ); +} + +#[bench] +fn bench_single_compression_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; 
+ } + bench_single_compression_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_compress_in_place_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +fn bench_many_chunks_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, CHUNK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + CHUNK_LEN / BLOCK_LEN, + [0u32; 8].as_ptr(), + 0, + true, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_chunks_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_chunks_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 
16, + ); +} + +#[bench] +#[cfg(feature = "neon")] +fn bench_many_chunks_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_chunks_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +// TODO: When we get const generics we can unify this with the chunks code. +fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) { + let mut inputs = Vec::new(); + for _ in 0..degree { + inputs.push(RandomInput::new(b, BLOCK_LEN)); + } + b.iter(|| { + let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs + .iter_mut() + .take(degree) + .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) + .collect(); + let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; + unsafe { + f( + input_arrays.as_ptr() as _, + input_arrays.len(), + 1, + [0u32; 8].as_ptr(), + 0, + false, + 0, + 0, + 0, + out.as_mut_ptr(), + ) + } + }); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse2(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse2, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_sse41(b: &mut Bencher) { + if !blake3_c_rust_bindings::sse41_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_sse41, + 4, + ); +} + +#[bench] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn bench_many_parents_avx2(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx2_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx2, + 8, + ); +} + +#[bench] +fn bench_many_parents_avx512(b: &mut Bencher) { + if !blake3_c_rust_bindings::avx512_detected() { + return; + } + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::x86::blake3_hash_many_avx512, + 16, + ); +} + +#[bench] +#[cfg(feature = 
"neon")] +fn bench_many_parents_neon(b: &mut Bencher) { + // When "neon" is on, NEON support is assumed. + bench_many_parents_fn( + b, + blake3_c_rust_bindings::ffi::neon::blake3_hash_many_neon, + 4, + ); +} + +fn bench_incremental(b: &mut Bencher, len: usize) { + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + hasher.update(input.get()); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} + +#[bench] +fn bench_incremental_0001_block(b: &mut Bencher) { + bench_incremental(b, BLOCK_LEN); +} + +#[bench] +fn bench_incremental_0001_kib(b: &mut Bencher) { + bench_incremental(b, 1 * KIB); +} + +#[bench] +fn bench_incremental_0002_kib(b: &mut Bencher) { + bench_incremental(b, 2 * KIB); +} + +#[bench] +fn bench_incremental_0004_kib(b: &mut Bencher) { + bench_incremental(b, 4 * KIB); +} + +#[bench] +fn bench_incremental_0008_kib(b: &mut Bencher) { + bench_incremental(b, 8 * KIB); +} + +#[bench] +fn bench_incremental_0016_kib(b: &mut Bencher) { + bench_incremental(b, 16 * KIB); +} + +#[bench] +fn bench_incremental_0032_kib(b: &mut Bencher) { + bench_incremental(b, 32 * KIB); +} + +#[bench] +fn bench_incremental_0064_kib(b: &mut Bencher) { + bench_incremental(b, 64 * KIB); +} + +#[bench] +fn bench_incremental_0128_kib(b: &mut Bencher) { + bench_incremental(b, 128 * KIB); +} + +#[bench] +fn bench_incremental_0256_kib(b: &mut Bencher) { + bench_incremental(b, 256 * KIB); +} + +#[bench] +fn bench_incremental_0512_kib(b: &mut Bencher) { + bench_incremental(b, 512 * KIB); +} + +#[bench] +fn bench_incremental_1024_kib(b: &mut Bencher) { + bench_incremental(b, 1024 * KIB); +} + +// This checks that update() splits up its input in increasing powers of 2, so +// that it can recover a high degree of parallelism when the number of bytes +// hashed so far is uneven. The performance of this benchmark should be +// reasonably close to bench_incremental_0064_kib, within 80% or so. 
When we +// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), +// performance was less than half. +#[bench] +fn bench_two_updates(b: &mut Bencher) { + let len = 65536; + let mut input = RandomInput::new(b, len); + b.iter(|| { + let mut hasher = blake3_c_rust_bindings::Hasher::new(); + let input = input.get(); + hasher.update(&input[..1]); + hasher.update(&input[1..]); + let mut out = [0; 32]; + hasher.finalize(&mut out); + out + }); +} diff --git a/third-party/blake3/c/blake3_c_rust_bindings/build.rs b/third-party/blake3/c/blake3_c_rust_bindings/build.rs new file mode 100644 index 00000000..624dbb97 --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/build.rs @@ -0,0 +1,190 @@ +use std::env; + +fn defined(var: &str) -> bool { + env::var_os(var).is_some() +} + +fn target_components() -> Vec<String> { + let target = env::var("TARGET").unwrap(); + target.split("-").map(|s| s.to_string()).collect() +} + +fn is_x86_64() -> bool { + target_components()[0] == "x86_64" +} + +fn is_x86_32() -> bool { + let arch = &target_components()[0]; + arch == "i386" || arch == "i586" || arch == "i686" +} + +fn is_armv7() -> bool { + target_components()[0] == "armv7" +} + +fn is_aarch64() -> bool { + target_components()[0] == "aarch64" +} + +// Windows targets may be using the MSVC toolchain or the GNU toolchain. The +// right compiler flags to use depend on the toolchain. (And we don't want to +// use flag_if_supported, because we don't want features to be silently +// disabled by old compilers.) +fn is_windows_msvc() -> bool { + // Some targets are only two components long, so check in steps. + target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "msvc" +} + +fn is_windows_gnu() -> bool { + // Some targets are only two components long, so check in steps. 
+ target_components()[1] == "pc" + && target_components()[2] == "windows" + && target_components()[3] == "gnu" +} + +fn new_build() -> cc::Build { + let mut build = cc::Build::new(); + if !is_windows_msvc() { + build.flag("-std=c11"); + } + build +} + +fn c_dir_path(filename: &str) -> String { + // The `cross` tool doesn't support reading files in parent directories. As a hacky workaround + // in `cross_test.sh`, we move the c/ directory around and set BLAKE3_C_DIR_OVERRIDE. Regular + // building and testing doesn't require this. + if let Ok(c_dir_override) = env::var("BLAKE3_C_DIR_OVERRIDE") { + c_dir_override + "/" + filename + } else { + "../".to_string() + filename + } +} + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut base_build = new_build(); + base_build.file(c_dir_path("blake3.c")); + base_build.file(c_dir_path("blake3_dispatch.c")); + base_build.file(c_dir_path("blake3_portable.c")); + base_build.compile("blake3_base"); + + if is_x86_64() && !defined("CARGO_FEATURE_PREFER_INTRINSICS") { + // On 64-bit, use the assembly implementations, unless the + // "prefer_intrinsics" feature is enabled. + if is_windows_msvc() { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_msvc.asm")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_msvc.asm")); + build.compile("blake3_asm"); + } else if is_windows_gnu() { + let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_sse41_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx2_x86-64_windows_gnu.S")); + build.file(c_dir_path("blake3_avx512_x86-64_windows_gnu.S")); + build.compile("blake3_asm"); + } else { + // All non-Windows implementations are assumed to support + // Linux-style assembly. These files do contain a small + // explicit workaround for macOS also. 
+ let mut build = new_build(); + build.file(c_dir_path("blake3_sse2_x86-64_unix.S")); + build.file(c_dir_path("blake3_sse41_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx2_x86-64_unix.S")); + build.file(c_dir_path("blake3_avx512_x86-64_unix.S")); + build.compile("blake3_asm"); + } + } else if is_x86_64() || is_x86_32() { + // Assembly implementations are only for 64-bit. On 32-bit, or if + // the "prefer_intrinsics" feature is enabled, use the + // intrinsics-based C implementations. These each need to be + // compiled separately, with the corresponding instruction set + // extension explicitly enabled in the compiler. + + let mut sse2_build = new_build(); + sse2_build.file(c_dir_path("blake3_sse2.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse2_build.flag("-msse2"); + } + sse2_build.compile("blake3_sse2"); + + let mut sse41_build = new_build(); + sse41_build.file(c_dir_path("blake3_sse41.c")); + if is_windows_msvc() { + // /arch:SSE2 is the default on x86 and undefined on x86_64: + // https://docs.microsoft.com/en-us/cpp/build/reference/arch-x86 + // It also includes SSE4.1 intrinsics: + // https://stackoverflow.com/a/32183222/823869 + } else { + sse41_build.flag("-msse4.1"); + } + sse41_build.compile("blake3_sse41"); + + let mut avx2_build = new_build(); + avx2_build.file(c_dir_path("blake3_avx2.c")); + if is_windows_msvc() { + avx2_build.flag("/arch:AVX2"); + } else { + avx2_build.flag("-mavx2"); + } + avx2_build.compile("blake3_avx2"); + + let mut avx512_build = new_build(); + avx512_build.file(c_dir_path("blake3_avx512.c")); + if is_windows_msvc() { + // Note that a lot of versions of MSVC don't support /arch:AVX512, + // and they'll discard it with a warning, hopefully leading to a + // build error. 
+ avx512_build.flag("/arch:AVX512"); + } else { + avx512_build.flag("-mavx512f"); + avx512_build.flag("-mavx512vl"); + } + avx512_build.compile("blake3_avx512"); + } + + // We only build NEON code here if + // 1) it's requested + // and 2) the root crate is not already building it. + // The only time this will really happen is if you build this + // crate by hand with the "neon" feature for some reason. + // + // In addition, 3) if the target is aarch64, NEON is on by default. + if defined("CARGO_FEATURE_NEON") || is_aarch64() { + let mut neon_build = new_build(); + neon_build.file(c_dir_path("blake3_neon.c")); + // ARMv7 platforms that support NEON generally need the following + // flags. AArch64 supports NEON by default and does not support -mfpu. + if is_armv7() { + neon_build.flag("-mfpu=neon-vfpv4"); + neon_build.flag("-mfloat-abi=hard"); + } + neon_build.compile("blake3_neon"); + } + + // The `cc` crate does not automatically emit rerun-if directives for the + // environment variables it supports, in particular for $CC. We expect to + // do a lot of benchmarking across different compilers, so we explicitly + // add the variables that we're likely to need. + println!("cargo:rerun-if-env-changed=CC"); + println!("cargo:rerun-if-env-changed=CFLAGS"); + + // Ditto for source files, though these shouldn't change as often. + for file in std::fs::read_dir("..")? { + println!( + "cargo:rerun-if-changed={}", + file?.path().to_str().expect("utf-8") + ); + } + + Ok(()) +} diff --git a/third-party/blake3/c/blake3_c_rust_bindings/cross_test.sh b/third-party/blake3/c/blake3_c_rust_bindings/cross_test.sh new file mode 100755 index 00000000..94d50aff --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/cross_test.sh @@ -0,0 +1,31 @@ +#! /usr/bin/env bash + +# This hacky script works around the fact that `cross test` does not support +# path dependencies. 
(It uses a docker shared folder to let the guest access +# project files, so parent directories aren't available.) Solve this problem by +# copying the entire project to a temp dir and rearranging paths to put "c" and +# "reference_impl" underneath "blake3_c_rust_bindings", so that everything is +# accessible. Hopefully this will just run on CI forever and no one will ever +# read this and discover my deep shame. + +set -e -u -o pipefail + +project_root="$(realpath "$(dirname "$BASH_SOURCE")/../..")" +tmpdir="$(mktemp -d)" +echo "Running cross tests in $tmpdir" +cd "$tmpdir" +git clone "$project_root" blake3 +mv blake3/c/blake3_c_rust_bindings . +mv blake3/reference_impl blake3_c_rust_bindings +mv blake3/c blake3_c_rust_bindings +cd blake3_c_rust_bindings +sed -i 's|reference_impl = { path = "../../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml + +export BLAKE3_C_DIR_OVERRIDE="./c" +cat > Cross.toml << EOF +[build.env] +passthrough = [ + "BLAKE3_C_DIR_OVERRIDE", +] +EOF +cross test "$@" diff --git a/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs new file mode 100644 index 00000000..41e4938b --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/lib.rs @@ -0,0 +1,306 @@ +//! These are Rust bindings for the C implementation of BLAKE3. As there is a +//! native (and faster) Rust implementation of BLAKE3 provided in this same +//! repo, these bindings are not expected to be used in production. They're +//! intended for testing and benchmarking. + +use std::ffi::{c_void, CString}; +use std::mem::MaybeUninit; + +#[cfg(test)] +mod test; + +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; +pub const OUT_LEN: usize = 32; + +// Feature detection functions for tests and benchmarks. Note that the C code +// does its own feature detection in blake3_dispatch.c. 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse2_detected() -> bool { + is_x86_feature_detected!("sse2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn sse41_detected() -> bool { + is_x86_feature_detected!("sse4.1") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx2_detected() -> bool { + is_x86_feature_detected!("avx2") +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub fn avx512_detected() -> bool { + is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") +} + +#[derive(Clone)] +pub struct Hasher(ffi::blake3_hasher); + +impl Hasher { + pub fn new() -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init(c_state.as_mut_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_keyed(key: &[u8; 32]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_keyed(c_state.as_mut_ptr(), key.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key(context: &str) -> Self { + let mut c_state = MaybeUninit::uninit(); + let context_c_string = CString::new(context).expect("valid C string, no null bytes"); + unsafe { + ffi::blake3_hasher_init_derive_key(c_state.as_mut_ptr(), context_c_string.as_ptr()); + Self(c_state.assume_init()) + } + } + + pub fn new_derive_key_raw(context: &[u8]) -> Self { + let mut c_state = MaybeUninit::uninit(); + unsafe { + ffi::blake3_hasher_init_derive_key_raw( + c_state.as_mut_ptr(), + context.as_ptr() as *const _, + context.len(), + ); + Self(c_state.assume_init()) + } + } + + pub fn update(&mut self, input: &[u8]) { + unsafe { + ffi::blake3_hasher_update(&mut self.0, input.as_ptr() as *const c_void, input.len()); + } + } + + pub fn finalize(&self, output: &mut [u8]) { + unsafe { + ffi::blake3_hasher_finalize(&self.0, output.as_mut_ptr(), output.len()); + } + } + + pub fn finalize_seek(&self, seek: u64, output: &mut [u8]) { + unsafe { + 
ffi::blake3_hasher_finalize_seek(&self.0, seek, output.as_mut_ptr(), output.len()); + } + } + + pub fn reset(&mut self) { + unsafe { + ffi::blake3_hasher_reset(&mut self.0); + } + } +} + +pub mod ffi { + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_chunk_state { + pub cv: [u32; 8usize], + pub chunk_counter: u64, + pub buf: [u8; 64usize], + pub buf_len: u8, + pub blocks_compressed: u8, + pub flags: u8, + } + + #[repr(C)] + #[derive(Copy, Clone)] + pub struct blake3_hasher { + pub key: [u32; 8usize], + pub chunk: blake3_chunk_state, + pub cv_stack_len: u8, + pub cv_stack: [u8; 1728usize], + } + + extern "C" { + // public interface + pub fn blake3_hasher_init(self_: *mut blake3_hasher); + pub fn blake3_hasher_init_keyed(self_: *mut blake3_hasher, key: *const u8); + pub fn blake3_hasher_init_derive_key( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_char, + ); + pub fn blake3_hasher_init_derive_key_raw( + self_: *mut blake3_hasher, + context: *const ::std::os::raw::c_void, + context_len: usize, + ); + pub fn blake3_hasher_update( + self_: *mut blake3_hasher, + input: *const ::std::os::raw::c_void, + input_len: usize, + ); + pub fn blake3_hasher_finalize(self_: *const blake3_hasher, out: *mut u8, out_len: usize); + pub fn blake3_hasher_finalize_seek( + self_: *const blake3_hasher, + seek: u64, + out: *mut u8, + out_len: usize, + ); + pub fn blake3_hasher_reset(self_: *mut blake3_hasher); + + // portable low-level functions + pub fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_portable( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_portable( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + + 
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub mod x86 { + extern "C" { + // SSE2 low level functions + pub fn blake3_compress_in_place_sse2( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // SSE4.1 low level functions + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX2 low level functions + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + + // AVX-512 low level functions + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + } + + #[cfg(feature = "neon")] + pub mod 
neon { + extern "C" { + // NEON low level functions + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } + } +} diff --git a/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs new file mode 100644 index 00000000..1fc077c8 --- /dev/null +++ b/third-party/blake3/c/blake3_c_rust_bindings/src/test.rs @@ -0,0 +1,570 @@ +// Most of this code is duplicated from the root `blake3` crate. Perhaps we +// could share more of it in the future. + +use crate::{BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::{array_mut_ref, array_ref}; +use arrayvec::ArrayVec; +use core::usize; +use rand::prelude::*; + +const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +// const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +// const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. 
+pub const TEST_KEY: [u8; 32] = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: [u32; 8] = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely that swapping any two adjacent input blocks or chunks will give the +// same answer. +fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +#[inline(always)] +fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +type CompressInPlaceFn = + unsafe extern "C" fn(cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe extern "C" fn( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. 
+ let counter = (5u64 << 32) + 6; + let flags = CHUNK_END | ROOT | KEYED_HASH; + + let mut portable_out = [0; 64]; + unsafe { + crate::ffi::blake3_compress_xof_portable( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + portable_out.as_mut_ptr(), + ); + } + + let mut test_state = initial_state; + unsafe { + compress_in_place_fn( + test_state.as_mut_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + ) + }; + let test_state_bytes = le_bytes_from_words_32(&test_state); + let mut test_xof = [0; 64]; + unsafe { + compress_xof_fn( + initial_state.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + test_xof.as_mut_ptr(), + ) + }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_compress_portable() { + test_compress_fn( + crate::ffi::blake3_compress_in_place_portable, + crate::ffi::blake3_compress_xof_portable, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse2() { + if !crate::sse2_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse2, + crate::ffi::x86::blake3_compress_xof_sse2, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_sse41() { + if !crate::sse41_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_sse41, + crate::ffi::x86::blake3_compress_xof_sse41, + ); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_compress_avx512() { + if !crate::avx512_detected() { + return; + } + test_compress_fn( + crate::ffi::x86::blake3_compress_in_place_avx512, + crate::ffi::x86::blake3_compress_xof_avx512, + ); +} + +type HashManyFn = unsafe extern "C" fn( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + 
increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, +); + +// A shared helper function for platform-specific tests. +pub fn test_hash_many_fn(hash_many_fn: HashManyFn) { + // Test a few different initial counter values. + // - 0: The base case. + // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... + let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + + // First hash chunks. + let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + portable_chunks_out.as_mut_ptr(), + ); + } + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + chunks.as_ptr() as _, + chunks.len(), + CHUNK_LEN / BLOCK_LEN, + TEST_KEY_WORDS.as_ptr(), + counter, + true, + KEYED_HASH, + CHUNK_START, + CHUNK_END, + test_chunks_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. 
+ let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + crate::ffi::blake3_hash_many_portable( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + portable_parents_out.as_mut_ptr(), + ); + } + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_fn( + parents.as_ptr() as _, + parents.len(), + 1, + TEST_KEY_WORDS.as_ptr(), + counter, + false, + KEYED_HASH | PARENT, + 0, + 0, + test_parents_out.as_mut_ptr(), + ); + } + for n in 0..NUM_INPUTS { + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } + } +} + +// Testing the portable implementation against itself is circular, but why not. +#[test] +fn test_hash_many_portable() { + test_hash_many_fn(crate::ffi::blake3_hash_many_portable); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse2() { + if !crate::sse2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_sse41() { + if !crate::sse41_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_sse41); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx2() { + if !crate::avx2_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx2); +} + +#[test] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn test_hash_many_avx512() { + if !crate::avx512_detected() { + return; + } + test_hash_many_fn(crate::ffi::x86::blake3_hash_many_avx512); +} + +#[test] +#[cfg(feature = "neon")] +fn test_hash_many_neon() { + 
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon); +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // the regular C string API + let mut test_hasher = crate::Hasher::new_derive_key(context); + test_hasher.update(input); + let mut test_out = [0; OUT]; + test_hasher.finalize(&mut test_out); + assert_eq!(test_out[..], expected_out[..]); + + // the raw bytes API + let mut test_hasher_raw = crate::Hasher::new_derive_key_raw(context.as_bytes()); + test_hasher_raw.update(input); + let mut test_out_raw = [0; OUT]; + test_hasher_raw.finalize(&mut test_out_raw); + assert_eq!(test_out_raw[..], expected_out[..]); + } + } +} + +fn reference_hash(input: 
&[u8]) -> [u8; OUT_LEN] { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; OUT_LEN]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. + let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let mut test_out = [0; OUT_LEN]; + test_hasher.finalize(&mut test_out); + + let expected = reference_hash(total_input); + assert_eq!(expected, test_out); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. 
+ let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. + for _ in 0..3 { + let input_len = rng.gen_range(0, INPUT_MAX + 1); + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + let mut test_out = [0; 32]; + hasher.finalize(&mut test_out); + assert_eq!(expected, test_out); + } +} + +#[test] +fn test_finalize_seek() { + let mut expected = [0; 1000]; + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(b"foobarbaz"); + reference_hasher.finalize(&mut expected); + } + + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(b"foobarbaz"); + + let mut out = [0; 103]; + for &seek in &[0, 1, 7, 59, 63, 64, 65, 501, expected.len() - out.len()] { + dbg!(seek); + test_hasher.finalize_seek(seek as u64, &mut out); + assert_eq!(&expected[seek..][..out.len()], &out[..]); + } +} + +#[test] +fn test_reset() { + { + let mut hasher = crate::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = [0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } + { + let key = &[99; 32]; + let mut hasher = crate::Hasher::new_keyed(key); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new_keyed(key); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = 
[0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } + { + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + let mut output = [0; 32]; + hasher.finalize(&mut output); + + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(&[42; CHUNK_LEN + 3]); + let mut reference_hash = [0; 32]; + reference_hasher.finalize(&mut reference_hash); + + assert_eq!(reference_hash, output); + } +} diff --git a/third-party/blake3/c/blake3_dispatch.c b/third-party/blake3/c/blake3_dispatch.c new file mode 100644 index 00000000..2ab0093e --- /dev/null +++ b/third-party/blake3/c/blake3_dispatch.c @@ -0,0 +1,276 @@ +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "blake3_impl.h" + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#elif defined(__GNUC__) +#include <immintrin.h> +#else +#undef IS_X86 /* Unimplemented! 
*/ +#endif +#endif + +#define MAYBE_UNUSED(x) (void)((x)) + +#if defined(IS_X86) +static uint64_t xgetbv(void) { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + +#endif + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +#if !defined(BLAKE3_TESTING) +static /* Allow the variable to be controlled manually for testing */ +#endif + enum cpu_feature g_cpu_features = UNDEFINED; + +#if !defined(BLAKE3_TESTING) +static +#endif + enum cpu_feature + get_cpu_features(void) { + + if (g_cpu_features != UNDEFINED) { + return g_cpu_features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3]; + (void)edx; + enum cpu_feature features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + g_cpu_features = features; + return features; +#else + /* How to detect NEON? 
*/ + return 0; +#endif + } +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + 
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#endif + +#if BLAKE3_USE_NEON == 1 + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +} + +// The dynamically detected SIMD degree of the current platform. 
+size_t blake3_simd_degree(void) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + return 8; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + return 4; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + return 4; + } +#endif +#endif +#if BLAKE3_USE_NEON == 1 + return 4; +#endif + return 1; +} diff --git a/third-party/blake3/c/blake3_impl.h b/third-party/blake3/c/blake3_impl.h new file mode 100644 index 00000000..3ba9ceb0 --- /dev/null +++ b/third-party/blake3/c/blake3_impl.h @@ -0,0 +1,281 @@ +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "blake3.h" + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. 
+#if defined(_MSC_VER) +#define INLINE static __forceinline +#else +#define INLINE static inline __attribute__((always_inline)) +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define IS_X86 +#define IS_X86_64 +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define IS_X86 +#define IS_X86_32 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#define IS_AARCH64 +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#endif + +#if !defined(BLAKE3_USE_NEON) + // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness + #if defined(IS_AARCH64) + #define BLAKE3_USE_NEON 1 + #else + #define BLAKE3_USE_NEON 0 + #endif +#endif + +#if defined(IS_X86) +#define MAX_SIMD_DEGREE 16 +#elif BLAKE3_USE_NEON == 1 +#define MAX_SIMD_DEGREE 4 +#else +#define MAX_SIMD_DEGREE 1 +#endif + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero.
*/ +static unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return index; +#elif defined(_MSC_VER) && defined(IS_X86_32) + if(x >> 32) { + unsigned long index; + _BitScanReverse(&index, (unsigned long)(x >> 32)); + return 32 + index; + } else { + unsigned long index; + _BitScanReverse(&index, (unsigned long)x); + return index; + } +#else + unsigned int c = 0; + if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if(x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. 
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { + return (uint32_t)(counter >> 32); +} + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, 
uint8_t flags_end, uint8_t *out); + +size_t blake3_simd_degree(void); + + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#if defined(IS_X86) +#if !defined(BLAKE3_NO_SSE2) +void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_SSE41) +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX2) +void blake3_hash_many_avx2(const uint8_t *const *inputs, 
size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#if !defined(BLAKE3_NO_AVX512) +void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif +#endif + +#if BLAKE3_USE_NEON == 1 +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); +#endif + + +#endif /* BLAKE3_IMPL_H */ diff --git a/third-party/blake3/c/blake3_neon.c b/third-party/blake3/c/blake3_neon.c new file mode 100644 index 00000000..8a818fc7 --- /dev/null +++ b/third-party/blake3/c/blake3_neon.c @@ -0,0 +1,368 @@ +#include "blake3_impl.h" + +#include <arm_neon.h> + +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + uint32x4_t x; + memcpy(&x, src, 16); + return x; +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it.
+ memcpy(dest, &src, 16); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + // The straightforward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +#if defined(__clang__) + return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); +#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 + static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; + return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); +#else + return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); +#endif +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + // See comment in rot16_128.
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); + return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); +} + +// TODO: compress_neon + +// TODO: hash2_neon + +/* + * ---------------------------------------------------------------------------- + * hash4_neon + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], 
v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], 
v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { + // Individually transpose the four 2x2 sub-matrices in each corner. + uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); + uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); + + // Swap the top-right and bottom-left 2x2s (which just got transposed). + vecs[0] = + vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); + vecs[1] = + vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); + vecs[2] = + vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); + vecs[3] = + vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, uint32x4_t out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); + out[14] = 
loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? ~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); +} + +void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + uint32x4_t h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + uint32x4_t block_flags_vec = set1_128(block_flags); + uint32x4_t msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + uint32x4_t v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, 
msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_neon + * ---------------------------------------------------------------------------- + */ + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +INLINE void hash_one_neon(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + // TODO: Implement compress_neon. 
However note that according to + // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, + // compress_neon might not be any faster than compress_portable. + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 4) { + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/third-party/blake3/c/blake3_portable.c b/third-party/blake3/c/blake3_portable.c new file mode 100644 index 00000000..062dd1b4 --- /dev/null +++ b/third-party/blake3/c/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" +#include <string.h> + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { + // Select the message
schedule based on the round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, 
&block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { 
+ block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/third-party/blake3/c/blake3_sse2.c b/third-party/blake3/c/blake3_sse2.c new file mode 100644 index 00000000..691e1c68 --- /dev/null +++ b/third-party/blake3/c/blake3_sse2.c @@ -0,0 +1,566 @@ +#include "blake3_impl.h" + +#include <immintrin.h> + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) + +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} + +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} + +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } + +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } + +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} + +INLINE __m128i rot16(__m128i x) { + return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1); +} + +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} + +INLINE __m128i rot8(__m128i x) { + return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8)); +} + +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} + +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} + +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} + +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} + +INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) { + const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + __m128i mask = _mm_set1_epi16(imm8); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)); +} + +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 
0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], 
&rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); +} + +void 
blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} + +void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} + +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] 
= addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); 
+ v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = 
loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} + +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} + +static +void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), 
set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} + +INLINE void hash_one_sse2(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while 
(num_inputs >= DEGREE) { + blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/third-party/blake3/c/blake3_sse2_x86-64_unix.S b/third-party/blake3/c/blake3_sse2_x86-64_unix.S new file mode 100644 index 00000000..99f033fe --- /dev/null +++ b/third-party/blake3/c/blake3_sse2_x86-64_unix.S @@ -0,0 +1,2291 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + 
psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr 
[r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + 
pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, 
xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword 
ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld 
xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + 
psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, 
xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa 
xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, 
xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd 
xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, 
xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 
+ pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, 
xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd 
xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw 
xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 
+ pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + 
movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword 
ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + 
pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + 
punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + 
movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor 
xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 /* blake3_compress_in_place_sse2: loads 32 bytes of state from [rdi] and 64 bytes of message from [rsi], runs the 7-iteration round loop below and stores the 32-byte result back to [rdi]. rcx, rdx and r8 supply scalar inputs that are packed into xmm3. Presumably rdi=cv, rsi=block, rdx=block_len, rcx=counter, r8=flags as in the C prototype, which is not visible in this file - confirm. */ +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm0 and xmm1 hold the 32 bytes read from [rdi]; xmm2 holds the four BLAKE3_IV words */ + shl r8, 32 + add rdx, r8 /* adds r8 shifted left by 32 into rdx. NOTE(review): r8 and rdx are used at full register width here, whereas blake3_compress_xof_sse2 zero-extends r8b and dl first; this relies on the caller passing values with clean upper bits - confirm. */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = rcx in the low qword, rdx in the high qword */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 /* shufps 136 gathers the even dwords and 221 the odd dwords of the first 32 message bytes into xmm4 and xmm5 */ + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* same even/odd split for the last 32 message bytes, each result then lane-rotated by pshufd 0x93 */ + mov al, 7 /* al = iterations remaining in the round loop */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* swapping the 16-bit halves of every dword rotates each dword by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* each dword of xmm1 rotated right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* each dword of xmm3 rotated right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* each dword of xmm1 rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* lane rotation of xmm0, xmm3 and xmm2 before the second half of the iteration; presumably the diagonal step of the BLAKE3 round - confirm */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotation, restoring the layout used at the loop head */ + dec al + jz 9f /* after the 7th iteration skip the message reshuffle and go to the output stage */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4,
xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 /* pand/pand/por merges lanes 0 and 2 of xmm9 with lanes 1 and 3 of xmm8; the mask names suggest this stands in for pblendw 0x33 - confirm */ + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 /* merges lanes 0 to 2 of xmm8 with lane 3 of xmm10 */ + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4 to xmm7 now hold the reshuffled message words consumed by the next iteration */ + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 /* output stage: xmm0 ^= xmm2 and xmm1 ^= xmm3 */ + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 /* the 32-byte result overwrites the input state at [rdi] */ + ret + +.p2align 6 /* blake3_compress_xof_sse2: same state and message loads and the same 7-iteration round loop as blake3_compress_in_place_sse2, but the input at [rdi] is left untouched and 64 bytes are written to [r9]. Presumably rdi=cv, rsi=block, dl=block_len, rcx=counter, r8b=flags, r9=out as in the C prototype, which is not visible in this file - confirm. */ +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax /* only the low bytes of r8 and rdx are used: rdx = dl in bits 0 to 7 and r8b in bits 32 to 39 */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = rcx in the low qword, rdx in the high qword */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm4 to xmm7 = the 64 message bytes split into even and odd dwords, the second pair lane-rotated */ + mov al, 7 /* al = iterations remaining in the round loop */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* rotate each dword by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* rotate right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* rotate right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* rotate right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 +
movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f /* after the 7th iteration skip the message reshuffle and go to the output stage */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4 to xmm7 now hold the reshuffled message words consumed by the next iteration */ + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] /* reload the untouched 32-byte input state; the message registers are no longer needed */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 /* 64 output bytes at [r9]: the first 32 are the folded state halves, the last 32 are xmm2 and xmm3 xor-ed with the input state */ + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 /* constant table, 64-byte aligned so every 16-byte entry below can be used as an aligned memory operand */ +BLAKE3_IV: /* four IV words loaded as a single vector by the two compress routines */ + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: /* lane indices 0 to 3; appears to provide the per-lane counter offsets in blake3_hash_many_sse2 - confirm */ + .long 0, 1, 2, 3 +ADD1: /* 4 in every lane; appears to be the per-batch counter step in blake3_hash_many_sse2 - confirm */ + .long 4, 4, 4, 4 +BLAKE3_IV_0: /* BLAKE3_IV_0 to BLAKE3_IV_3: each IV word repeated in all four lanes */ + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: /* the value 64 in every lane */ + .long 64, 64, 64, 64 +CMP_MSB_MASK: /* sign bit of every lane; blake3_hash_many_sse2 xors it into both operands ahead of pcmpgtd, which turns the signed compare into an unsigned one */ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: /* PBLENDW_* masks are dword lane selectors used with pand/por; this one keeps lanes 0 and 2 */ + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: /* keeps lanes 1 and 3 */ + .long 0x00000000,
0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: /* keeps lanes 0 to 2 */ + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: /* keeps lane 3 only */ + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/third-party/blake3/c/blake3_sse2_x86-64_windows_gnu.S b/third-party/blake3/c/blake3_sse2_x86-64_windows_gnu.S new file mode 100644 index 00000000..4facb50e --- /dev/null +++ b/third-party/blake3/c/blake3_sse2_x86-64_windows_gnu.S @@ -0,0 +1,2332 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +.section .text + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr
[rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr 
[r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + 
pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw 
xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + 
paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, 
xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld 
xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor 
xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + 
paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa 
xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + 
movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr 
[rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 
20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld 
xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 
+ pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, 
xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + 
pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + 
punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr [rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr 
[rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr 
[rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr 
[PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, 
byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por 
xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: /* Compresses one 64-byte block in place: reads 32 bytes of state from [rcx] and 64 message bytes from [rdx], runs 7 rounds, and writes the 32-byte result back to [rcx]. r8b, r9 and one stack byte are packed into xmm3 below. */ + sub rsp, 120 /* 120-byte frame; xmm6-xmm9, xmm11, xmm14 and xmm15 are spilled into it and reloaded before ret. NOTE(review): presumably because the Windows x64 ABI treats xmm6-xmm15 as callee-saved; confirm. */ + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm0, xmm1 = state loaded from [rcx]; xmm2 = the four dwords at BLAKE3_IV */ + movzx eax, byte ptr [rsp+0xA0] /* byte stack argument at entry rsp+0x28 (0xA0 minus the 120-byte frame); presumably the flags byte, confirm against the C prototype */ + movzx r8d, r8b + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 /* xmm3 dwords = { r9 low, r9 high, zero-extended r8b, stack byte } */ + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm4/xmm5 = even/odd dwords of message bytes 0-31; xmm6/xmm7 = even/odd dwords of bytes 32-63, each lane-rotated by pshufd 0x93 */ + mov al, 7 /* al counts rounds: the body at 9: executes 7 times */ +9: /* one round over the 16-dword working state held in xmm0-xmm3 */ + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* swaps the 16-bit halves of every dword, i.e. rotate by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* rotate each dword of xmm1 right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* rotate each dword of xmm3 right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* rotate each dword of xmm1 right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + 
pshufd xmm2, xmm2, 0x39 /* lane rotations of xmm0, xmm3 and xmm2 set up the second half of the round */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotations (0x39, 0x4E, 0x93) restore the original lane order */ + dec al + jz 9f /* leave the loop after the 7th round, skipping the message permutation */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4-xmm7 now hold the message dwords permuted for the next round; the pand/por pairs with the PBLENDW_* masks merge lanes from two registers */ + jmp 9b +9: /* output: xmm0 ^= xmm2, xmm1 ^= xmm3, then 32 bytes are stored back to [rcx] */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 /* spilled xmm registers were reloaded above; release the frame */ + ret + + +.p2align 6 +_blake3_compress_xof_sse2: +blake3_compress_xof_sse2: /* Same prologue and round loop as blake3_compress_in_place_sse2, but the result is 64 bytes written through a pointer taken from the stack instead of being stored back to [rcx]. */ + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm2, 
xmmword ptr [BLAKE3_IV+rip] /* blake3_compress_xof_sse2 continues: xmm0, xmm1 = state loaded from [rcx]; xmm2 = the four dwords at BLAKE3_IV */ + movzx eax, byte ptr [rsp+0xA0] /* byte stack argument at entry rsp+0x28 (0xA0 minus the 120-byte frame); presumably the flags byte, confirm against the C prototype */ + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] /* r10 = pointer stack argument at entry rsp+0x30; the 64 output bytes are stored through it at the end */ + shl rax, 32 + add r8, rax + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 /* xmm3 dwords = { r9 low, r9 high, zero-extended r8b, stack byte } */ + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm4/xmm5 = even/odd dwords of message bytes 0-31; xmm6/xmm7 = even/odd dwords of bytes 32-63, each lane-rotated by pshufd 0x93 */ + mov al, 7 /* al counts rounds: the body at 9: executes 7 times */ +9: /* one round over the 16-dword working state held in xmm0-xmm3 */ + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 /* swaps the 16-bit halves of every dword, i.e. rotate by 16 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* rotate each dword of xmm1 right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 /* rotate each dword of xmm3 right by 8 */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* rotate each dword of xmm1 right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* lane rotations of xmm0, xmm3 and xmm2 set up the second half of the round */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotations (0x39, 0x4E, 0x93) restore the original lane order */ + dec al + jz 9f /* leave the loop after the 7th round, skipping the message permutation */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, 
xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm14 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4-xmm7 now hold the message dwords permuted for the next round */ + jmp 9b +9: /* output: first 32 bytes = xmm0^xmm2 and xmm1^xmm3; last 32 bytes = xmm2 and xmm3 XORed with the input state re-read from [rcx]; [rcx] itself is not written */ + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] + add rsp, 120 /* spilled xmm registers were reloaded above; release the frame */ + ret + + +.section .rdata +.p2align 6 +BLAKE3_IV: /* four dwords loaded into xmm2 by both compress routines */ + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: /* per-lane offsets 0..3; NOTE(review): presumably added to the block counter in blake3_hash_many_sse2, confirm */ + .long 0, 1, 2, 3 +ADD1: /* constant 4 in every lane; NOTE(review): presumably the counter step of the 4-way loop, confirm */ + .long 4, 4, 4, 4 +BLAKE3_IV_0: /* BLAKE3_IV_0..BLAKE3_IV_3: each dword of BLAKE3_IV broadcast to all four lanes */ + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: /* 64 in every lane */ + .long 64, 64, 64, 64 +CMP_MSB_MASK: /* sign-bit mask; XORed into both operands before pcmpgtd in blake3_hash_many_sse2 so the signed compare orders them as unsigned values */ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: /* PBLENDW_*_MASK: dword masks for the pand/por lane merges; the set lanes match the 16-bit word selection of the pblendw immediate in each name (0x33, 0xCC, 0x3F, 0xC0) */ + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/third-party/blake3/c/blake3_sse2_x86-64_windows_msvc.asm b/third-party/blake3/c/blake3_sse2_x86-64_windows_msvc.asm new file mode 100644 index 00000000..507502f1 --- /dev/null +++ b/third-party/blake3/c/blake3_sse2_x86-64_windows_msvc.asm @@ -0,0 +1,2350 @@ +public _blake3_hash_many_sse2 +public 
blake3_hash_many_sse2 +public blake3_compress_in_place_sse2 +public _blake3_compress_in_place_sse2 +public blake3_compress_xof_sse2 +public _blake3_compress_xof_sse2 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse2 PROC +_blake3_hash_many_sse2 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr 
[rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + 
movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld 
xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, 
xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor 
xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, 
xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + 
pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, 
xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, 
xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + 
movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, 
xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + 
pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, 
xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor 
xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor 
xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr 
[rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0B1H + pshufhw xmm15, xmm15, 0B1H + pshuflw xmm12, xmm12, 0B1H + pshufhw xmm12, xmm12, 0B1H + pshuflw xmm13, xmm13, 0B1H + pshufhw xmm13, xmm13, 0B1H + pshuflw xmm14, xmm14, 0B1H + pshufhw xmm14, xmm14, 0B1H + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + 
por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + 
movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+114H] + movd xmm13, dword ptr [rsp+124H] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps 
xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + shl rax, 20H + or rax, 40H + movd xmm3, rax + movdqa xmmword ptr [rsp+20H], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + punpcklqdq xmm3, xmmword ptr [rsp+20H] + punpcklqdq xmm11, xmmword ptr [rsp+20H] + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + 
paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + pshuflw xmm11, xmm11, 0B1H + pshufhw xmm11, xmm11, 0B1H + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm12, xmm13 + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr 
[PBLENDW_0x33_MASK] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+30H], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+30H] + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + mov eax, dword ptr [rsp+130H] + neg eax + mov r10d, dword ptr [rsp+110H+8*rax] + mov r11d, dword ptr [rsp+120H+8*rax] + mov dword ptr [rsp+110H], r10d + mov dword ptr [rsp+120H], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + movd xmm14, dword ptr [rsp+120H] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + shl rax, 32 + or rax, 64 + movd xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +roundloop1: + paddd xmm0, 
xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz endroundloop1 + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm10 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp roundloop1 +endroundloop1: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne innerloop1 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + jmp unwind +_blake3_hash_many_sse2 ENDP +blake3_hash_many_sse2 ENDP + +blake3_compress_in_place_sse2 PROC +_blake3_compress_in_place_sse2 PROC + sub rsp, 120 + movdqa 
xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + 
pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+10H], xmm1 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_in_place_sse2 ENDP +blake3_compress_in_place_sse2 ENDP + +ALIGN 16 +blake3_compress_xof_sse2 PROC +_blake3_compress_xof_sse2 PROC + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+10H], xmm7 + movdqa xmmword ptr [rsp+20H], xmm8 + movdqa xmmword ptr [rsp+30H], xmm9 + movdqa xmmword ptr [rsp+40H], xmm11 + movdqa xmmword ptr [rsp+50H], xmm14 + movdqa xmmword ptr [rsp+60H], xmm15 + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm2, xmmword ptr [BLAKE3_IV] + movzx eax, byte ptr [rsp+0A0H] + movzx r8d, r8b + mov r10, qword ptr [rsp+0A8H] + shl rax, 32 + add r8, rax + movd xmm3, r9 + movd xmm4, r8 + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+10H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rdx+20H] + movups xmm7, xmmword ptr [rdx+30H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + mov al, 7 +@@: + paddd xmm0, xmm4 + 
paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 39H + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0B1H + pshufhw xmm3, xmm3, 0B1H + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm2, xmm2, 93H + dec al + jz @F + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0FH + pshufd xmm4, xmm8, 39H + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm14, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] + pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK] + por xmm8, xmm14 + pshufd xmm8, xmm8, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp @B +@@: + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+10H] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+10H], xmm1 + movups xmmword ptr [r10+20H], xmm2 + movups xmmword ptr [r10+30H], xmm3 + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+10H] + 
movdqa xmm8, xmmword ptr [rsp+20H] + movdqa xmm9, xmmword ptr [rsp+30H] + movdqa xmm11, xmmword ptr [rsp+40H] + movdqa xmm14, xmmword ptr [rsp+50H] + movdqa xmm15, xmmword ptr [rsp+60H] + add rsp, 120 + ret +_blake3_compress_xof_sse2 ENDP +blake3_compress_xof_sse2 ENDP + +_TEXT ENDS + + +_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' +ALIGN 64 +BLAKE3_IV: + dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH + +ADD0: + dd 0, 1, 2, 3 + +ADD1: + dd 4 dup (4) + +BLAKE3_IV_0: + dd 4 dup (6A09E667H) + +BLAKE3_IV_1: + dd 4 dup (0BB67AE85H) + +BLAKE3_IV_2: + dd 4 dup (3C6EF372H) + +BLAKE3_IV_3: + dd 4 dup (0A54FF53AH) + +BLAKE3_BLOCK_LEN: + dd 4 dup (64) + +CMP_MSB_MASK: + dd 8 dup(80000000H) + +PBLENDW_0x33_MASK: + dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H +PBLENDW_0xCC_MASK: + dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH +PBLENDW_0x3F_MASK: + dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H +PBLENDW_0xC0_MASK: + dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH + +_RDATA ENDS +END diff --git a/third-party/blake3/c/blake3_sse41.c b/third-party/blake3/c/blake3_sse41.c new file mode 100644 index 00000000..4653a856 --- /dev/null +++ b/third-party/blake3/c/blake3_sse41.c @@ -0,0 +1,560 @@ +#include "blake3_impl.h" + +#include + +#define DEGREE 4 + +#define _mm_shuffle_ps2(a, b, c) \ + (_mm_castps_si128( \ + _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c)))) +/* Loads 16 bytes from src into a vector; src need not be 16-byte aligned. */ +INLINE __m128i loadu(const uint8_t src[16]) { + return _mm_loadu_si128((const __m128i *)src); +} +/* Stores the 16 bytes of src to dest; dest need not be 16-byte aligned. */ +INLINE void storeu(__m128i src, uint8_t dest[16]) { + _mm_storeu_si128((__m128i *)dest, src); +} +/* Adds a and b as four independent 32-bit lanes (wrapping on overflow). */ +INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); } + +// Note that clang-format doesn't like the name "xor" for some reason.
+INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); } +/* Broadcasts x into all four 32-bit lanes. */ +INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); } +/* Builds a vector with a in the lowest 32-bit lane and d in the highest. */ +INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d); +} +/* Rotates each 32-bit lane by 16 bits by swapping its 16-bit halves with a byte shuffle. */ +INLINE __m128i rot16(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); +} +/* Rotates each 32-bit lane right by 12 bits; the two shifted parts share no bits, so XOR combines them exactly like OR. */ +INLINE __m128i rot12(__m128i x) { + return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12)); +} +/* Rotates each 32-bit lane right by 8 bits with a byte shuffle. */ +INLINE __m128i rot8(__m128i x) { + return _mm_shuffle_epi8( + x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); +} +/* Rotates each 32-bit lane right by 7 bits, built from two shifts like rot12. */ +INLINE __m128i rot7(__m128i x) { + return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7)); +} +/* First half of the mixing step, applied to all four lanes at once: adds m and row1 into row0, XORs row0 into row3 and rotates it by 16, adds row3 into row2, then XORs row2 into row1 and rotates it by 12. */ +INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot16(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot12(*row1); +} +/* Second half of the mixing step: same sequence as g1 but with rotations by 8 (row3) and 7 (row1). */ +INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3, + __m128i m) { + *row0 = addv(addv(*row0, m), *row1); + *row3 = xorv(*row3, *row0); + *row3 = rot8(*row3); + *row2 = addv(*row2, *row3); + *row1 = xorv(*row1, *row2); + *row1 = rot7(*row1); +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); +} +/* Undoes diagonalize(): applies the inverse lane rotation to row0 and row2 and the same half-swap to row3. */ +INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); +} +/* Fills rows[0..3] from cv, IV and (counter, block_len, flags), then runs seven rounds over the 64-byte block. The final XOR of the row pairs is left to the callers. */ +INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + rows[0] = loadu((uint8_t *)&cv[0]); + rows[1] = loadu((uint8_t *)&cv[4]); + rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); + rows[3] = set4(counter_low(counter), counter_high(counter), + (uint32_t)block_len, (uint32_t)flags); + /* The block is read as four consecutive 16-byte message vectors. */ + __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); + __m128i m1 = loadu(&block[sizeof(__m128i) * 1]); + __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); + __m128i m3 = loadu(&block[sizeof(__m128i) * 3]); + + __m128i t0, t1, t2, t3, tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel.
+ t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, 
_MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + undiagonalize(&rows[0], &rows[2], &rows[3]); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); + t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); + diagonalize(&rows[0], &rows[2], &rows[3]); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); + g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); + g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); + 
undiagonalize(&rows[0], &rows[2], &rows[3]); +} +/* Compresses one block and overwrites cv with the result: cv[0..3] receives rows[0] ^ rows[2] and cv[4..7] receives rows[1] ^ rows[3]. */ +void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); + storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]); +} +/* Compresses one block into the 64-byte buffer out without modifying cv: bytes 0..31 hold the same values the in-place variant produces, bytes 32..63 hold rows[2] and rows[3] XORed with the input cv. */ +void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + __m128i rows[4]; + compress_pre(rows, cv, block, block_len, counter, flags); + storeu(xorv(rows[0], rows[2]), &out[0]); + storeu(xorv(rows[1], rows[3]), &out[16]); + storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); + storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]); +} +/* One round over the 16-vector state v, taking message vectors from m in the order given by MSG_SCHEDULE[r]. The first half mixes the groups (0,4,8,12) through (3,7,11,15); the second half mixes the shifted groups starting with (0,5,10,15). */ +INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = addv(v[0], v[4]); + v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = addv(v[0], v[4]); +
v[1] = addv(v[1], v[5]); + v[2] = addv(v[2], v[6]); + v[3] = addv(v[3], v[7]); + v[12] = xorv(v[12], v[0]); + v[13] = xorv(v[13], v[1]); + v[14] = xorv(v[14], v[2]); + v[15] = xorv(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = addv(v[8], v[12]); + v[9] = addv(v[9], v[13]); + v[10] = addv(v[10], v[14]); + v[11] = addv(v[11], v[15]); + v[4] = xorv(v[4], v[8]); + v[5] = xorv(v[5], v[9]); + v[6] = xorv(v[6], v[10]); + v[7] = xorv(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = addv(v[10], v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = addv(v[0], v[5]); + v[1] = addv(v[1], v[6]); + v[2] = addv(v[2], v[7]); + v[3] = addv(v[3], v[4]); + v[15] = xorv(v[15], v[0]); + v[12] = xorv(v[12], v[1]); + v[13] = xorv(v[13], v[2]); + v[14] = xorv(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = addv(v[10], 
v[15]); + v[11] = addv(v[11], v[12]); + v[8] = addv(v[8], v[13]); + v[9] = addv(v[9], v[14]); + v[5] = xorv(v[5], v[10]); + v[6] = xorv(v[6], v[11]); + v[7] = xorv(v[7], v[8]); + v[4] = xorv(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} +/* Transposes, in place, the 4x4 matrix of 32-bit lanes formed by vecs[0..3]. */ +INLINE void transpose_vecs(__m128i vecs[DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} +/* Loads the 64 bytes at block_offset from each of inputs[0..3] and transposes them in groups of four, so that out[i] ends up holding 32-bit word i of all four inputs. Also issues a prefetch 256 bytes past block_offset for each input. */ +INLINE void transpose_msg_vecs(const uint8_t *const *inputs, + size_t block_offset, __m128i out[16]) { + out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); + out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); + out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); + out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); + out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); + out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]); + out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); + out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]); + out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); + out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); + out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); + out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); + out[12] =
loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); + out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]); + out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); + out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]); + for (size_t i = 0; i < 4; ++i) { + _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0); + } + transpose_vecs(&out[0]); + transpose_vecs(&out[4]); + transpose_vecs(&out[8]); + transpose_vecs(&out[12]); +} +/* Produces the low and high 32-bit halves of four per-lane counters. Lane i counts from counter with i added when increment_counter is true; otherwise every lane uses counter unchanged. A wrap of the low half is detected by comparing with the top bit flipped (an unsigned compare done with the signed instruction) and is carried into the high half. */ +INLINE void load_counters(uint64_t counter, bool increment_counter, + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry); + *out_lo = l; + *out_hi = h; +} +/* Hashes four inputs side by side, one input per 32-bit lane, over the given number of blocks each, starting every lane from key. Writes the four resulting 8-word chaining values to out one after another. */ +static +void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + __m128i h_vecs[8] = { + set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), + set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), + }; + __m128i counter_low_vec, counter_high_vec; + load_counters(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + /* flags_end is ORed in for the last block only. */ + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN); + __m128i block_flags_vec = set1(block_flags); + __m128i msg_vecs[16]; + transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + __m128i v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4],
h_vecs[5], h_vecs[6], h_vecs[7], + set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn(v, msg_vecs, 0); + round_fn(v, msg_vecs, 1); + round_fn(v, msg_vecs, 2); + round_fn(v, msg_vecs, 3); + round_fn(v, msg_vecs, 4); + round_fn(v, msg_vecs, 5); + round_fn(v, msg_vecs, 6); + h_vecs[0] = xorv(v[0], v[8]); + h_vecs[1] = xorv(v[1], v[9]); + h_vecs[2] = xorv(v[2], v[10]); + h_vecs[3] = xorv(v[3], v[11]); + h_vecs[4] = xorv(v[4], v[12]); + h_vecs[5] = xorv(v[5], v[13]); + h_vecs[6] = xorv(v[6], v[14]); + h_vecs[7] = xorv(v[7], v[15]); + /* Drop flags_start (and flags_end) again so they only ever apply to a single block. */ + block_flags = flags; + } + + transpose_vecs(&h_vecs[0]); + transpose_vecs(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); + storeu(h_vecs[4], &out[1 * sizeof(__m128i)]); + storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); + storeu(h_vecs[5], &out[3 * sizeof(__m128i)]); + storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); + storeu(h_vecs[6], &out[5 * sizeof(__m128i)]); + storeu(h_vecs[3], &out[6 * sizeof(__m128i)]); + storeu(h_vecs[7], &out[7 * sizeof(__m128i)]); +} +/* Hashes a single input serially: copies key into a local cv, compresses the given number of consecutive blocks into it (flags_start on the first, flags_end on the last), then copies cv to out. */ +INLINE void hash_one_sse41(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} +/* Hashes num_inputs equally sized inputs: groups of DEGREE go through blake3_hash4_sse41 and any remainder through hash_one_sse41. When increment_counter is set, counter advances by the number of inputs consumed; out advances by BLAKE3_OUT_LEN per input. */ +void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags,
uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= DEGREE) { + blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += DEGREE; + } + inputs += DEGREE; + num_inputs -= DEGREE; + out = &out[DEGREE * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/third-party/blake3/c/blake3_sse41_x86-64_unix.S b/third-party/blake3/c/blake3_sse41_x86-64_unix.S new file mode 100644 index 00000000..a3ff6426 --- /dev/null +++ b/third-party/blake3/c/blake3_sse41_x86-64_unix.S @@ -0,0 +1,2028 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + 
pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 
+ movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr 
[ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 
+ paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + 
pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 
12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, 
xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, 
xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd 
xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + 
pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb 
xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr 
[rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, 
xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa 
xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, 
xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, 
xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, 
byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 
+ por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + 
punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, 
xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: /* Compresses one 64-byte block: reads a 32-byte state from [rdi] and a 64-byte block from [rsi], then overwrites [rdi] with the 32-byte result. Presumably called with System V arguments (cv, block, block_len, counter, flags) - confirm against the C prototype. */ +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm0/xmm1 = input state, xmm2 = the four BLAKE3_IV words */ + shl r8, 32 /* NOTE(review): unlike blake3_compress_xof_sse41, rdx and r8 are not zero-extended from 8 bits before being combined; this looks like it relies on the caller passing clean upper bits - confirm */ + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = { rcx (low 64 bits), rdx + (r8 << 32) (high 64 bits) } */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 /* xmm4 = block dwords 0,2,4,6; xmm5 = block dwords 1,3,5,7 */ + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm6 = block dwords 8,10,12,14 and xmm7 = block dwords 9,11,13,15, each with its lanes rotated by pshufd 0x93 */ + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] /* keep both pshufb rotation masks in registers for the loop */
+ mov al, 7 /* al counts the passes through the loop at label 9: seven in total */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* pslld 20 / psrld 12 / por: rotate each 32-bit lane of xmm1 right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* pslld 25 / psrld 7 / por: rotate each 32-bit lane of xmm1 right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* rotate the lanes of xmm0, xmm3 and xmm2 before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse shuffles: put the lanes of xmm0, xmm3 and xmm2 back in their original positions */ + dec al + jz 9f /* after the seventh pass, skip the message-word reshuffle below */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4..xmm7 now hold the message dwords reordered for the next pass */ + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 /* result = (xmm0 ^ xmm2, xmm1 ^ xmm3) */ + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 /* overwrite the 32-byte input state with the result */ + ret + +.p2align 6 +blake3_compress_xof_sse41: /* Runs the same seven-pass loop as blake3_compress_in_place_sse41 on the state at [rdi] and the block at [rsi], but stores a 64-byte result to [r9] and never writes to [rdi]. Presumably called with System V arguments (cv, block, block_len, counter, flags, out) - confirm against the C prototype. */ +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax /* rdx = dl | (r8b << 32), both zero-extended from 8 bits */ + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 /* xmm3 = { rcx (low 64 bits), rdx (high 64 bits) } */ + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 /* xmm4 = block dwords 0,2,4,6; xmm5 = block dwords 1,3,5,7 */ + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] +
movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* xmm6 = block dwords 8,10,12,14 and xmm7 = block dwords 9,11,13,15, each with its lanes rotated by pshufd 0x93 */ + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 /* al counts the passes through the loop at label 9: seven in total */ +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* rotate each 32-bit lane of xmm1 right by 12 */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* rotate each 32-bit lane of xmm1 right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* rotate the lanes of xmm0, xmm3 and xmm2 before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse shuffles: put the lanes of xmm0, xmm3 and xmm2 back in their original positions */ + dec al + jz 9f /* after the seventh pass, skip the message-word reshuffle below */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 /* xmm4..xmm7 now hold the message dwords reordered for the next pass */ + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] /* reload the original 32-byte input state */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 /* first 32 output bytes = (xmm0 ^ xmm2, xmm1 ^ xmm3); last 32 = (xmm2, xmm3) XOR the input state */ + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: /* four 32-bit words, loaded as one 16-byte vector into xmm2 by the compress routines */ + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: /* pshufb mask: swaps the two 16-bit halves of every 32-bit lane, i.e. rotates each lane by 16 bits */ + .byte 2, 3,
0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: /* pshufb mask: rotates every 32-bit lane right by 8 bits */ + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: /* per-lane values 0, 1, 2, 3 */ + .long 0, 1, 2, 3 +ADD1: /* the value 4 in every lane */ + .long 4, 4, 4, 4 +BLAKE3_IV_0: /* BLAKE3_IV_0..BLAKE3_IV_3: each word of BLAKE3_IV repeated in all four lanes */ + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: /* the value 64 in every lane; blake3_hash_many_sse41 inserts one dword of it with pinsrd */ + .long 64, 64, 64, 64 +CMP_MSB_MASK: /* sign bit of every lane; blake3_hash_many_sse41 XORs it into both operands before pcmpgtd */ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/third-party/blake3/c/blake3_sse41_x86-64_windows_gnu.S b/third-party/blake3/c/blake3_sse41_x86-64_windows_gnu.S new file mode 100644 index 00000000..02083f9d --- /dev/null +++ b/third-party/blake3/c/blake3_sse41_x86-64_windows_gnu.S @@ -0,0 +1,2069 @@ +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +.section .text + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0xFFFFFFFFFFFFFFC0 + movdqa xmmword ptr [rsp+0x170], xmm6 + movdqa xmmword ptr [rsp+0x180], xmm7 + movdqa xmmword ptr [rsp+0x190], xmm8 + movdqa xmmword ptr [rsp+0x1A0], xmm9 + movdqa xmmword ptr [rsp+0x1B0], xmm10 + movdqa xmmword ptr [rsp+0x1C0], xmm11 + movdqa xmmword ptr [rsp+0x1D0], xmm12 + movdqa xmmword ptr [rsp+0x1E0], xmm13 + movdqa xmmword ptr [rsp+0x1F0], xmm14 + movdqa xmmword ptr [rsp+0x200], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+0x68] + movzx r9, byte ptr [rbp+0x70] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa
xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x90] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x78] + movzx r12d, byte ptr [rbp+0x88] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, 
xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr 
[rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd 
xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr 
[ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd 
xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, 
xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + 
pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + 
psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + 
paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, 
xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor 
xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb 
xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr 
[rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + 
movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 
25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jne 3f +4: + movdqa xmm6, xmmword ptr [rsp+0x170] + movdqa xmm7, xmmword ptr [rsp+0x180] + movdqa xmm8, xmmword ptr [rsp+0x190] + movdqa xmm9, xmmword ptr [rsp+0x1A0] + movdqa xmm10, xmmword ptr [rsp+0x1B0] + movdqa xmm11, xmmword ptr [rsp+0x1C0] + movdqa xmm12, xmmword ptr [rsp+0x1D0] + movdqa xmm13, xmmword ptr [rsp+0x1E0] + movdqa xmm14, xmmword ptr [rsp+0x1F0] + movdqa xmm15, xmmword ptr 
[rsp+0x200] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] 
+ pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + 
movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x80] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword 
ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: 
+_blake3_compress_in_place_sse41: /* Second entry label for the code shared with blake3_compress_in_place_sse41. Reads 32 bytes at rcx and 64 bytes at rdx, mixes in r8b, r9 and one stack byte, runs the 7-pass loop at 9: and writes 32 bytes back to rcx. */ + sub rsp, 120 /* 120-byte frame; its low 0x70 bytes hold the XMM spills below */ + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 /* xmm6-9, xmm11, xmm14, xmm15 are spilled here and reloaded before ret; NOTE(review): presumably because the target ABI treats them as callee-saved - confirm */ + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] /* xmm0:xmm1 = the 32 bytes at rcx (unaligned loads) */ + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = the four BLAKE3_IV dwords */ + movzx eax, byte ptr [rsp+0xA0] /* one byte from 0xA0 above the lowered rsp, i.e. 40 bytes above rsp at entry; presumably a stack-passed flags argument - TODO confirm against the C prototype */ + movzx r8d, r8b + shl rax, 32 + add r8, rax /* r8 = zero-extended r8b in the low dword, the stack byte in the high dword */ + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 /* xmm3 = r9 in the low qword, r8 in the high qword */ + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 /* xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords of the first 32 bytes at rdx */ + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* same even/odd split of the second 32 bytes; pshufd 0x93 then rotates the dword lanes of each result by one position */ + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] /* pshufb masks kept in registers for the loop: xmm14 = ROT8, xmm15 = ROT16 */ + mov al, 7 /* pass counter: the loop body at 9: runs 7 times */ +9: /* loop head: xmm0-xmm3 are updated in place, xmm4-xmm7 are added in turn */ + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 /* ROT16 mask: rotates every dword by 16 bits */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* xmm1 = each dword rotated right by 12; xmm11 is scratch */ + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 /* ROT8 mask: rotates every dword right by 8 bits */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* xmm1 = each dword rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* rotate the dword lanes of xmm0, xmm3, xmm2 by 1, 2 and 3 positions before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotations: undo the three pshufd of the first half */ + dec al + jz 9f /* leave after the 7th pass; otherwise the shuffles and blends that follow rearrange the dwords held in xmm4-xmm7 for the next pass (xmm8, xmm9 are scratch) */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, 
xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: /* loop exit */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 /* xmm0 ^= xmm2, xmm1 ^= xmm3 */ + movups xmmword ptr [rcx], xmm0 + movups xmmword ptr [rcx+0x10], xmm1 /* overwrite the 32 bytes at rcx with the result */ + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] /* reload the seven spilled XMM registers */ + add rsp, 120 /* release the frame */ + ret + + +.p2align 6 +_blake3_compress_xof_sse41: +blake3_compress_xof_sse41: /* Same prologue and 7-pass loop as the in-place routine, but the result is 64 bytes stored through a pointer loaded from the stack; the 32 bytes at rcx are only read here. */ + sub rsp, 120 + movdqa xmmword ptr [rsp], xmm6 + movdqa xmmword ptr [rsp+0x10], xmm7 + movdqa xmmword ptr [rsp+0x20], xmm8 + movdqa xmmword ptr [rsp+0x30], xmm9 + movdqa xmmword ptr [rsp+0x40], xmm11 + movdqa xmmword ptr [rsp+0x50], xmm14 + movdqa xmmword ptr [rsp+0x60], xmm15 /* spill the XMM registers that are reloaded before ret */ + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] /* xmm0:xmm1 = the 32 bytes at rcx */ + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, byte ptr [rsp+0xA0] /* stack byte 40 bytes above rsp at entry; presumably the flags argument - TODO confirm */ + movzx r8d, r8b + mov r10, qword ptr [rsp+0xA8] /* r10 = qword 0xA8 above the lowered rsp (48 bytes above rsp at entry); used as the output pointer at the end - presumably the sixth argument, TODO confirm */ + shl rax, 32 + add r8, rax /* r8 = zero-extended r8b in the low dword, the stack byte in the high dword */ + movq xmm3, r9 + movq xmm4, r8 + punpcklqdq xmm3, xmm4 /* xmm3 = r9 in the low qword, r8 in the high qword */ + movups xmm4, xmmword ptr [rdx] + movups xmm5, xmmword ptr [rdx+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 /* xmm4 = even-indexed dwords, xmm5 = odd-indexed dwords of the first 32 bytes at rdx */ + movups xmm6, xmmword ptr [rdx+0x20] + movups xmm7, xmmword ptr [rdx+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 /* even/odd split of the second 32 bytes, each result lane-rotated by one position */ + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 /* pass counter: the loop body at 9: runs 7 times */ +9: /* loop head: xmm0-xmm3 are updated in place, xmm4-xmm7 are added in turn */ + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 /* ROT16 mask: rotates every dword by 16 bits */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 /* xmm1 = each dword rotated right by 12 */ + 
paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 /* ROT8 mask: rotates every dword right by 8 bits */ + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 /* xmm1 = each dword rotated right by 7 */ + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 /* rotate the dword lanes of xmm0, xmm3, xmm2 by 1, 2 and 3 positions before the second half of the pass */ + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 /* inverse lane rotations: undo the three pshufd of the first half */ + dec al + jz 9f /* leave after the 7th pass; otherwise the shuffles and blends that follow rearrange the dwords held in xmm4-xmm7 for the next pass (xmm8, xmm9 are scratch) */ + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: /* loop exit */ + movdqu xmm4, xmmword ptr [rcx] + movdqu xmm5, xmmword ptr [rcx+0x10] /* reload the original 32 bytes at rcx into xmm4:xmm5 */ + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 /* xmm0 ^= xmm2 and xmm1 ^= xmm3 use the pre-update xmm2, xmm3; then xmm2 and xmm3 are xored with the original bytes from rcx */ + movups xmmword ptr [r10], xmm0 + movups xmmword ptr [r10+0x10], xmm1 + movups xmmword ptr [r10+0x20], xmm2 + movups xmmword ptr [r10+0x30], xmm3 /* store the 64-byte result at r10 */ + movdqa xmm6, xmmword ptr [rsp] + movdqa xmm7, xmmword ptr [rsp+0x10] + movdqa xmm8, xmmword ptr [rsp+0x20] + movdqa xmm9, xmmword ptr [rsp+0x30] + movdqa xmm11, xmmword ptr [rsp+0x40] + movdqa xmm14, xmmword ptr [rsp+0x50] + movdqa xmm15, xmmword ptr [rsp+0x60] /* reload the seven spilled XMM registers */ + add rsp, 120 /* release the frame */ + ret + + +.section .rdata +.p2align 6 +BLAKE3_IV: /* four dwords; loaded into xmm2 by both compress routines */ + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: /* pshufb mask: swaps the 16-bit halves of each dword */ + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: /* pshufb mask: rotates each dword right by 8 bits */ + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 
+BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/third-party/blake3/c/blake3_sse41_x86-64_windows_msvc.asm b/third-party/blake3/c/blake3_sse41_x86-64_windows_msvc.asm new file mode 100644 index 00000000..8966c7b8 --- /dev/null +++ b/third-party/blake3/c/blake3_sse41_x86-64_windows_msvc.asm @@ -0,0 +1,2089 @@ +public _blake3_hash_many_sse41 +public blake3_hash_many_sse41 +public blake3_compress_in_place_sse41 +public _blake3_compress_in_place_sse41 +public blake3_compress_xof_sse41 +public _blake3_compress_xof_sse41 + +_TEXT SEGMENT ALIGN(16) 'CODE' + +ALIGN 16 +blake3_hash_many_sse41 PROC +_blake3_hash_many_sse41 PROC + push r15 + push r14 + push r13 + push r12 + push rsi + push rdi + push rbx + push rbp + mov rbp, rsp + sub rsp, 528 + and rsp, 0FFFFFFFFFFFFFFC0H + movdqa xmmword ptr [rsp+170H], xmm6 + movdqa xmmword ptr [rsp+180H], xmm7 + movdqa xmmword ptr [rsp+190H], xmm8 + movdqa xmmword ptr [rsp+1A0H], xmm9 + movdqa xmmword ptr [rsp+1B0H], xmm10 + movdqa xmmword ptr [rsp+1C0H], xmm11 + movdqa xmmword ptr [rsp+1D0H], xmm12 + movdqa xmmword ptr [rsp+1E0H], xmm13 + movdqa xmmword ptr [rsp+1F0H], xmm14 + movdqa xmmword ptr [rsp+200H], xmm15 + mov rdi, rcx + mov rsi, rdx + mov rdx, r8 + mov rcx, r9 + mov r8, qword ptr [rbp+68H] + movzx r9, byte ptr [rbp+70H] + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 00H + movdqa xmmword ptr [rsp+130H], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0] + pand xmm0, xmmword ptr [ADD1] + movdqa xmmword ptr [rsp+150H], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 00H + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+110H], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, 
xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 00H + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + mov rbx, qword ptr [rbp+90H] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+78H] + movzx r12d, byte ptr [rbp+88H] + cmp rsi, 4 + jc final3blocks +outerloop4: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 00H + pshufd xmm1, xmm3, 55H + pshufd xmm2, xmm3, 0AAH + pshufd xmm3, xmm3, 0FFH + movdqu xmm7, xmmword ptr [rcx+10H] + pshufd xmm4, xmm7, 00H + pshufd xmm5, xmm7, 55H + pshufd xmm6, xmm7, 0AAH + pshufd xmm7, xmm7, 0FFH + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + mov r10, qword ptr [rdi+10H] + mov r11, qword ptr [rdi+18H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop4: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-40H] + movdqu xmm9, xmmword ptr [r9+rdx-40H] + movdqu xmm10, xmmword ptr [r10+rdx-40H] + movdqu xmm11, xmmword ptr [r11+rdx-40H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+10H], xmm9 + movdqa xmmword ptr [rsp+20H], xmm12 + movdqa xmmword ptr [rsp+30H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-30H] + movdqu xmm9, xmmword ptr [r9+rdx-30H] + movdqu xmm10, xmmword ptr [r10+rdx-30H] + movdqu xmm11, xmmword ptr [r11+rdx-30H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+40H], xmm8 + movdqa xmmword ptr [rsp+50H], xmm9 + movdqa 
xmmword ptr [rsp+60H], xmm12 + movdqa xmmword ptr [rsp+70H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-20H] + movdqu xmm9, xmmword ptr [r9+rdx-20H] + movdqu xmm10, xmmword ptr [r10+rdx-20H] + movdqu xmm11, xmmword ptr [r11+rdx-20H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+80H], xmm8 + movdqa xmmword ptr [rsp+90H], xmm9 + movdqa xmmword ptr [rsp+0A0H], xmm12 + movdqa xmmword ptr [rsp+0B0H], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-10H] + movdqu xmm9, xmmword ptr [r9+rdx-10H] + movdqu xmm10, xmmword ptr [r10+rdx-10H] + movdqu xmm11, xmmword ptr [r11+rdx-10H] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0C0H], xmm8 + movdqa xmmword ptr [rsp+0D0H], xmm9 + movdqa xmmword ptr [rsp+0E0H], xmm12 + movdqa xmmword ptr [rsp+0F0H], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3] + movdqa xmm12, xmmword ptr [rsp+110H] + movdqa xmm13, xmmword ptr [rsp+120H] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN] + movd xmm15, eax + pshufd xmm15, xmm15, 00H + prefetcht0 byte ptr [r8+rdx+80H] + prefetcht0 byte ptr [r9+rdx+80H] + prefetcht0 byte ptr [r10+rdx+80H] + prefetcht0 byte ptr [r11+rdx+80H] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + 
movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+80H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, 
xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+70H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, 
xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+10H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por 
xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+0D0H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + 
por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+60H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0B0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+50H] + paddd xmm1, 
xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+0E0H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+30H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd 
xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+40H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+50H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + 
pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0C0H] + paddd xmm1, xmmword ptr [rsp+90H] + paddd xmm2, xmmword ptr [rsp+0F0H] + paddd xmm3, xmmword ptr [rsp+0E0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+0A0H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, 
xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+70H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+20H] + paddd xmm1, xmmword ptr [rsp+30H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + 
movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+90H] + paddd xmm1, xmmword ptr [rsp+0B0H] + paddd xmm2, xmmword ptr [rsp+80H] + paddd xmm3, xmmword ptr [rsp+0F0H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+0C0H] + paddd xmm3, xmmword ptr [rsp+10H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld 
xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0D0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+20H] + paddd xmm3, xmmword ptr [rsp+40H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+30H] + paddd xmm1, xmmword ptr [rsp+0A0H] + paddd xmm2, xmmword ptr [rsp+60H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld 
xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0B0H] + paddd xmm1, xmmword ptr [rsp+50H] + paddd xmm2, xmmword ptr [rsp+10H] + paddd xmm3, xmmword ptr [rsp+80H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0F0H] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+90H] + paddd xmm3, xmmword ptr [rsp+60H] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0E0H] + paddd xmm1, xmmword ptr [rsp+20H] + paddd xmm2, xmmword ptr 
[rsp+30H] + paddd xmm3, xmmword ptr [rsp+70H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+100H], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0A0H] + paddd xmm1, xmmword ptr [rsp+0C0H] + paddd xmm2, xmmword ptr [rsp+40H] + paddd xmm3, xmmword ptr [rsp+0D0H] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+100H] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne innerloop4 + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + 
punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+20H], xmm1 + movdqu xmmword ptr [rbx+40H], xmm9 + movdqu xmmword ptr [rbx+60H], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+10H], xmm4 + movdqu xmmword ptr [rbx+30H], xmm5 + movdqu xmmword ptr [rbx+50H], xmm9 + movdqu xmmword ptr [rbx+70H], xmm7 + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+150H] + movdqa xmmword ptr [rsp+110H], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK] + pxor xmm1, xmmword ptr [CMP_MSB_MASK] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+120H] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+120H], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc outerloop4 + test rsi, rsi + jne final3blocks +unwind: + movdqa xmm6, xmmword ptr [rsp+170H] + movdqa xmm7, xmmword ptr [rsp+180H] + movdqa xmm8, xmmword ptr [rsp+190H] + movdqa xmm9, xmmword ptr [rsp+1A0H] + movdqa xmm10, xmmword ptr [rsp+1B0H] + movdqa xmm11, xmmword ptr [rsp+1C0H] + movdqa xmm12, xmmword ptr [rsp+1D0H] + movdqa xmm13, xmmword ptr [rsp+1E0H] + movdqa xmm14, xmmword ptr [rsp+1F0H] + movdqa xmm15, xmmword ptr [rsp+200H] + mov rsp, rbp + pop rbp + pop rbx + pop rdi + pop rsi + pop r12 + pop r13 + pop r14 + pop r15 + ret +ALIGN 16 +final3blocks: + test esi, 2H + je final1block + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp], 
xmm13 + movd xmm14, dword ptr [rsp+114H] + pinsrd xmm14, dword ptr [rsp+124H], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmmword ptr [rsp+10H], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+8H] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 93H + movups xmm12, xmmword ptr [r9+rdx-40H] + movups xmm13, xmmword ptr [r9+rdx-30H] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-20H] + movups xmm15, xmmword ptr [r9+rdx-10H] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 93H + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 93H + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+10H] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +roundloop2: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+20H], xmm4 + movaps xmmword ptr [rsp+30H], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+40H], xmm5 + movaps xmmword ptr [rsp+50H], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + 
pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 93H + pshufd xmm8, xmm8, 93H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 39H + pshufd xmm10, xmm10, 39H + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 39H + pshufd xmm8, xmm8, 39H + pshufd xmm3, xmm3, 4EH + pshufd xmm11, xmm11, 4EH + pshufd xmm2, xmm2, 93H + pshufd xmm10, xmm10, 93H + dec al + je endroundloop2 + movdqa xmm12, xmmword ptr [rsp+20H] + movdqa xmm5, xmmword ptr [rsp+40H] + pshufd xmm13, xmm12, 0FH + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 39H + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0CCH + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0C0H + pshufd xmm12, xmm12, 78H + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 1EH + movdqa xmmword ptr [rsp+20H], xmm13 + movdqa xmmword ptr [rsp+40H], xmm12 + movdqa xmm5, xmmword ptr [rsp+30H] + movdqa xmm13, xmmword ptr [rsp+50H] + pshufd xmm6, xmm5, 0FH + shufps xmm5, xmm13, 
214 + pshufd xmm12, xmm5, 39H + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0CCH + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0C0H + pshufd xmm5, xmm5, 78H + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 1EH + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+20H] + movdqa xmm6, xmmword ptr [rsp+40H] + jmp roundloop2 +endroundloop2: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne innerloop2 + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+10H], xmm1 + movups xmmword ptr [rbx+20H], xmm8 + movups xmmword ptr [rbx+30H], xmm9 + movdqa xmm0, xmmword ptr [rsp+130H] + movdqa xmm1, xmmword ptr [rsp+110H] + movdqa xmm2, xmmword ptr [rsp+120H] + movdqu xmm3, xmmword ptr [rsp+118H] + movdqu xmm4, xmmword ptr [rsp+128H] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+110H], xmm1 + movdqa xmmword ptr [rsp+120H], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +final1block: + test esi, 1H + je unwind + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+10H] + movd xmm13, dword ptr [rsp+110H] + pinsrd xmm13, dword ptr [rsp+120H], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2 + movaps xmm14, xmmword ptr [ROT8] + movaps xmm15, xmmword ptr [ROT16] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+80H] + or eax, r13d + xor edx, edx +innerloop1: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-40H] + movups xmm5, xmmword ptr [r8+rdx-30H] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-20H] + movups xmm7, xmmword ptr [r8+rdx-10H] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 93H + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 93H + 
mov al, 7
+roundloop1:
+        paddd xmm0, xmm4
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm5
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 93H
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 39H
+        paddd xmm0, xmm6
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm7
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 39H
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 93H
+        dec al
+        jz endroundloop1
+        movdqa xmm8, xmm4
+        shufps xmm8, xmm5, 214
+        pshufd xmm9, xmm4, 0FH
+        pshufd xmm4, xmm8, 39H
+        movdqa xmm8, xmm6
+        shufps xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0CCH
+        movdqa xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0C0H
+        pshufd xmm8, xmm8, 78H
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd xmm7, xmm6, 1EH
+        movdqa xmm5, xmm9
+        movdqa xmm6, xmm8
+        jmp roundloop1
+endroundloop1:
+        pxor xmm0, xmm2
+        pxor xmm1, xmm3
+        mov eax, r13d
+        cmp rdx, r15
+        jne innerloop1
+        movups xmmword ptr [rbx], xmm0
+        movups xmmword ptr [rbx+10H], xmm1
+        jmp unwind
+_blake3_hash_many_sse41 ENDP
+blake3_hash_many_sse41 ENDP
+
+blake3_compress_in_place_sse41 PROC ; compress one 64-byte block at [rdx] into the 32-byte state at [rcx], overwriting it
+_blake3_compress_in_place_sse41 PROC ; same entry point under an underscore-prefixed name
+        sub rsp, 120 ; 7 x 16 bytes of XMM save area + 8; presumably the extra 8 restores 16-byte alignment for movdqa
+        movdqa xmmword ptr [rsp], xmm6 ; save every XMM register above xmm5 that this routine overwrites
+        movdqa xmmword ptr [rsp+10H], xmm7
+        movdqa xmmword ptr [rsp+20H], xmm8
+        movdqa xmmword ptr [rsp+30H], xmm9
+        movdqa xmmword ptr [rsp+40H], xmm11
+        movdqa xmmword ptr [rsp+50H], xmm14
+        movdqa xmmword ptr [rsp+60H], xmm15
+        movups xmm0, xmmword ptr [rcx] ; state row 0: first 16 bytes at rcx
+        movups xmm1, xmmword ptr [rcx+10H] ; state row 1: next 16 bytes at rcx
+        movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2: the IV constants
+        movzx eax, byte ptr [rsp+0A0H] ; byte-sized stack argument (120 + 40 above rsp; presumably the flags parameter - confirm against the C prototype)
+        movzx r8d, r8b ; only the low byte of r8 is used
+        shl rax, 32
+        add r8, rax ; r8 = low byte of r8 | stack byte << 32
+        movd xmm3, r9
+        movd xmm4, r8
+        punpcklqdq xmm3, xmm4 ; state row 3: low qword = r9, high qword = packed r8
+        movups xmm4, xmmword ptr [rdx] ; load the 64-byte block (unaligned)
+        movups xmm5, xmmword ptr [rdx+10H]
+        movaps xmm8, xmm4
+        shufps xmm4, xmm5, 136 ; xmm4 = even-indexed dwords of the first 32 bytes
+        shufps xmm8, xmm5, 221 ; xmm8 = odd-indexed dwords of the first 32 bytes
+        movaps xmm5, xmm8
+        movups xmm6, xmmword ptr [rdx+20H]
+        movups xmm7, xmmword ptr [rdx+30H]
+        movaps xmm8, xmm6
+        shufps xmm6, xmm7, 136 ; even-indexed dwords of the last 32 bytes ...
+        pshufd xmm6, xmm6, 93H ; ... with lanes rotated for the diagonal step
+        shufps xmm8, xmm7, 221 ; odd-indexed dwords of the last 32 bytes ...
+        pshufd xmm7, xmm8, 93H ; ... with lanes rotated for the diagonal step
+        movaps xmm14, xmmword ptr [ROT8] ; pshufb mask: rotate each dword by 8 bits
+        movaps xmm15, xmmword ptr [ROT16] ; pshufb mask: rotate each dword by 16 bits
+        mov al, 7 ; round counter: 7 rounds
+@@: ; top of the round loop
+        paddd xmm0, xmm4 ; column step, first half: add message words and row 1
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15 ; rotate by 16
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1 ; xmm11 is the scratch register for shift-based rotates
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11 ; row 1 rotated right by 12
+        paddd xmm0, xmm5 ; column step, second half
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14 ; rotate by 8
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11 ; row 1 rotated right by 7
+        pshufd xmm0, xmm0, 93H ; rotate rows 0, 3 and 2 across lanes so the next step mixes diagonals
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 39H
+        paddd xmm0, xmm6 ; diagonal step, first half
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm7 ; diagonal step, second half
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 39H ; undo the lane rotation
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 93H
+        dec al
+        jz @F ; all 7 rounds done: skip the message permutation
+        movdqa xmm8, xmm4 ; permute the message registers xmm4-xmm7 for the next round (xmm8/xmm9 are scratch)
+        shufps xmm8, xmm5, 214
+        pshufd xmm9, xmm4, 0FH
+        pshufd xmm4, xmm8, 39H
+        movdqa xmm8, xmm6
+        shufps xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0CCH
+        movdqa xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0C0H
+        pshufd xmm8, xmm8, 78H
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd xmm7, xmm6, 1EH
+        movdqa xmm5, xmm9
+        movdqa xmm6, xmm8
+        jmp @B
+@@: ; rounds finished
+        pxor xmm0, xmm2 ; fold rows 2/3 into rows 0/1
+        pxor xmm1, xmm3
+        movups xmmword ptr [rcx], xmm0 ; write the 32-byte result back over the input state
+        movups xmmword ptr [rcx+10H], xmm1
+        movdqa xmm6, xmmword ptr [rsp] ; restore the saved XMM registers
+        movdqa xmm7, xmmword ptr [rsp+10H]
+        movdqa xmm8, xmmword ptr [rsp+20H]
+        movdqa xmm9, xmmword ptr [rsp+30H]
+        movdqa xmm11, xmmword ptr [rsp+40H]
+        movdqa xmm14, xmmword ptr [rsp+50H]
+        movdqa xmm15, xmmword ptr [rsp+60H]
+        add rsp, 120
+        ret
+_blake3_compress_in_place_sse41 ENDP
+blake3_compress_in_place_sse41 ENDP
+
+ALIGN 16
+blake3_compress_xof_sse41 PROC ; same compression as above, but writes a 64-byte result to a separate output pointer and leaves [rcx] untouched
+_blake3_compress_xof_sse41 PROC ; same entry point under an underscore-prefixed name
+        sub rsp, 120 ; 7 x 16 bytes of XMM save area + 8
+        movdqa xmmword ptr [rsp], xmm6 ; save every XMM register above xmm5 that this routine overwrites
+        movdqa xmmword ptr [rsp+10H], xmm7
+        movdqa xmmword ptr [rsp+20H], xmm8
+        movdqa xmmword ptr [rsp+30H], xmm9
+        movdqa xmmword ptr [rsp+40H], xmm11
+        movdqa xmmword ptr [rsp+50H], xmm14
+        movdqa xmmword ptr [rsp+60H], xmm15
+        movups xmm0, xmmword ptr [rcx] ; state row 0: first 16 bytes at rcx
+        movups xmm1, xmmword ptr [rcx+10H] ; state row 1: next 16 bytes at rcx
+        movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2: the IV constants
+        movzx eax, byte ptr [rsp+0A0H] ; byte-sized stack argument (presumably flags - confirm against the C prototype)
+        movzx r8d, r8b ; only the low byte of r8 is used
+        mov r10, qword ptr [rsp+0A8H] ; next stack argument: pointer that receives the 64 output bytes
+        shl rax, 32
+        add r8, rax ; r8 = low byte of r8 | stack byte << 32
+        movd xmm3, r9
+        movd xmm4, r8
+        punpcklqdq xmm3, xmm4 ; state row 3: low qword = r9, high qword = packed r8
+        movups xmm4, xmmword ptr [rdx] ; load the 64-byte block (unaligned)
+        movups xmm5, xmmword ptr [rdx+10H]
+        movaps xmm8, xmm4
+        shufps xmm4, xmm5, 136 ; even-indexed dwords of the first 32 bytes
+        shufps xmm8, xmm5, 221 ; odd-indexed dwords of the first 32 bytes
+        movaps xmm5, xmm8
+        movups xmm6, xmmword ptr [rdx+20H]
+        movups xmm7, xmmword ptr [rdx+30H]
+        movaps xmm8, xmm6
+        shufps xmm6, xmm7, 136
+        pshufd xmm6, xmm6, 93H
+        shufps xmm8, xmm7, 221
+        pshufd xmm7, xmm8, 93H
+        movaps xmm14, xmmword ptr [ROT8] ; pshufb mask: rotate each dword by 8 bits
+        movaps xmm15, xmmword ptr [ROT16] ; pshufb mask: rotate each dword by 16 bits
+        mov al, 7 ; round counter: 7 rounds
+@@: ; top of the round loop (instruction-for-instruction the same round as in blake3_compress_in_place_sse41)
+        paddd xmm0, xmm4
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm5
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 93H
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 39H
+        paddd xmm0, xmm6
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm15
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 20
+        psrld xmm11, 12
+        por xmm1, xmm11
+        paddd xmm0, xmm7
+        paddd xmm0, xmm1
+        pxor xmm3, xmm0
+        pshufb xmm3, xmm14
+        paddd xmm2, xmm3
+        pxor xmm1, xmm2
+        movdqa xmm11, xmm1
+        pslld xmm1, 25
+        psrld xmm11, 7
+        por xmm1, xmm11
+        pshufd xmm0, xmm0, 39H
+        pshufd xmm3, xmm3, 4EH
+        pshufd xmm2, xmm2, 93H
+        dec al
+        jz @F ; all 7 rounds done: skip the message permutation
+        movdqa xmm8, xmm4 ; permute the message registers xmm4-xmm7 for the next round
+        shufps xmm8, xmm5, 214
+        pshufd xmm9, xmm4, 0FH
+        pshufd xmm4, xmm8, 39H
+        movdqa xmm8, xmm6
+        shufps xmm8, xmm7, 250
+        pblendw xmm9, xmm8, 0CCH
+        movdqa xmm8, xmm7
+        punpcklqdq xmm8, xmm5
+        pblendw xmm8, xmm6, 0C0H
+        pshufd xmm8, xmm8, 78H
+        punpckhdq xmm5, xmm7
+        punpckldq xmm6, xmm5
+        pshufd xmm7, xmm6, 1EH
+        movdqa xmm5, xmm9
+        movdqa xmm6, xmm8
+        jmp @B
+@@: ; rounds finished
+        movdqu xmm4, xmmword ptr [rcx] ; reload the original 32 input-state bytes
+        movdqu xmm5, xmmword ptr [rcx+10H]
+        pxor xmm0, xmm2 ; output bytes 0-31: rows 0/1 xor rows 2/3
+        pxor xmm1, xmm3
+        pxor xmm2, xmm4 ; output bytes 32-63: rows 2/3 xor the original input state
+        pxor xmm3, xmm5
+        movups xmmword ptr [r10], xmm0
+        movups xmmword ptr [r10+10H], xmm1
+        movups xmmword ptr [r10+20H], xmm2
+        movups xmmword ptr [r10+30H], xmm3
+        movdqa xmm6, xmmword ptr [rsp] ; restore the saved XMM registers
+        movdqa xmm7, xmmword ptr [rsp+10H]
+        movdqa xmm8, xmmword ptr [rsp+20H]
+        movdqa xmm9, xmmword ptr [rsp+30H]
+        movdqa xmm11, xmmword ptr [rsp+40H]
+        movdqa xmm14, xmmword ptr [rsp+50H]
+        movdqa xmm15, xmmword ptr [rsp+60H]
+        add rsp, 120
+        ret
+_blake3_compress_xof_sse41 ENDP
+blake3_compress_xof_sse41 ENDP
+
+_TEXT ENDS
+
+
+_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constants referenced by the routines above
+ALIGN 64
+BLAKE3_IV: ; the four IV dwords, loaded as state row 2
+        dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
+
+ADD0:
+        dd 0, 1, 2, 3
+
+ADD1:
+        dd 4 dup (4)
+
+BLAKE3_IV_0: ; each IV dword repeated across all four lanes
+        dd 4 dup (6A09E667H)
+
+BLAKE3_IV_1:
+        dd 4 dup (0BB67AE85H)
+
+BLAKE3_IV_2:
+        dd 4 dup (3C6EF372H)
+
+BLAKE3_IV_3:
+        dd 4 dup (0A54FF53AH)
+
+BLAKE3_BLOCK_LEN: ; the value 64 in all four lanes
+        dd 4 dup (64)
+
+ROT16: ; pshufb byte-shuffle mask: swaps the 16-bit halves of each dword (rotate by 16)
+        db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+
+ROT8: ; pshufb byte-shuffle mask: rotates each dword right by one byte
+        db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+
+CMP_MSB_MASK: ; sign-bit mask xor-ed into both operands before pcmpgtd, turning the signed compare into an unsigned one
+        dd 8 dup(80000000H)
+
+_RDATA ENDS
+END
+
diff --git
a/third-party/blake3/c/example.c b/third-party/blake3/c/example.c new file mode 100644 index 00000000..ee8430b6 --- /dev/null +++ b/third-party/blake3/c/example.c @@ -0,0 +1,37 @@ +#include "blake3.h" +#include +#include +#include +#include +#include + +int main(void) { + // Initialize the hasher. + blake3_hasher hasher; + blake3_hasher_init(&hasher); + + // Read input bytes from stdin. + unsigned char buf[65536]; + while (1) { + ssize_t n = read(STDIN_FILENO, buf, sizeof(buf)); + if (n > 0) { + blake3_hasher_update(&hasher, buf, n); + } else if (n == 0) { + break; // end of file + } else { + fprintf(stderr, "read failed: %s\n", strerror(errno)); + exit(1); + } + } + + // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes. + uint8_t output[BLAKE3_OUT_LEN]; + blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN); + + // Print the hash as hexadecimal. + for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) { + printf("%02x", output[i]); + } + printf("\n"); + return 0; +} diff --git a/third-party/blake3/c/libblake3.pc.in b/third-party/blake3/c/libblake3.pc.in new file mode 100644 index 00000000..9a5f21dc --- /dev/null +++ b/third-party/blake3/c/libblake3.pc.in @@ -0,0 +1,12 @@ +prefix="@CMAKE_INSTALL_PREFIX@" +exec_prefix="${prefix}" +libdir="${prefix}/@CMAKE_INSTALL_LIBDIR@" +includedir="${prefix}/@CMAKE_INSTALL_INCLUDEDIR@" + +Name: @PROJECT_NAME@ +Description: @PROJECT_DESCRIPTION@ +Version: @PROJECT_VERSION@ + +Requires: +Libs: -L"${libdir}" -lblake3 +Cflags: -I"${includedir}" @BLAKE3_PKGCONFIG_CFLAGS@ diff --git a/third-party/blake3/c/main.c b/third-party/blake3/c/main.c new file mode 100644 index 00000000..77cab58f --- /dev/null +++ b/third-party/blake3/c/main.c @@ -0,0 +1,166 @@ +/* + * This main file is intended for testing via `make test`. It does not build in + * other settings. See README.md in this directory for examples of how to build + * C code. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "blake3.h" +#include "blake3_impl.h" + +#define HASH_MODE 0 +#define KEYED_HASH_MODE 1 +#define DERIVE_KEY_MODE 2 + +static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) { + if ('0' <= c && c <= '9') { + *value = c - '0'; + *valid = true; + } else if ('a' <= c && c <= 'f') { + *value = 10 + c - 'a'; + *valid = true; + } else { + *valid = false; + } +} + +static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) { + size_t hex_len = strlen(hex_key); + if (hex_len != 64) { + fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n", + hex_len); + return 1; + } + for (size_t i = 0; i < 64; i++) { + uint8_t value; + bool valid; + hex_char_value(hex_key[i], &value, &valid); + if (!valid) { + fprintf(stderr, "Invalid hex char.\n"); + return 1; + } + if (i % 2 == 0) { + out[i / 2] = 0; + value <<= 4; + } + out[i / 2] += value; + } + return 0; +} + +/* A little repetition here */ +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... 
*/ + UNDEFINED = 1 << 30 +}; + +extern enum cpu_feature g_cpu_features; +enum cpu_feature get_cpu_features(void); + +int main(int argc, char **argv) { + size_t out_len = BLAKE3_OUT_LEN; + uint8_t key[BLAKE3_KEY_LEN]; + char *context = ""; + uint8_t mode = HASH_MODE; + while (argc > 1) { + if (argc <= 2) { + fprintf(stderr, "Odd number of arguments.\n"); + return 1; + } + if (strcmp("--length", argv[1]) == 0) { + char *endptr = NULL; + errno = 0; + unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10); + if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] || + *endptr != 0) { + fprintf(stderr, "Bad length argument.\n"); + return 1; + } + out_len = (size_t)out_len_ll; + } else if (strcmp("--keyed", argv[1]) == 0) { + mode = KEYED_HASH_MODE; + int ret = parse_key(argv[2], key); + if (ret != 0) { + return ret; + } + } else if (strcmp("--derive-key", argv[1]) == 0) { + mode = DERIVE_KEY_MODE; + context = argv[2]; + } else { + fprintf(stderr, "Unknown flag.\n"); + return 1; + } + argc -= 2; + argv += 2; + } + + /* + * We're going to hash the input multiple times, so we need to buffer it all. + * This is just for test cases, so go ahead and assume that the input is less + * than 1 MiB. 
+ */ + size_t buf_capacity = 1 << 20; + uint8_t *buf = malloc(buf_capacity); + assert(buf != NULL); + size_t buf_len = 0; + while (1) { + size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin); + if (n == 0) { + break; + } + buf_len += n; + assert(buf_len < buf_capacity); + } + + const int mask = get_cpu_features(); + int feature = 0; + do { + fprintf(stderr, "Testing 0x%08X\n", feature); + g_cpu_features = feature; + blake3_hasher hasher; + switch (mode) { + case HASH_MODE: + blake3_hasher_init(&hasher); + break; + case KEYED_HASH_MODE: + blake3_hasher_init_keyed(&hasher, key); + break; + case DERIVE_KEY_MODE: + blake3_hasher_init_derive_key(&hasher, context); + break; + default: + abort(); + } + + blake3_hasher_update(&hasher, buf, buf_len); + + /* TODO: An incremental output reader API to avoid this allocation. */ + uint8_t *out = malloc(out_len); + if (out_len > 0 && out == NULL) { + fprintf(stderr, "malloc() failed.\n"); + return 1; + } + blake3_hasher_finalize(&hasher, out, out_len); + for (size_t i = 0; i < out_len; i++) { + printf("%02x", out[i]); + } + printf("\n"); + free(out); + feature = (feature - mask) & mask; + } while (feature != 0); + free(buf); + return 0; +} diff --git a/third-party/blake3/c/test.py b/third-party/blake3/c/test.py new file mode 100755 index 00000000..98b1c3df --- /dev/null +++ b/third-party/blake3/c/test.py @@ -0,0 +1,97 @@ +#! /usr/bin/env python3 + +from binascii import hexlify +import json +from os import path +import subprocess + +HERE = path.dirname(__file__) +TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json") +TEST_VECTORS = json.load(open(TEST_VECTORS_PATH)) + + +def run_blake3(args, input): + output = subprocess.run([path.join(HERE, "blake3")] + args, + input=input, + stdout=subprocess.PIPE, + check=True) + return output.stdout.decode().strip() + + +# Fill the input with a repeating byte pattern. We use a cycle length of 251, +# because that's the largest prime number less than 256. 
This makes it unlikely +# that swapping any two adjacent input blocks or chunks will give the same +# answer. +def make_test_input(length): + i = 0 + buf = bytearray() + while len(buf) < length: + buf.append(i) + i = (i + 1) % 251 + return buf + + +def main(): + for case in TEST_VECTORS["cases"]: + input_len = case["input_len"] + input = make_test_input(input_len) + hex_key = hexlify(TEST_VECTORS["key"].encode()) + context_string = TEST_VECTORS["context_string"] + expected_hash_xof = case["hash"] + expected_hash = expected_hash_xof[:64] + expected_keyed_hash_xof = case["keyed_hash"] + expected_keyed_hash = expected_keyed_hash_xof[:64] + expected_derive_key_xof = case["derive_key"] + expected_derive_key = expected_derive_key_xof[:64] + + # Test the default hash. + test_hash = run_blake3([], input) + for line in test_hash.splitlines(): + assert expected_hash == line, \ + "hash({}): {} != {}".format(input_len, expected_hash, line) + + # Test the extended hash. + xof_len = len(expected_hash_xof) // 2 + test_hash_xof = run_blake3(["--length", str(xof_len)], input) + for line in test_hash_xof.splitlines(): + assert expected_hash_xof == line, \ + "hash_xof({}): {} != {}".format( + input_len, expected_hash_xof, line) + + # Test the default keyed hash. + test_keyed_hash = run_blake3(["--keyed", hex_key], input) + for line in test_keyed_hash.splitlines(): + assert expected_keyed_hash == line, \ + "keyed_hash({}): {} != {}".format( + input_len, expected_keyed_hash, line) + + # Test the extended keyed hash. + xof_len = len(expected_keyed_hash_xof) // 2 + test_keyed_hash_xof = run_blake3( + ["--keyed", hex_key, "--length", + str(xof_len)], input) + for line in test_keyed_hash_xof.splitlines(): + assert expected_keyed_hash_xof == line, \ + "keyed_hash_xof({}): {} != {}".format( + input_len, expected_keyed_hash_xof, line) + + # Test the default derive key.
+ test_derive_key = run_blake3(["--derive-key", context_string], input) + for line in test_derive_key.splitlines(): + assert expected_derive_key == line, \ + "derive_key({}): {} != {}".format( + input_len, expected_derive_key, line) + + # Test the extended derive key. + xof_len = len(expected_derive_key_xof) // 2 + test_derive_key_xof = run_blake3( + ["--derive-key", context_string, "--length", + str(xof_len)], input) + for line in test_derive_key_xof.splitlines(): + assert expected_derive_key_xof == line, \ + "derive_key_xof({}): {} != {}".format( + input_len, expected_derive_key_xof, line) + + +if __name__ == "__main__": + main() diff --git a/third-party/blake3/media/B3.svg b/third-party/blake3/media/B3.svg new file mode 100644 index 00000000..a50da0ce --- /dev/null +++ b/third-party/blake3/media/B3.svg @@ -0,0 +1,70 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + diff --git a/third-party/blake3/media/BLAKE3.svg b/third-party/blake3/media/BLAKE3.svg new file mode 100644 index 00000000..2d50c2d3 --- /dev/null +++ b/third-party/blake3/media/BLAKE3.svg @@ -0,0 +1,85 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + diff --git a/third-party/blake3/media/speed.svg b/third-party/blake3/media/speed.svg new file mode 100644 index 00000000..7bd65ca3 --- /dev/null +++ b/third-party/blake3/media/speed.svg @@ -0,0 +1,1474 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/third-party/blake3/reference_impl/Cargo.toml b/third-party/blake3/reference_impl/Cargo.toml new file mode 100644 index 00000000..e269fe9d --- /dev/null +++ b/third-party/blake3/reference_impl/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "reference_impl" +version = "0.0.0" +edition = "2021" + +[lib] +name = "reference_impl" +path = "reference_impl.rs" diff --git a/third-party/blake3/reference_impl/README.md b/third-party/blake3/reference_impl/README.md new file mode 100644 index 00000000..6d123f3f --- /dev/null +++ b/third-party/blake3/reference_impl/README.md @@ -0,0 +1,14 @@ +This is the reference implementation of BLAKE3. It is used for testing and +as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +discusses this implementation. You can render docs for this implementation +by running `cargo doc --open` in this directory. + +This implementation is a single file +([`reference_impl.rs`](reference_impl.rs)) with no dependencies. It is +not optimized for performance. 
+ +There are ports of this reference implementation to other languages: + +- [C](https://github.com/oconnor663/blake3_reference_impl_c) +- [Python](https://github.com/oconnor663/pure_python_blake3) diff --git a/third-party/blake3/reference_impl/reference_impl.rs b/third-party/blake3/reference_impl/reference_impl.rs new file mode 100644 index 00000000..72ad525c --- /dev/null +++ b/third-party/blake3/reference_impl/reference_impl.rs @@ -0,0 +1,383 @@ +//! This is the reference implementation of BLAKE3. It is used for testing and +//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. 
+fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. + g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let mut state = [ + chaining_value[0], + chaining_value[1], + chaining_value[2], + chaining_value[3], + chaining_value[4], + chaining_value[5], + chaining_value[6], + chaining_value[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len, + flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= 
state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + debug_assert_eq!(bytes.len(), 4 * words.len()); + for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { + *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. 
+ for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key_words, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. + if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. 
+ let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + counter: self.chunk_counter, + block_len: self.block_len as u32, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key_words, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. +pub struct Hasher { + chunk_state: ChunkState, + key_words: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key_words: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key_words, 0, flags), + key_words, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. 
+ pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. + fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. 
+ pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. + let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key_words, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/third-party/blake3/src/ffi_avx2.rs b/third-party/blake3/src/ffi_avx2.rs new file mode 100644 index 00000000..33961e9d --- /dev/null +++ b/third-party/blake3/src/ffi_avx2.rs @@ -0,0 +1,63 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Note that there is no AVX2 implementation of compress_in_place or +// compress_xof. + +// Unsafe because this may only be called on platforms supporting AVX2. 
+pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_avx2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/ffi_avx512.rs b/third-party/blake3/src/ffi_avx512.rs new file mode 100644 index 00000000..884f4813 --- /dev/null +++ b/third-party/blake3/src/ffi_avx512.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting AVX-512.
+pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_avx512( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting AVX-512. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_avx512( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_avx512( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_avx512( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_avx512( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx512_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/ffi_neon.rs
b/third-party/blake3/src/ffi_neon.rs new file mode 100644 index 00000000..54d07a4d --- /dev/null +++ b/third-party/blake3/src/ffi_neon.rs @@ -0,0 +1,82 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting NEON. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_neon( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +// blake3_neon.c normally depends on blake3_portable.c, because the NEON +// implementation only provides 4x compression, and it relies on the portable +// implementation for 1x compression. However, we expose the portable Rust +// implementation here instead, to avoid linking in unnecessary code. +#[no_mangle] +pub extern "C" fn blake3_compress_in_place_portable( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, +) { + unsafe { + crate::portable::compress_in_place( + &mut *(cv as *mut [u32; 8]), + &*(block as *const [u8; 64]), + block_len, + counter, + flags, + ) + } +} + +pub mod ffi { + extern "C" { + pub fn blake3_hash_many_neon( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_hash_many() { + // This entire file is gated on feature="neon", so NEON support is + // assumed here.
+ crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/ffi_sse2.rs b/third-party/blake3/src/ffi_sse2.rs new file mode 100644 index 00000000..1c5da81f --- /dev/null +++ b/third-party/blake3/src/ffi_sse2.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse2(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse2( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE2. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here.
+ assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse2( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse2( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse2( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse2( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/ffi_sse41.rs b/third-party/blake3/src/ffi_sse41.rs new file mode 100644 index 00000000..62989c5e --- /dev/null +++ b/third-party/blake3/src/ffi_sse41.rs @@ -0,0 +1,114 @@ +use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + ffi::blake3_compress_in_place_sse41(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags) +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. 
+pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut out = [0u8; 64]; + ffi::blake3_compress_xof_sse41( + cv.as_ptr(), + block.as_ptr(), + block_len, + counter, + flags, + out.as_mut_ptr(), + ); + out +} + +// Unsafe because this may only be called on platforms supporting SSE4.1. +pub unsafe fn hash_many<const N: usize>( + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + // The Rust hash_many implementations do bounds checking on the `out` + // array, but the C implementations don't. Even though this is an unsafe + // function, assert the bounds here. + assert!(out.len() >= inputs.len() * OUT_LEN); + ffi::blake3_hash_many_sse41( + inputs.as_ptr() as *const *const u8, + inputs.len(), + N / BLOCK_LEN, + key.as_ptr(), + counter, + increment_counter.yes(), + flags, + flags_start, + flags_end, + out.as_mut_ptr(), + ) +} + +pub mod ffi { + extern "C" { + pub fn blake3_compress_in_place_sse41( + cv: *mut u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + ); + pub fn blake3_compress_xof_sse41( + cv: *const u32, + block: *const u8, + block_len: u8, + counter: u64, + flags: u8, + out: *mut u8, + ); + pub fn blake3_hash_many_sse41( + inputs: *const *const u8, + num_inputs: usize, + blocks: usize, + key: *const u32, + counter: u64, + increment_counter: bool, + flags: u8, + flags_start: u8, + flags_end: u8, + out: *mut u8, + ); + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/guts.rs b/third-party/blake3/src/guts.rs
new file mode 100644 index 00000000..ecde3261 --- /dev/null +++ b/third-party/blake3/src/guts.rs @@ -0,0 +1,101 @@ +//! This undocumented and unstable module is for use cases like the `bao` crate, +//! which need to traverse the BLAKE3 Merkle tree and work with chunk and parent +//! chaining values directly. There might be breaking changes to this module +//! between patch versions. +//! +//! We could stabilize something like this module in the future. If you have a +//! use case for it, please let us know by filing a GitHub issue. + +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; + +#[derive(Clone, Debug)] +pub struct ChunkState(crate::ChunkState); + +impl ChunkState { + // Currently this type only supports the regular hash mode. If an + // incremental user needs keyed_hash or derive_key, we can add that. + pub fn new(chunk_counter: u64) -> Self { + Self(crate::ChunkState::new( + crate::IV, + chunk_counter, + 0, + crate::platform::Platform::detect(), + )) + } + + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.0.update(input); + self + } + + pub fn finalize(&self, is_root: bool) -> crate::Hash { + let output = self.0.output(); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } + } +} + +// As above, this currently assumes the regular hash mode. If an incremental +// user needs keyed_hash or derive_key, we can add that. 
+pub fn parent_cv( + left_child: &crate::Hash, + right_child: &crate::Hash, + is_root: bool, +) -> crate::Hash { + let output = crate::parent_node_output( + left_child.as_bytes(), + right_child.as_bytes(), + crate::IV, + 0, + crate::platform::Platform::detect(), + ); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_chunk() { + assert_eq!( + crate::hash(b"foo"), + ChunkState::new(0).update(b"foo").finalize(true) + ); + } + + #[test] + fn test_parents() { + let mut hasher = crate::Hasher::new(); + let mut buf = [0; crate::CHUNK_LEN]; + + buf[0] = 'a' as u8; + hasher.update(&buf); + let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); + + buf[0] = 'b' as u8; + hasher.update(&buf); + let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); + + hasher.update(b"c"); + let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); + + let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); + let root = parent_cv(&parent, &chunk2_cv, true); + assert_eq!(hasher.finalize(), root); + } +} diff --git a/third-party/blake3/src/join.rs b/third-party/blake3/src/join.rs new file mode 100644 index 00000000..227216a3 --- /dev/null +++ b/third-party/blake3/src/join.rs @@ -0,0 +1,92 @@ +//! The multi-threading abstractions used by `Hasher::update_with_join`. +//! +//! Different implementations of the `Join` trait determine whether +//! `Hasher::update_with_join` performs multi-threading on sufficiently large +//! inputs. The `SerialJoin` implementation is single-threaded, and the +//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. +//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) +//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` +//! internally. +//! +//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and +//! 
`RayonJoin` is the only non-trivial implementation. Previously this trait +//! was public, but currently it's been re-privatized, as it's both 1) of no +//! value to most callers and 2) a pretty big implementation detail to commit +//! to. +//! +//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html + +/// The trait that abstracts over single-threaded and multi-threaded recursion. +/// +/// See the [`join` module docs](index.html) for more details. +pub trait Join { + fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send; +} + +/// The trivial, serial implementation of `Join`. The left and right sides are +/// executed one after the other, on the calling thread. The standalone hashing +/// functions and the `Hasher::update` method use this implementation +/// internally. +/// +/// See the [`join` module docs](index.html) for more details. +pub enum SerialJoin {} + +impl Join for SerialJoin { + #[inline] + fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + (oper_a(), oper_b()) + } +} + +/// The Rayon-based implementation of `Join`. The left and right sides are +/// executed on the Rayon thread pool, potentially in parallel. This +/// implementation is gated by the `rayon` feature, which is off by default. +/// +/// See the [`join` module docs](index.html) for more details. 
+#[cfg(feature = "rayon")] +pub enum RayonJoin {} + +#[cfg(feature = "rayon")] +impl Join for RayonJoin { + #[inline] + fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + rayon::join(oper_a, oper_b) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_serial_join() { + let oper_a = || 1 + 1; + let oper_b = || 2 + 2; + assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b)); + } + + #[test] + #[cfg(feature = "rayon")] + fn test_rayon_join() { + let oper_a = || 1 + 1; + let oper_b = || 2 + 2; + assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b)); + } +} diff --git a/third-party/blake3/src/lib.rs b/third-party/blake3/src/lib.rs new file mode 100644 index 00000000..ac61fb27 --- /dev/null +++ b/third-party/blake3/src/lib.rs @@ -0,0 +1,1479 @@ +//! The official Rust implementation of the [BLAKE3] cryptographic hash +//! function. +//! +//! # Examples +//! +//! ``` +//! # fn main() -> Result<(), Box> { +//! // Hash an input all at once. +//! let hash1 = blake3::hash(b"foobarbaz"); +//! +//! // Hash an input incrementally. +//! let mut hasher = blake3::Hasher::new(); +//! hasher.update(b"foo"); +//! hasher.update(b"bar"); +//! hasher.update(b"baz"); +//! let hash2 = hasher.finalize(); +//! assert_eq!(hash1, hash2); +//! +//! // Extended output. OutputReader also implements Read and Seek. +//! # #[cfg(feature = "std")] { +//! let mut output = [0; 1000]; +//! let mut output_reader = hasher.finalize_xof(); +//! output_reader.fill(&mut output); +//! assert_eq!(hash1, output[..32]); +//! # } +//! +//! // Print a hash as hex. +//! println!("{}", hash1); +//! # Ok(()) +//! # } +//! ``` +//! +//! # Cargo Features +//! +//! The `std` feature (the only feature enabled by default) is required for +//! implementations of the [`Write`] and [`Seek`] traits, and also for runtime +//! CPU feature detection on x86. If this feature is disabled, the only way to +//! 
use the x86 SIMD implementations is to enable the corresponding instruction +//! sets globally, with e.g. `RUSTFLAGS="-C target-cpu=native"`. The resulting +//! binary will not be portable to other machines. +//! +//! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds +//! the [`Hasher::update_rayon`] method, for multithreaded hashing. However, +//! even if this feature is enabled, all other APIs remain single-threaded. +//! +//! The NEON implementation is enabled by default for AArch64 but requires the +//! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and +//! enabling this feature will produce a binary that's not portable to CPUs +//! without NEON support. +//! +//! The `traits-preview` feature enables implementations of traits from the +//! RustCrypto [`digest`] crate, and re-exports that crate as +//! `traits::digest`. However, the traits aren't stable, and they're expected to +//! change in incompatible ways before that crate reaches 1.0. For that reason, +//! this crate makes no SemVer guarantees for this feature, and callers who use +//! it should expect breaking changes between patch versions. (The "-preview" +//! feature name follows the conventions of the RustCrypto [`signature`] crate.) +//! +//! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon +//! [BLAKE3]: https://blake3.io +//! [Rayon]: https://github.com/rayon-rs/rayon +//! [docs.rs]: https://docs.rs/ +//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html +//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html +//! [`digest`]: https://crates.io/crates/digest +//! [`signature`]: https://crates.io/crates/signature + +#![cfg_attr(not(feature = "std"), no_std)] + +#[cfg(test)] +mod test; + +// The guts module is for incremental use cases like the `bao` crate that need +// to explicitly compute chunk and parent chaining values. 
It is semi-stable +// and likely to keep working, but largely undocumented and not intended for +// widespread use. +#[doc(hidden)] +pub mod guts; + +/// Undocumented and unstable, for benchmarks only. +#[doc(hidden)] +pub mod platform; + +// Platform-specific implementations of the compression function. These +// BLAKE3-specific cfg flags are set in build.rs. +#[cfg(blake3_avx2_rust)] +#[path = "rust_avx2.rs"] +mod avx2; +#[cfg(blake3_avx2_ffi)] +#[path = "ffi_avx2.rs"] +mod avx2; +#[cfg(blake3_avx512_ffi)] +#[path = "ffi_avx512.rs"] +mod avx512; +#[cfg(blake3_neon)] +#[path = "ffi_neon.rs"] +mod neon; +mod portable; +#[cfg(blake3_sse2_rust)] +#[path = "rust_sse2.rs"] +mod sse2; +#[cfg(blake3_sse2_ffi)] +#[path = "ffi_sse2.rs"] +mod sse2; +#[cfg(blake3_sse41_rust)] +#[path = "rust_sse41.rs"] +mod sse41; +#[cfg(blake3_sse41_ffi)] +#[path = "ffi_sse41.rs"] +mod sse41; + +#[cfg(feature = "traits-preview")] +pub mod traits; + +mod join; + +use arrayref::{array_mut_ref, array_ref}; +use arrayvec::{ArrayString, ArrayVec}; +use core::cmp; +use core::fmt; +use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; + +/// The number of bytes in a [`Hash`](struct.Hash.html), 32. +pub const OUT_LEN: usize = 32; + +/// The number of bytes in a key, 32. +pub const KEY_LEN: usize = 32; + +const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 +use guts::{BLOCK_LEN, CHUNK_LEN}; + +// While iterating the compression function within a chunk, the CV is +// represented as words, to avoid doing two extra endianness conversions for +// each compression in the portable implementation. But the hash_many interface +// needs to hash both input bytes and parent nodes, so its better for its +// output CVs to be represented as bytes. 
+type CVWords = [u32; 8]; +type CVBytes = [u8; 32]; // little-endian + +const IV: &CVWords = &[ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// These are the internal flags that we use to domain separate root/non-root, +// chunk/parent, and chunk beginning/middle/end. These get set at the high end +// of the block flags word in the compression function, so their values start +// high and go down. +const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +#[inline] +fn counter_low(counter: u64) -> u32 { + counter as u32 +} + +#[inline] +fn counter_high(counter: u64) -> u32 { + (counter >> 32) as u32 +} + +/// An output of the default size, 32 bytes, which provides constant-time +/// equality checking. +/// +/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides +/// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and +/// `[u8; 32]`. However, byte arrays and slices don't provide constant-time +/// equality checking, which is often a security requirement in software that +/// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to +/// avoid situations where a type conversion happens implicitly and the +/// constant-time property is accidentally lost. 
+/// +/// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to +/// and from hexadecimal. It also implements [`Display`] and [`FromStr`]. +/// +/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html +/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html +/// [`as_bytes`]: #method.as_bytes +/// [`from_bytes`]: #method.from_bytes +/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html +/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html +/// [`to_hex`]: #method.to_hex +/// [`from_hex`]: #method.from_hex +/// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html +/// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html +#[derive(Clone, Copy, Hash)] +pub struct Hash([u8; OUT_LEN]); + +impl Hash { + /// The raw bytes of the `Hash`. Note that byte arrays don't provide + /// constant-time equality checking, so if you need to compare hashes, + /// prefer the `Hash` type. + #[inline] + pub const fn as_bytes(&self) -> &[u8; OUT_LEN] { + &self.0 + } + + /// Create a `Hash` from its raw bytes representation. + pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self { + Self(bytes) + } + + /// Encode a `Hash` in lowercase hexadecimal. + /// + /// The returned [`ArrayString`] is a fixed size and doesn't allocate memory + /// on the heap. Note that [`ArrayString`] doesn't provide constant-time + /// equality checking, so if you need to compare hashes, prefer the `Hash` + /// type. + /// + /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html + pub fn to_hex(&self) -> ArrayString<{ 2 * OUT_LEN }> { + let mut s = ArrayString::new(); + let table = b"0123456789abcdef"; + for &b in self.0.iter() { + s.push(table[(b >> 4) as usize] as char); + s.push(table[(b & 0xf) as usize] as char); + } + s + } + + /// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII + /// bytes are supported. 
+ /// + /// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'` + /// results in an error. An input length other than 64 also results in an + /// error. + /// + /// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")` + /// is equivalent to `"...".parse()`. + pub fn from_hex(hex: impl AsRef<[u8]>) -> Result { + fn hex_val(byte: u8) -> Result { + match byte { + b'A'..=b'F' => Ok(byte - b'A' + 10), + b'a'..=b'f' => Ok(byte - b'a' + 10), + b'0'..=b'9' => Ok(byte - b'0'), + _ => Err(HexError(HexErrorInner::InvalidByte(byte))), + } + } + let hex_bytes: &[u8] = hex.as_ref(); + if hex_bytes.len() != OUT_LEN * 2 { + return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len()))); + } + let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN]; + for i in 0..OUT_LEN { + hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?; + } + Ok(Hash::from(hash_bytes)) + } +} + +impl From<[u8; OUT_LEN]> for Hash { + #[inline] + fn from(bytes: [u8; OUT_LEN]) -> Self { + Self::from_bytes(bytes) + } +} + +impl From for [u8; OUT_LEN] { + #[inline] + fn from(hash: Hash) -> Self { + hash.0 + } +} + +impl core::str::FromStr for Hash { + type Err = HexError; + + fn from_str(s: &str) -> Result { + Hash::from_hex(s) + } +} + +/// This implementation is constant-time. +impl PartialEq for Hash { + #[inline] + fn eq(&self, other: &Hash) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, &other.0) + } +} + +/// This implementation is constant-time. +impl PartialEq<[u8; OUT_LEN]> for Hash { + #[inline] + fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, other) + } +} + +/// This implementation is constant-time if the target is 32 bytes long. 
+impl PartialEq<[u8]> for Hash { + #[inline] + fn eq(&self, other: &[u8]) -> bool { + constant_time_eq::constant_time_eq(&self.0, other) + } +} + +impl Eq for Hash {} + +impl fmt::Display for Hash { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // Formatting field as `&str` to reduce code size since the `Debug` + // dynamic dispatch table for `&str` is likely needed elsewhere already, + // but that for `ArrayString<[u8; 64]>` is not. + let hex = self.to_hex(); + let hex: &str = hex.as_str(); + + f.write_str(hex) + } +} + +impl fmt::Debug for Hash { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // Formatting field as `&str` to reduce code size since the `Debug` + // dynamic dispatch table for `&str` is likely needed elsewhere already, + // but that for `ArrayString<[u8; 64]>` is not. + let hex = self.to_hex(); + let hex: &str = hex.as_str(); + + f.debug_tuple("Hash").field(&hex).finish() + } +} + +/// The error type for [`Hash::from_hex`]. +/// +/// The `.to_string()` representation of this error currently distinguishes between bad length +/// errors and bad character errors. This is to help with logging and debugging, but it isn't a +/// stable API detail, and it may change at any time. +#[derive(Clone, Debug)] +pub struct HexError(HexErrorInner); + +#[derive(Clone, Debug)] +enum HexErrorInner { + InvalidByte(u8), + InvalidLen(usize), +} + +impl fmt::Display for HexError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.0 { + HexErrorInner::InvalidByte(byte) => { + if byte < 128 { + write!(f, "invalid hex character: {:?}", byte as char) + } else { + write!(f, "invalid hex character: 0x{:x}", byte) + } + } + HexErrorInner::InvalidLen(len) => { + write!(f, "expected 64 hex bytes, received {}", len) + } + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for HexError {} + +// Each chunk or parent node can produce either a 32-byte chaining value or, by +// setting the ROOT flag, any number of final output bytes. 
The Output struct +// captures the state just prior to choosing between those two possibilities. +#[derive(Clone)] +struct Output { + input_chaining_value: CVWords, + block: [u8; 64], + block_len: u8, + counter: u64, + flags: u8, + platform: Platform, +} + +impl Output { + fn chaining_value(&self) -> CVBytes { + let mut cv = self.input_chaining_value; + self.platform.compress_in_place( + &mut cv, + &self.block, + self.block_len, + self.counter, + self.flags, + ); + platform::le_bytes_from_words_32(&cv) + } + + fn root_hash(&self) -> Hash { + debug_assert_eq!(self.counter, 0); + let mut cv = self.input_chaining_value; + self.platform + .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); + Hash(platform::le_bytes_from_words_32(&cv)) + } + + fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { + self.platform.compress_xof( + &self.input_chaining_value, + &self.block, + self.block_len, + self.counter, + self.flags | ROOT, + ) + } +} + +#[derive(Clone)] +struct ChunkState { + cv: CVWords, + chunk_counter: u64, + buf: [u8; BLOCK_LEN], + buf_len: u8, + blocks_compressed: u8, + flags: u8, + platform: Platform, +} + +impl ChunkState { + fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { + Self { + cv: *key, + chunk_counter, + buf: [0; BLOCK_LEN], + buf_len: 0, + blocks_compressed: 0, + flags, + platform, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize + } + + fn fill_buf(&mut self, input: &mut &[u8]) { + let want = BLOCK_LEN - self.buf_len as usize; + let take = cmp::min(want, input.len()); + self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); + self.buf_len += take as u8; + *input = &input[take..]; + } + + fn start_flag(&self) -> u8 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + // Try to avoid buffering as much as possible, by compressing directly from + // the input slice when full blocks are available. 
+ fn update(&mut self, mut input: &[u8]) -> &mut Self { + if self.buf_len > 0 { + self.fill_buf(&mut input); + if !input.is_empty() { + debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); + let block_flags = self.flags | self.start_flag(); // borrowck + self.platform.compress_in_place( + &mut self.cv, + &self.buf, + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.buf_len = 0; + self.buf = [0; BLOCK_LEN]; + self.blocks_compressed += 1; + } + } + + while input.len() > BLOCK_LEN { + debug_assert_eq!(self.buf_len, 0); + let block_flags = self.flags | self.start_flag(); // borrowck + self.platform.compress_in_place( + &mut self.cv, + array_ref!(input, 0, BLOCK_LEN), + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.blocks_compressed += 1; + input = &input[BLOCK_LEN..]; + } + + self.fill_buf(&mut input); + debug_assert!(input.is_empty()); + debug_assert!(self.len() <= CHUNK_LEN); + self + } + + fn output(&self) -> Output { + let block_flags = self.flags | self.start_flag() | CHUNK_END; + Output { + input_chaining_value: self.cv, + block: self.buf, + block_len: self.buf_len, + counter: self.chunk_counter, + flags: block_flags, + platform: self.platform, + } + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for ChunkState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("ChunkState") + .field("len", &self.len()) + .field("chunk_counter", &self.chunk_counter) + .field("flags", &self.flags) + .field("platform", &self.platform) + .finish() + } +} + +// IMPLEMENTATION NOTE +// =================== +// The recursive function compress_subtree_wide(), implemented below, is the +// basis of high-performance BLAKE3. We use it both for all-at-once hashing, +// and for the incremental input with Hasher (though we have to be careful with +// subtree boundaries in the incremental case). 
compress_subtree_wide() applies +// several optimizations at the same time: +// - Multithreading with Rayon. +// - Parallel chunk hashing with SIMD. +// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing +// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues +// to benefit from larger inputs, because more levels of the tree benefit can +// use full-width SIMD vectors for parent hashing. Without parallel parent +// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. + +/// Undocumented and unstable, for benchmarks only. +#[doc(hidden)] +#[derive(Clone, Copy)] +pub enum IncrementCounter { + Yes, + No, +} + +impl IncrementCounter { + #[inline] + fn yes(&self) -> bool { + match self { + IncrementCounter::Yes => true, + IncrementCounter::No => false, + } + } +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. + let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+fn compress_chunks_parallel( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert!(!input.is_empty(), "empty chunks below the root"); + debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); + + let mut chunks_exact = input.chunks_exact(CHUNK_LEN); + let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); + for chunk in &mut chunks_exact { + chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); + } + platform.hash_many( + &chunks_array, + key, + chunk_counter, + IncrementCounter::Yes, + flags, + CHUNK_START, + CHUNK_END, + out, + ); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + let chunks_so_far = chunks_array.len(); + if !chunks_exact.remainder().is_empty() { + let counter = chunk_counter + chunks_so_far as u64; + let mut chunk_state = ChunkState::new(key, counter, flags, platform); + chunk_state.update(chunks_exact.remainder()); + *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = + chunk_state.output().chaining_value(); + chunks_so_far + 1 + } else { + chunks_so_far + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. 
+fn compress_parents_parallel( + child_chaining_values: &[u8], + key: &CVWords, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); + let num_children = child_chaining_values.len() / OUT_LEN; + debug_assert!(num_children >= 2, "not enough children"); + debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); + + let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); + // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of + // the requirements of compress_subtree_wide(). + let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); + for parent in &mut parents_exact { + parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); + } + platform.hash_many( + &parents_array, + key, + 0, // Parents always use counter 0. + IncrementCounter::No, + flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out, + ); + + // If there's an odd child left over, it becomes an output. + let parents_so_far = parents_array.len(); + if !parents_exact.remainder().is_empty() { + out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); + parents_so_far + 1 + } else { + parents_so_far + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. 
(If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multithreading parallelism for that update(). +fn compress_subtree_wide( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. This allows Rayon the option of multithreading even the + // 2-chunk case, which can help performance on smaller platforms. + if input.len() <= platform.simd_degree() * CHUNK_LEN { + return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); + let (left, right) = input.split_at(left_len(input.len())); + let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let degree = if left.len() == CHUNK_LEN { + // The "simd_degree=1 and we're at the leaf nodes" case. + debug_assert_eq!(platform.simd_degree(), 1); + 1 + } else { + cmp::max(platform.simd_degree(), 2) + }; + let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); + + // Recurse! 
For update_rayon(), this is where we take advantage of RayonJoin and use multiple + // threads. + let (left_n, right_n) = J::join( + || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), + || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), + ); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + debug_assert_eq!(left_n, degree); + debug_assert!(right_n >= 1 && right_n <= left_n); + if left_n == 1 { + out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); + return 2; + } + + // Otherwise, do one layer of parent node compression. + let num_children = left_n + right_n; + compress_parents_parallel( + &cv_array[..num_children * OUT_LEN], + key, + flags, + platform, + out, + ) +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. 
+fn compress_subtree_to_parent_node( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, +) -> [u8; BLOCK_LEN] { + debug_assert!(input.len() > CHUNK_LEN); + let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let mut num_cvs = + compress_subtree_wide::(input, &key, chunk_counter, flags, platform, &mut cv_array); + debug_assert!(num_cvs >= 2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; + while num_cvs > 2 { + let cv_slice = &cv_array[..num_cvs * OUT_LEN]; + num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); + cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); + } + *array_ref!(cv_array, 0, 2 * OUT_LEN) +} + +// Hash a complete input all at once. Unlike compress_subtree_wide() and +// compress_subtree_to_parent_node(), this function handles the 1 chunk case. +fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { + let platform = Platform::detect(); + + // If the whole subtree is one chunk, hash it directly with a ChunkState. + if input.len() <= CHUNK_LEN { + return ChunkState::new(key, 0, flags, platform) + .update(input) + .output(); + } + + // Otherwise construct an Output object from the parent node returned by + // compress_subtree_to_parent_node(). + Output { + input_chaining_value: *key, + block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), + block_len: BLOCK_LEN as u8, + counter: 0, + flags: flags | PARENT, + platform, + } +} + +/// The default hash function. +/// +/// For an incremental version that accepts multiple writes, see +/// [`Hasher::update`]. +/// +/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and +/// [`OutputReader`]. 
+///
+/// This function is always single-threaded. For multithreading support, see
+/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
+pub fn hash(input: &[u8]) -> Hash {
+    hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
+}
+
+/// The keyed hash function.
+///
+/// This is suitable for use as a message authentication code, for example to
+/// replace an HMAC instance. In that use case, the constant-time equality
+/// checking provided by [`Hash`](struct.Hash.html) is almost always a security
+/// requirement, and callers need to be careful not to compare MACs as raw
+/// bytes.
+///
+/// For output sizes other than 32 bytes, see [`Hasher::new_keyed`],
+/// [`Hasher::finalize_xof`], and [`OutputReader`].
+///
+/// This function is always single-threaded. For multithreading support, see
+/// [`Hasher::new_keyed`] and
+/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
+pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
+    let key_words = platform::words_from_le_bytes_32(key);
+    hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
+}
+
+/// The key derivation function.
+///
+/// Given cryptographic key material of any length and a context string of any
+/// length, this function outputs a 32-byte derived subkey. **The context string
+/// should be hardcoded, globally unique, and application-specific.** A good
+/// default format for such strings is `"[application] [commit timestamp]
+/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`.
+///
+/// Key derivation is important when you want to use the same key in multiple
+/// algorithms or use cases. Using the same key with different cryptographic
+/// algorithms is generally forbidden, and deriving a separate subkey for each
+/// use case protects you from bad interactions. Derived keys also mitigate the
+/// damage from one part of your application accidentally leaking its key.
+/// +/// As a rare exception to that general rule, however, it is possible to use +/// `derive_key` itself with key material that you are already using with +/// another algorithm. You might need to do this if you're adding features to +/// an existing application, which does not yet use key derivation internally. +/// However, you still must not share key material with algorithms that forbid +/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2 +/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). +/// +/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be +/// used with passwords.** Instead, use a dedicated password hash like +/// [Argon2]. Password hashes are entirely different from generic hash +/// functions, with opposite design requirements. +/// +/// For output sizes other than 32 bytes, see [`Hasher::new_derive_key`], +/// [`Hasher::finalize_xof`], and [`OutputReader`]. +/// +/// This function is always single-threaded. For multithreading support, see +/// [`Hasher::new_derive_key`] and +/// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). 
+///
+/// [Argon2]: https://en.wikipedia.org/wiki/Argon2
+pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
+    let context_key =
+        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+            .root_hash();
+    let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
+    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+        .root_hash()
+        .0
+}
+
+fn parent_node_output(
+    left_child: &CVBytes,
+    right_child: &CVBytes,
+    key: &CVWords,
+    flags: u8,
+    platform: Platform,
+) -> Output {
+    let mut block = [0; BLOCK_LEN];
+    block[..32].copy_from_slice(left_child);
+    block[32..].copy_from_slice(right_child);
+    Output {
+        input_chaining_value: *key,
+        block,
+        block_len: BLOCK_LEN as u8,
+        counter: 0,
+        flags: flags | PARENT,
+        platform,
+    }
+}
+
+/// An incremental hash state that can accept any number of writes.
+///
+/// When the `traits-preview` Cargo feature is enabled, this type implements
+/// several commonly used traits from the
+/// [`digest`](https://crates.io/crates/digest) crate. However, those
+/// traits aren't stable, and they're expected to change in incompatible ways
+/// before that crate reaches 1.0. For that reason, this crate makes no SemVer
+/// guarantees for this feature, and callers who use it should expect breaking
+/// changes between patch versions.
+///
+/// When the `rayon` Cargo feature is enabled, the
+/// [`update_rayon`](#method.update_rayon) method is available for multithreaded
+/// hashing.
+///
+/// **Performance note:** The [`update`](#method.update) method can't take full
+/// advantage of SIMD optimizations if its input buffer is too small or oddly
+/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently
+/// supported SIMD instruction sets.
+///
+/// # Examples
+///
+/// ```
+/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+/// // Hash an input incrementally.
+/// let mut hasher = blake3::Hasher::new(); +/// hasher.update(b"foo"); +/// hasher.update(b"bar"); +/// hasher.update(b"baz"); +/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); +/// +/// // Extended output. OutputReader also implements Read and Seek. +/// # #[cfg(feature = "std")] { +/// let mut output = [0; 1000]; +/// let mut output_reader = hasher.finalize_xof(); +/// output_reader.fill(&mut output); +/// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); +/// # } +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] +pub struct Hasher { + key: CVWords, + chunk_state: ChunkState, + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because + // we don't know whether more input is coming. This is different from how + // the reference implementation does things. + cv_stack: ArrayVec, +} + +impl Hasher { + fn new_internal(key: &CVWords, flags: u8) -> Self { + Self { + key: *key, + chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), + cv_stack: ArrayVec::new(), + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. See + /// [`keyed_hash`]. + /// + /// [`keyed_hash`]: fn.keyed_hash.html + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. See + /// [`derive_key`]. The context string should be hardcoded, globally + /// unique, and application-specific. 
+    ///
+    /// [`derive_key`]: fn.derive_key.html
+    pub fn new_derive_key(context: &str) -> Self {
+        let context_key =
+            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+                .root_hash();
+        let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
+        Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
+    }
+
+    /// Reset the `Hasher` to its initial state.
+    ///
+    /// This is functionally the same as overwriting the `Hasher` with a new
+    /// one, using the same key or context string if any.
+    pub fn reset(&mut self) -> &mut Self {
+        self.chunk_state = ChunkState::new(
+            &self.key,
+            0,
+            self.chunk_state.flags,
+            self.chunk_state.platform,
+        );
+        self.cv_stack.clear();
+        self
+    }
+
+    // As described in push_cv() below, we do "lazy merging", delaying merges
+    // until right before the next CV is about to be added. This is different
+    // from the reference implementation. Another difference is that we aren't
+    // always merging 1 chunk at a time. Instead, each CV might represent any
+    // power-of-two number of chunks, as long as the smaller-above-larger stack
+    // order is maintained. Instead of the "count the trailing 0-bits"
+    // algorithm described in the spec, we use a "count the total number of
+    // 1-bits" variant that doesn't require us to retain the subtree size of
+    // the CV on top of the stack. The principle is the same: each CV that
+    // should remain in the stack is represented by a 1-bit in the total number
+    // of chunks (or bytes) so far.
+    fn merge_cv_stack(&mut self, total_len: u64) {
+        let post_merge_stack_len = total_len.count_ones() as usize;
+        while self.cv_stack.len() > post_merge_stack_len {
+            let right_child = self.cv_stack.pop().unwrap();
+            let left_child = self.cv_stack.pop().unwrap();
+            let parent_output = parent_node_output(
+                &left_child,
+                &right_child,
+                &self.key,
+                self.chunk_state.flags,
+                self.chunk_state.platform,
+            );
+            self.cv_stack.push(parent_output.chaining_value());
+        }
+    }
+
+    // In reference_impl.rs, we merge the new CV with existing CVs from the
+    // stack before pushing it. We can do that because we know more input is
+    // coming, so we know none of the merges are root.
+    //
+    // This setting is different. We want to feed as much input as possible to
+    // compress_subtree_wide(), without setting aside anything for the
+    // chunk_state. If the user gives us 64 KiB, we want to parallelize over
+    // all 64 KiB at once as a single subtree, if at all possible.
+    //
+    // This leads to two problems:
+    // 1) This 64 KiB input might be the only call that ever gets made to
+    //    update. In this case, the root node of the 64 KiB subtree would be
+    //    the root node of the whole tree, and it would need to be ROOT
+    //    finalized. We can't compress it until we know.
+    // 2) This 64 KiB input might complete a larger tree, whose root node is
+    //    similarly going to be the root of the whole tree. For example,
+    //    maybe we have 192 KiB (that is, 128 + 64) hashed so far. We can't
+    //    compress the node at the root of the 256 KiB subtree until we know
+    //    how to finalize it.
+    //
+    // The second problem is solved with "lazy merging". That is, when we're
+    // about to add a CV to the stack, we don't merge it with anything first,
+    // as the reference impl does. Instead we do merges using the *previous* CV
+    // that was added, which is sitting on top of the stack, and we put the new
+    // CV (unmerged) on top of the stack afterwards.
This guarantees that we
+    // never merge the root node until finalize().
+    //
+    // Solving the first problem requires an additional tool,
+    // compress_subtree_to_parent_node(). That function always returns the top
+    // *two* chaining values of the subtree it's compressing. We then do lazy
+    // merging with each of them separately, so that the second CV will always
+    // remain unmerged. (That also helps us support extendable output when
+    // we're hashing an input all-at-once.)
+    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
+        self.merge_cv_stack(chunk_counter);
+        self.cv_stack.push(*new_cv);
+    }
+
+    /// Add input bytes to the hash state. You can call this any number of
+    /// times.
+    ///
+    /// This method is always single-threaded. For multithreading support, see
+    /// [`update_rayon`](#method.update_rayon) below (enabled with the `rayon`
+    /// Cargo feature).
+    ///
+    /// Note that the degree of SIMD parallelism that `update` can use is
+    /// limited by the size of this input buffer. The 8 KiB buffer currently
+    /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but
+    /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to
+    /// leverage all currently supported SIMD instruction sets.
+    ///
+    /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html
+    pub fn update(&mut self, input: &[u8]) -> &mut Self {
+        self.update_with_join::<join::SerialJoin>(input)
+    }
+
+    /// Identical to [`update`](Hasher::update), but using Rayon-based
+    /// multithreading internally.
+    ///
+    /// This method is gated by the `rayon` Cargo feature, which is disabled by
+    /// default but enabled on [docs.rs](https://docs.rs).
+    ///
+    /// To get any performance benefit from multithreading, the input buffer
+    /// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
+    /// _slower_ than `update` for inputs under 128 KiB.
That threshold varies
+    /// quite a lot across different processors, and it's important to benchmark
+    /// your specific use case.
+    ///
+    /// Memory mapping an entire input file is a simple way to take advantage of
+    /// multithreading without needing to carefully tune your buffer size or
+    /// offload IO. However, on spinning disks where random access is expensive,
+    /// that approach can lead to disk thrashing and terrible IO performance.
+    /// Note that OS page caching can mask this problem, in which case it might
+    /// only appear for files larger than available RAM. Again, benchmarking
+    /// your specific use case is important.
+    #[cfg(feature = "rayon")]
+    pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self {
+        self.update_with_join::<join::RayonJoin>(input)
+    }
+
+    fn update_with_join<J: join::Join>(&mut self, mut input: &[u8]) -> &mut Self {
+        // If we have some partial chunk bytes in the internal chunk_state, we
+        // need to finish that chunk first.
+        if self.chunk_state.len() > 0 {
+            let want = CHUNK_LEN - self.chunk_state.len();
+            let take = cmp::min(want, input.len());
+            self.chunk_state.update(&input[..take]);
+            input = &input[take..];
+            if !input.is_empty() {
+                // We've filled the current chunk, and there's more input
+                // coming, so we know it's not the root and we can finalize it.
+                // Then we'll proceed to hashing whole chunks below.
+                debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
+                let chunk_cv = self.chunk_state.output().chaining_value();
+                self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+                self.chunk_state = ChunkState::new(
+                    &self.key,
+                    self.chunk_state.chunk_counter + 1,
+                    self.chunk_state.flags,
+                    self.chunk_state.platform,
+                );
+            } else {
+                return self;
+            }
+        }
+
+        // Now the chunk_state is clear, and we have more input. If there's
+        // more than a single chunk (so, definitely not the root chunk), hash
+        // the largest whole subtree we can, with the full benefits of SIMD and
+        // multithreading parallelism. Two restrictions:
+        // - The subtree has to be a power-of-2 number of chunks. Only subtrees
+        //   along the right edge can be incomplete, and we don't know where
+        //   the right edge is going to be until we get to finalize().
+        // - The subtree must evenly divide the total number of chunks up until
+        //   this point (if total is not 0). If the current incomplete subtree
+        //   is only waiting for 1 more chunk, we can't hash a subtree of 4
+        //   chunks. We have to complete the current subtree first.
+        // Because we might need to break up the input to form powers of 2, or
+        // to evenly divide what we already have, this part runs in a loop.
+        while input.len() > CHUNK_LEN {
+            debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
+            debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
+            let mut subtree_len = largest_power_of_two_leq(input.len());
+            let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
+            // Shrink the subtree_len until it evenly divides the count so far.
+            // We know that subtree_len itself is a power of 2, so we can use a
+            // bitmasking trick instead of an actual remainder operation. (Note
+            // that if the caller consistently passes power-of-2 inputs of the
+            // same size, as is hopefully typical, this loop condition will
+            // always fail, and subtree_len will always be the full length of
+            // the input.)
+            //
+            // An aside: We don't have to shrink subtree_len quite this much.
+            // For example, if count_so_far is 1, we could pass 2 chunks to
+            // compress_subtree_to_parent_node. Since we'll get 2 CVs back,
+            // we'll still get the right answer in the end, and we might get to
+            // use 2-way SIMD parallelism. The problem with this optimization,
+            // is that it gets us stuck always hashing 2 chunks. The total
+            // number of chunks will remain odd, and we'll never graduate to
+            // higher degrees of parallelism. See
+            // https://github.com/BLAKE3-team/BLAKE3/issues/69.
+            while (subtree_len - 1) as u64 & count_so_far != 0 {
+                subtree_len /= 2;
+            }
+            // The shrunken subtree_len might now be 1 chunk long. If so, hash
+            // that one chunk by itself. Otherwise, compress the subtree into a
+            // pair of CVs.
+            let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
+            if subtree_len <= CHUNK_LEN {
+                debug_assert_eq!(subtree_len, CHUNK_LEN);
+                self.push_cv(
+                    &ChunkState::new(
+                        &self.key,
+                        self.chunk_state.chunk_counter,
+                        self.chunk_state.flags,
+                        self.chunk_state.platform,
+                    )
+                    .update(&input[..subtree_len])
+                    .output()
+                    .chaining_value(),
+                    self.chunk_state.chunk_counter,
+                );
+            } else {
+                // This is the high-performance happy path, though getting here
+                // depends on the caller giving us a long enough input.
+                let cv_pair = compress_subtree_to_parent_node::<J>(
+                    &input[..subtree_len],
+                    &self.key,
+                    self.chunk_state.chunk_counter,
+                    self.chunk_state.flags,
+                    self.chunk_state.platform,
+                );
+                let left_cv = array_ref!(cv_pair, 0, 32);
+                let right_cv = array_ref!(cv_pair, 32, 32);
+                // Push the two CVs we received into the CV stack in order. Because
+                // the stack merges lazily, this guarantees we aren't merging the
+                // root.
+                self.push_cv(left_cv, self.chunk_state.chunk_counter);
+                self.push_cv(
+                    right_cv,
+                    self.chunk_state.chunk_counter + (subtree_chunks / 2),
+                );
+            }
+            self.chunk_state.chunk_counter += subtree_chunks;
+            input = &input[subtree_len..];
+        }
+
+        // What remains is 1 chunk or less. Add it to the chunk state.
+        debug_assert!(input.len() <= CHUNK_LEN);
+        if !input.is_empty() {
+            self.chunk_state.update(input);
+            // Having added some input to the chunk_state, we know what's in
+            // the CV stack won't become the root node, and we can do an extra
+            // merge. This simplifies finalize().
+            self.merge_cv_stack(self.chunk_state.chunk_counter);
+        }
+
+        self
+    }
+
+    fn final_output(&self) -> Output {
+        // If the current chunk is the only chunk, that makes it the root node
+        // also. Convert it directly into an Output. Otherwise, we need to
+        // merge subtrees below.
+        if self.cv_stack.is_empty() {
+            debug_assert_eq!(self.chunk_state.chunk_counter, 0);
+            return self.chunk_state.output();
+        }
+
+        // If there are any bytes in the ChunkState, finalize that chunk and
+        // merge its CV with everything in the CV stack. In that case, the work
+        // we did at the end of update() above guarantees that the stack
+        // doesn't contain any unmerged subtrees that need to be merged first.
+        // (This is important, because if there were two chunk hashes sitting
+        // on top of the stack, they would need to merge with each other, and
+        // merging a new chunk hash into them would be incorrect.)
+        //
+        // If there are no bytes in the ChunkState, we'll merge what's already
+        // in the stack. In this case it's fine if there are unmerged chunks on
+        // top, because we'll merge them with each other. Note that the case of
+        // the empty chunk is taken care of above.
+        let mut output: Output;
+        let mut num_cvs_remaining = self.cv_stack.len();
+        if self.chunk_state.len() > 0 {
+            debug_assert_eq!(
+                self.cv_stack.len(),
+                self.chunk_state.chunk_counter.count_ones() as usize,
+                "cv stack does not need a merge"
+            );
+            output = self.chunk_state.output();
+        } else {
+            debug_assert!(self.cv_stack.len() >= 2);
+            output = parent_node_output(
+                &self.cv_stack[num_cvs_remaining - 2],
+                &self.cv_stack[num_cvs_remaining - 1],
+                &self.key,
+                self.chunk_state.flags,
+                self.chunk_state.platform,
+            );
+            num_cvs_remaining -= 2;
+        }
+        while num_cvs_remaining > 0 {
+            output = parent_node_output(
+                &self.cv_stack[num_cvs_remaining - 1],
+                &output.chaining_value(),
+                &self.key,
+                self.chunk_state.flags,
+                self.chunk_state.platform,
+            );
+            num_cvs_remaining -= 1;
+        }
+        output
+    }
+
+    /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of
+    /// the input.
+    ///
+    /// This method is idempotent. Calling it twice will give the same result.
+    /// You can also add more input and finalize again.
+ pub fn finalize(&self) -> Hash { + self.final_output().root_hash() + } + + /// Finalize the hash state and return an [`OutputReader`], which can + /// supply any number of output bytes. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. + /// + /// [`OutputReader`]: struct.OutputReader.html + pub fn finalize_xof(&self) -> OutputReader { + OutputReader::new(self.final_output()) + } + + /// Return the total number of bytes hashed so far. + pub fn count(&self) -> u64 { + self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for Hasher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Hasher") + .field("flags", &self.chunk_state.flags) + .field("platform", &self.chunk_state.platform) + .finish() + } +} + +impl Default for Hasher { + #[inline] + fn default() -> Self { + Self::new() + } +} + +#[cfg(feature = "std")] +impl std::io::Write for Hasher { + /// This is equivalent to [`update`](#method.update). + #[inline] + fn write(&mut self, input: &[u8]) -> std::io::Result { + self.update(input); + Ok(input.len()) + } + + #[inline] + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +/// An incremental reader for extended output, returned by +/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). +/// +/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is +/// equivalent to truncating the default-length output. Note that this is a difference between +/// BLAKE2 and BLAKE3. +/// +/// # Security notes +/// +/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit +/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 +/// bits of collision resistance, for any N up to 256. 
Longer outputs don't provide any additional
+/// security.
+///
+/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or
+/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or
+/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree
+/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows
+/// both the message and the key (if any) can easily determine the offset of an extended output.
+/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block
+/// from an unknown position in the output stream to recover its block index. Callers with strong
+/// secret keys aren't affected in practice, but secret offsets are a [design
+/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
+#[derive(Clone)]
+pub struct OutputReader {
+    inner: Output,
+    position_within_block: u8,
+}
+
+impl OutputReader {
+    fn new(inner: Output) -> Self {
+        Self {
+            inner,
+            position_within_block: 0,
+        }
+    }
+
+    /// Fill a buffer with output bytes and advance the position of the
+    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
+    /// doesn't return a `Result`. Both methods always fill the entire buffer.
+    ///
+    /// Note that `OutputReader` doesn't buffer output bytes internally, so
+    /// calling `fill` repeatedly with a short-length or odd-length slice will
+    /// end up performing the same compression multiple times. If you're
+    /// reading output in a loop, prefer a slice length that's a multiple of
+    /// 64.
+    ///
+    /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
+    /// to extract more than that, for example by seeking near the end and
+    /// reading further, the behavior is unspecified.
+    ///
+    /// [`Read::read`]: #method.read
+    pub fn fill(&mut self, mut buf: &mut [u8]) {
+        while !buf.is_empty() {
+            let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
+            let output_bytes = &block[self.position_within_block as usize..];
+            let take = cmp::min(buf.len(), output_bytes.len());
+            buf[..take].copy_from_slice(&output_bytes[..take]);
+            buf = &mut buf[take..];
+            self.position_within_block += take as u8;
+            if self.position_within_block == BLOCK_LEN as u8 {
+                self.inner.counter += 1;
+                self.position_within_block = 0;
+            }
+        }
+    }
+
+    /// Return the current read position in the output stream. This is
+    /// equivalent to [`Seek::stream_position`], except that it doesn't return
+    /// a `Result`. The position of a new `OutputReader` starts at 0, and each
+    /// call to [`fill`] or [`Read::read`] moves the position forward by the
+    /// number of bytes read.
+    ///
+    /// [`Seek::stream_position`]: #method.stream_position
+    /// [`fill`]: #method.fill
+    /// [`Read::read`]: #method.read
+    pub fn position(&self) -> u64 {
+        self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
+    }
+
+    /// Seek to a new read position in the output stream. This is equivalent to
+    /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
+    /// return a `Result`.
+    ///
+    /// [`Seek::seek`]: #method.seek
+    /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
+    pub fn set_position(&mut self, position: u64) {
+        self.position_within_block = (position % BLOCK_LEN as u64) as u8;
+        self.inner.counter = position / BLOCK_LEN as u64;
+    }
+}
+
+// Don't derive(Debug), because the state may be secret.
+impl fmt::Debug for OutputReader {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("OutputReader")
+            .field("position", &self.position())
+            .finish()
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::io::Read for OutputReader {
+    #[inline]
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        self.fill(buf);
+        Ok(buf.len())
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::io::Seek for OutputReader {
+    fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
+        let max_position = u64::max_value() as i128;
+        let target_position: i128 = match pos {
+            std::io::SeekFrom::Start(x) => x as i128,
+            std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128,
+            std::io::SeekFrom::End(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "seek from end not supported",
+                ));
+            }
+        };
+        if target_position < 0 {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                "seek before start",
+            ));
+        }
+        self.set_position(cmp::min(target_position, max_position) as u64);
+        Ok(self.position())
+    }
+}
diff --git a/third-party/blake3/src/platform.rs b/third-party/blake3/src/platform.rs
new file mode 100644
index 00000000..00058b16
--- /dev/null
+++ b/third-party/blake3/src/platform.rs
@@ -0,0 +1,487 @@
+use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
+use arrayref::{array_mut_ref, array_ref};
+
+cfg_if::cfg_if! {
+    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        cfg_if::cfg_if! {
+            if #[cfg(blake3_avx512_ffi)] {
+                pub const MAX_SIMD_DEGREE: usize = 16;
+            } else {
+                pub const MAX_SIMD_DEGREE: usize = 8;
+            }
+        }
+    } else if #[cfg(blake3_neon)] {
+        pub const MAX_SIMD_DEGREE: usize = 4;
+    } else {
+        pub const MAX_SIMD_DEGREE: usize = 1;
+    }
+}
+
+// There are some places where we want a static size that's equal to the
+// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
+// allowed to use cmp::max, so we have to hardcode this additional constant
+// value.
Get rid of this once cmp::max is a const fn. +cfg_if::cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + cfg_if::cfg_if! { + if #[cfg(blake3_avx512_ffi)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 16; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 8; + } + } + } else if #[cfg(blake3_neon)] { + pub const MAX_SIMD_DEGREE_OR_2: usize = 4; + } else { + pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE2, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX512, + #[cfg(blake3_neon)] + NEON, +} + +impl Platform { + #[allow(unreachable_code)] + pub fn detect() -> Self { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + #[cfg(blake3_avx512_ffi)] + { + if avx512_detected() { + return Platform::AVX512; + } + } + if avx2_detected() { + return Platform::AVX2; + } + if sse41_detected() { + return Platform::SSE41; + } + if sse2_detected() { + return Platform::SSE2; + } + } + // We don't use dynamic feature detection for NEON. If the "neon" + // feature is on, NEON is assumed to be supported. 
+ #[cfg(blake3_neon)] + { + return Platform::NEON; + } + Platform::Portable + } + + pub fn simd_degree(&self) -> usize { + let degree = match self { + Platform::Portable => 1, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => 8, + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => 16, + #[cfg(blake3_neon)] + Platform::NEON => 4, + }; + debug_assert!(degree <= MAX_SIMD_DEGREE); + degree + } + + pub fn compress_in_place( + &self, + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_in_place(cv, block, block_len, counter, flags) + }, + // No NEON compress_in_place() implementation yet. 
+ #[cfg(blake3_neon)] + Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), + } + } + + pub fn compress_xof( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) -> [u8; 64] { + match self { + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + crate::sse41::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::compress_xof(cv, block, block_len, counter, flags) + }, + // No NEON compress_xof() implementation yet. + #[cfg(blake3_neon)] + Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), + } + } + + // IMPLEMENTATION NOTE + // =================== + // hash_many() applies two optimizations. The critically important + // optimization is the high-performance parallel SIMD hashing mode, + // described in detail in the spec. This more than doubles throughput per + // thread. Another optimization is keeping the state vectors transposed + // from block to block within a chunk. When state vectors are transposed + // after every block, there's a small but measurable performance loss. + // Compressing chunks with a dedicated loop avoids this. 
+ + pub fn hash_many( + &self, + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], + ) { + match self { + Platform::Portable => portable::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + crate::sse2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => unsafe { + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + crate::avx2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX512 => unsafe { + crate::avx512::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Assumed to be safe if the "neon" feature is on. + #[cfg(blake3_neon)] + Platform::NEON => unsafe { + crate::neon::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + } + } + + // Explicit platform constructors, for benchmarks. 
+ + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse2() -> Option { + if sse2_detected() { + Some(Self::SSE2) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } + + #[cfg(blake3_avx512_ffi)] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx512() -> Option { + if avx512_detected() { + Some(Self::AVX512) + } else { + None + } + } + + #[cfg(blake3_neon)] + pub fn neon() -> Option { + // Assumed to be safe if the "neon" feature is on. + Some(Self::NEON) + } +} + +// Note that AVX-512 is divided into multiple featuresets, and we use two of +// them, F and VL. +#[cfg(blake3_avx512_ffi)] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx512_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx512") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_avx2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "avx2")] + { + return true; + } + // Dynamic check, if std is enabled. 
+ #[cfg(feature = "std")] + { + if is_x86_feature_detected!("avx2") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn sse41_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse41") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse4.1")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse4.1") { + return true; + } + } + false +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn sse2_detected() -> bool { + // A testing-only short-circuit. + if cfg!(feature = "no_sse2") { + return false; + } + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse2")] + { + return true; + } + // Dynamic check, if std is enabled. + #[cfg(feature = "std")] + { + if is_x86_feature_detected!("sse2") { + return true; + } + } + false +} + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); + out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); + out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); + out[3] = 
u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); + out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); + out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); + out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); + out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); + out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); + out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); + out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); + out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); + out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); + out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); + out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); + out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); + *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); + *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); + *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); + *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); + *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); + *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); + *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); + *array_mut_ref!(out, 8 
* 4, 4) = words[8].to_le_bytes(); + *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); + *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); + *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); + *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); + *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); + *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); + *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); + out +} diff --git a/third-party/blake3/src/portable.rs b/third-party/blake3/src/portable.rs new file mode 100644 index 00000000..7af6828b --- /dev/null +++ b/third-party/blake3/src/portable.rs @@ -0,0 +1,198 @@ +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref}; + +#[inline(always)] +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u32; 16] { + let block_words = crate::platform::words_from_le_bytes_64(block); + + let mut state = [ + cv[0], + cv[1], + cv[2], + cv[3], + cv[4], + cv[5], + cv[6], + cv[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ]; + + round(&mut state, &block_words, 0); + round(&mut state, &block_words, 1); + round(&mut state, &block_words, 2); + round(&mut state, &block_words, 3); + round(&mut state, &block_words, 4); + round(&mut state, &block_words, 5); + round(&mut state, &block_words, 6); + + state +} + +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let state = compress_pre(cv, block, block_len, counter, flags); + + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut state = compress_pre(cv, block, block_len, counter, flags); + state[0] ^= state[8]; + state[1] ^= state[9]; + state[2] ^= state[10]; + state[3] ^= state[11]; + state[4] ^= state[12]; + state[5] ^= state[13]; + state[6] ^= state[14]; + state[7] ^= state[15]; + state[8] ^= cv[0]; + state[9] ^= cv[1]; + state[10] ^= cv[2]; + state[11] ^= cv[3]; + state[12] ^= cv[4]; + state[13] ^= cv[5]; + state[14] ^= cv[6]; + state[15] ^= cv[7]; + 
crate::platform::le_bytes_from_words_64(&state) +} + +pub fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = crate::platform::le_bytes_from_words_32(&cv); +} + +pub fn hash_many( + inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +pub mod test { + use super::*; + + // This is basically testing the portable implementation against itself, + // but it also checks that compress_in_place and compress_xof are + // consistent. And there are tests against the reference implementation and + // against hardcoded test vectors elsewhere. + #[test] + fn test_compress() { + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + // Ditto. 
+ #[test] + fn test_hash_many() { + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/rust_avx2.rs b/third-party/blake3/src/rust_avx2.rs new file mode 100644 index 00000000..a37a4caa --- /dev/null +++ b/third-party/blake3/src/rust_avx2.rs @@ -0,0 +1,474 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; +use arrayref::{array_mut_ref, mut_array_refs}; + +pub const DEGREE: usize = 8; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m256i { + _mm256_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { + _mm256_setr_epi32( + a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, + ) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. 
+ +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] 
= add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn interleave128(a: 
__m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. This third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. + let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33. + let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); + let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); + let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); + let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); + + vecs[0] = abcdefgh_0; + vecs[1] = abcdefgh_1; + vecs[2] = abcdefgh_2; + vecs[3] = abcdefgh_3; + vecs[4] = abcdefgh_4; + vecs[5] = abcdefgh_5; + vecs[6] = abcdefgh_6; + vecs[7] = abcdefgh_7; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set8( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + 
counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), + ), + set8( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), + ), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash8( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&mut h_vecs); + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash8( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + crate::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::avx2_detected() { + return; + } + + #[target_feature(enable = "avx2")] + unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_hash_many() { + if !crate::platform::avx2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/rust_sse2.rs b/third-party/blake3/src/rust_sse2.rs new file mode 100644 index 00000000..bd2be69f --- /dev/null +++ b/third-party/blake3/src/rust_sse2.rs @@ -0,0 +1,775 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. 
+
+/// Rotates every 32-bit lane of `a` right by 16 bits: the logical right shift
+/// and the complementary left shift are ORed together.
+#[inline(always)]
+unsafe fn rot16(a: __m128i) -> __m128i {
+    let shifted_down = _mm_srli_epi32(a, 16);
+    let wrapped_bits = _mm_slli_epi32(a, 32 - 16);
+    _mm_or_si128(shifted_down, wrapped_bits)
+}
+
+/// Rotates every 32-bit lane of `a` right by 12 bits.
+#[inline(always)]
+unsafe fn rot12(a: __m128i) -> __m128i {
+    let shifted_down = _mm_srli_epi32(a, 12);
+    let wrapped_bits = _mm_slli_epi32(a, 32 - 12);
+    _mm_or_si128(shifted_down, wrapped_bits)
+}
+
+/// Rotates every 32-bit lane of `a` right by 8 bits.
+#[inline(always)]
+unsafe fn rot8(a: __m128i) -> __m128i {
+    let shifted_down = _mm_srli_epi32(a, 8);
+    let wrapped_bits = _mm_slli_epi32(a, 32 - 8);
+    _mm_or_si128(shifted_down, wrapped_bits)
+}
+
+/// Rotates every 32-bit lane of `a` right by 7 bits.
+#[inline(always)]
+unsafe fn rot7(a: __m128i) -> __m128i {
+    let shifted_down = _mm_srli_epi32(a, 7);
+    let wrapped_bits = _mm_slli_epi32(a, 32 - 7);
+    _mm_or_si128(shifted_down, wrapped_bits)
+}
+
+/// First mixing half-step over four lanes at once: adds `m` and `row1` into
+/// `row0`, xors `row0` into `row3` and rotates it by 16, adds `row3` into
+/// `row2`, then xors `row2` into `row1` and rotates it by 12.
+#[inline(always)]
+unsafe fn g1(
+    row0: &mut __m128i,
+    row1: &mut __m128i,
+    row2: &mut __m128i,
+    row3: &mut __m128i,
+    m: __m128i,
+) {
+    let with_message = add(*row0, m);
+    *row0 = add(with_message, *row1);
+    *row3 = rot16(xor(*row3, *row0));
+    *row2 = add(*row2, *row3);
+    *row1 = rot12(xor(*row1, *row2));
+}
+
+/// Second mixing half-step: the same sequence as `g1`, but with rotations by
+/// 8 (on `row3`) and 7 (on `row1`).
+#[inline(always)]
+unsafe fn g2(
+    row0: &mut __m128i,
+    row1: &mut __m128i,
+    row2: &mut __m128i,
+    row3: &mut __m128i,
+    m: __m128i,
+) {
+    let with_message = add(*row0, m);
+    *row0 = add(with_message, *row1);
+    *row3 = rot8(xor(*row3, *row0));
+    *row2 = add(*row2, *row3);
+    *row1 = rot7(xor(*row1, *row2));
+}
+
+// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
+// Packs four selectors into one immediate: `$z` is shifted to bit 6, `$y` to
+// bit 4, `$x` to bit 2, and `$w` stays at bit 0.
+macro_rules! _MM_SHUFFLE {
+    ($z:expr, $y:expr, $x:expr, $w:expr) => {
+        ($z << 6) | ($y << 4) | ($x << 2) | $w
+    };
+}
+
+// Integer-vector wrapper around `_mm_shuffle_ps`: casts `$a` and `$b` to float
+// vectors, shuffles them with immediate `$c`, and casts the result back.
+macro_rules! shuffle2 {
+    ($a:expr, $b:expr, $c:expr) => {
+        _mm_castps_si128(_mm_shuffle_ps(
+            _mm_castsi128_ps($a),
+            _mm_castsi128_ps($b),
+            $c,
+        ))
+    };
+}
+
+// Note the optimization here of leaving row1 as the unrotated row, rather than
+// row0. All the message loads below are adjusted to compensate for this.
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + let mut mask = _mm_set1_epi16(imm8 as i16); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = 
shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, 
counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], 
m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. 
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse2")] +unsafe fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse2_detected() { + return; + } + + #[target_feature(enable = "sse2")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse2_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/rust_sse41.rs b/third-party/blake3/src/rust_sse41.rs new file mode 100644 index 00000000..1ebadc48 --- /dev/null +++ b/third-party/blake3/src/rust_sse41.rs @@ -0,0 +1,766 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; +use arrayref::{array_mut_ref, array_ref, mut_array_refs}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. 
+// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. 
See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = 
_mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, 
row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, 
mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = 
rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. 
+ let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. + let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for i in 0..DEGREE { + _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + transpose_vecs(squares.2); + transpose_vecs(squares.3); + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if 
increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); + transpose_vecs(squares.0); + transpose_vecs(squares.1); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. 
+ storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + array_ref!(slice, 0, BLOCK_LEN), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + array_mut_ref!(out, 0, DEGREE * OUT_LEN), + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + out = &mut out[DEGREE * OUT_LEN..]; + } + for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { + hash1( + input, + key, + counter, + flags, + flags_start, + flags_end, + array_mut_ref!(output, 0, OUT_LEN), + ); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transpose() { + if !crate::platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !crate::platform::sse41_detected() { + return; + } + crate::test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/third-party/blake3/src/test.rs b/third-party/blake3/src/test.rs new file mode 100644 index 00000000..60bbe8cc --- /dev/null +++ b/third-party/blake3/src/test.rs @@ -0,0 +1,630 @@ +use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use arrayref::array_ref; +use arrayvec::ArrayVec; +use core::usize; +use rand::prelude::*; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: CVWords = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largest prime number less than 256. 
This makes it +// unlikely to swapping any two adjacent input blocks or chunks will give the +// same answer. +pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +type CompressInPlaceFn = + unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); + +type CompressXofFn = unsafe fn( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64]; + +// A shared helper function for platform-specific tests. +pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { + let initial_state = TEST_KEY_WORDS; + let block_len: u8 = 61; + let mut block = [0; BLOCK_LEN]; + paint_test_input(&mut block[..block_len as usize]); + // Use a counter with set bits in both 32-bit words. + let counter = (5u64 << 32) + 6; + let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; + + let portable_out = + crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); + + let mut test_state = initial_state; + unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; + let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); + let test_xof = + unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; + + assert_eq!(&portable_out[..32], &test_state_bytes[..]); + assert_eq!(&portable_out[..], &test_xof[..]); +} + +type HashManyFn = unsafe fn( + inputs: &[&A], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +); + +// A shared helper function for platform-specific tests. +pub fn test_hash_many_fn( + hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, + hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, +) { + // Test a few different initial counter values. + // - 0: The base case. 
+ // - u32::MAX: The low word of the counter overflows for all inputs except the first. + // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR + // when you're supposed to ANDNOT... + let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; + for counter in initial_counters { + #[cfg(feature = "std")] + dbg!(counter); + + // 31 (16 + 8 + 4 + 2 + 1) inputs + const NUM_INPUTS: usize = 31; + let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; + crate::test::paint_test_input(&mut input_buf); + + // First hash chunks. + let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); + } + let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &chunks, + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut portable_chunks_out, + ); + + let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_chunks_fn( + &chunks[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::Yes, + crate::KEYED_HASH, + crate::CHUNK_START, + crate::CHUNK_END, + &mut test_chunks_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], + &test_chunks_out[n * OUT_LEN..][..OUT_LEN] + ); + } + + // Then hash parents. 
+ let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); + for i in 0..NUM_INPUTS { + parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); + } + let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; + crate::portable::hash_many( + &parents, + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut portable_parents_out, + ); + + let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; + unsafe { + hash_many_parents_fn( + &parents[..], + &TEST_KEY_WORDS, + counter, + IncrementCounter::No, + crate::KEYED_HASH | crate::PARENT, + 0, + 0, + &mut test_parents_out, + ); + } + for n in 0..NUM_INPUTS { + #[cfg(feature = "std")] + dbg!(n); + assert_eq!( + &portable_parents_out[n * OUT_LEN..][..OUT_LEN], + &test_parents_out[n * OUT_LEN..][..OUT_LEN] + ); + } + } +} + +#[test] +fn test_key_bytes_equal_key_words() { + assert_eq!( + TEST_KEY_WORDS, + crate::platform::words_from_le_bytes_32(&TEST_KEY), + ); +} + +#[test] +fn test_reference_impl_size() { + // Because the Rust compiler optimizes struct layout, it's possible that + // some future version of the compiler will produce a different size. If + // that happens, we can either disable this test, or test for multiple + // expected values. For now, the purpose of this test is to make sure we + // notice if that happens. + assert_eq!(1880, core::mem::size_of::()); +} + +#[test] +fn test_counter_words() { + let counter: u64 = (1 << 32) + 2; + assert_eq!(crate::counter_low(counter), 2); + assert_eq!(crate::counter_high(counter), 1); +} + +#[test] +fn test_largest_power_of_two_leq() { + let input_output = &[ + // The zero case is nonsensical, but it does work. 
+ (0, 1), + (1, 1), + (2, 2), + (3, 2), + (4, 4), + (5, 4), + (6, 4), + (7, 4), + (8, 8), + // the largest possible usize + (usize::MAX, (usize::MAX >> 1) + 1), + ]; + for &(input, output) in input_output { + assert_eq!( + output, + crate::largest_power_of_two_leq(input), + "wrong output for n={}", + input + ); + } +} + +#[test] +fn test_left_len() { + let input_output = &[ + (CHUNK_LEN + 1, CHUNK_LEN), + (2 * CHUNK_LEN - 1, CHUNK_LEN), + (2 * CHUNK_LEN, CHUNK_LEN), + (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), + (4 * CHUNK_LEN, 2 * CHUNK_LEN), + (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), + ]; + for &(input, output) in input_output { + assert_eq!(crate::left_len(input), output); + } +} + +#[test] +fn test_compare_reference_impl() { + const OUT: usize = 303; // more than 64, not a multiple of 4 + let mut input_buf = [0; TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + for &case in TEST_CASES { + let input = &input_buf[..case]; + #[cfg(feature = "std")] + dbg!(case); + + // regular + { + let mut reference_hasher = reference_impl::Hasher::new(); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::hash(input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // incremental + let mut hasher = crate::Hasher::new(); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new(); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + + // keyed + { + let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + 
reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::keyed_hash(&TEST_KEY, input); + assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + // incremental + let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), test_out); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = crate::derive_key(context, input); + assert_eq!(test_out, expected_out[..32]); + // incremental + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + // incremental (rayon) + #[cfg(feature = "rayon")] + { + let mut hasher = crate::Hasher::new_derive_key(context); + hasher.update_rayon(input); + assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + } + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + } +} + +fn reference_hash(input: &[u8]) -> crate::Hash { + let mut hasher = 
reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; 32]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. + let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = crate::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let expected = reference_hash(total_input); + assert_eq!(expected, test_hasher.finalize()); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. 
+ let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + let mut hasher = crate::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. + for _ in 0..3 { + let input_len = rng.gen_range(0..(INPUT_MAX + 1)); + #[cfg(feature = "std")] + dbg!(input_len); + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + assert_eq!(expected, hasher.finalize()); + } +} + +#[test] +fn test_xof_seek() { + let mut out = [0; 533]; + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + hasher.finalize_xof().fill(&mut out); + assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); + + let mut reader = hasher.finalize_xof(); + reader.set_position(303); + let mut out2 = [0; 102]; + reader.fill(&mut out2); + assert_eq!(&out[303..][..102], &out2[..]); + + #[cfg(feature = "std")] + { + use std::io::prelude::*; + let mut reader = hasher.finalize_xof(); + reader.seek(std::io::SeekFrom::Start(303)).unwrap(); + let mut out3 = Vec::new(); + reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); + assert_eq!(&out[303..][..102], &out3[..]); + + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 + ); + reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + ); + let mut out4 = [0; 17]; + assert_eq!(reader.read(&mut out4).unwrap(), 17); + assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); + assert_eq!( + reader.seek(std::io::SeekFrom::Current(0)).unwrap(), + 303 + 102 - 5 + 17 + ); + assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); + assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); + } +} + +#[test] +fn test_msg_schedule_permutation() { + let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; 
+ + let mut generated = [[0; 16]; 7]; + generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + + for round in 1..7 { + for i in 0..16 { + generated[round][i] = generated[round - 1][permutation[i]]; + } + } + + assert_eq!(generated, crate::MSG_SCHEDULE); +} + +#[test] +fn test_reset() { + let mut hasher = crate::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); + + let key = &[99; crate::KEY_LEN]; + let mut keyed_hasher = crate::Hasher::new_keyed(key); + keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); + keyed_hasher.reset(); + keyed_hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!( + keyed_hasher.finalize(), + crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), + ); + + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut kdf = crate::Hasher::new_derive_key(context); + kdf.update(&[42; 3 * CHUNK_LEN + 7]); + kdf.reset(); + kdf.update(&[42; CHUNK_LEN + 3]); + let expected = crate::derive_key(context, &[42; CHUNK_LEN + 3]); + assert_eq!(kdf.finalize(), expected); +} + +#[test] +fn test_hex_encoding_decoding() { + let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + let digest = hasher.finalize(); + assert_eq!(digest.to_hex().as_str(), digest_str); + #[cfg(feature = "std")] + assert_eq!(digest.to_string(), digest_str); + + // Test round trip + let digest = crate::Hash::from_hex(digest_str).unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test uppercase + let digest = crate::Hash::from_hex(digest_str.to_uppercase()).unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test string parsing via FromStr + let digest: crate::Hash = digest_str.parse().unwrap(); + assert_eq!(digest.to_hex().as_str(), digest_str); + + // Test errors + let bad_len = "04e0bb39f30b1"; + let _result = 
crate::Hash::from_hex(bad_len).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "expected 64 hex bytes, received 13"); + + let bad_char = "Z4e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; + let _result = crate::Hash::from_hex(bad_char).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "invalid hex character: 'Z'"); + + let _result = crate::Hash::from_hex([128; 64]).unwrap_err(); + #[cfg(feature = "std")] + assert_eq!(_result.to_string(), "invalid hex character: 0x80"); +} + +// This test is a mimized failure case for the Windows SSE2 bug described in +// https://github.com/BLAKE3-team/BLAKE3/issues/206. +// +// Before that issue was fixed, this test would fail on Windows in the following configuration: +// +// cargo test --features=no_avx512,no_avx2,no_sse41 --release +// +// Bugs like this one (stomping on a caller's register) are very sensitive to the details of +// surrounding code, so it's not especially likely that this test will catch another bug (or even +// the same bug) in the future. Still, there's no harm in keeping it. +#[test] +fn test_issue_206_windows_sse2() { + // This stupid loop has to be here to trigger the bug. I don't know why. + for _ in &[0] { + // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also + // doesn't repro with an all-zero input. + let input = &[0xff; 65]; + let expected_hash = [ + 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, + 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, + ]; + + // This throwaway call has to be here to trigger the bug. + crate::Hasher::new().update(input); + + // This assert fails when the bug is triggered. 
+ assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash); + } +} + +#[test] +fn test_hash_conversions() { + let bytes1 = [42; 32]; + let hash1: crate::Hash = bytes1.into(); + let bytes2: [u8; 32] = hash1.into(); + assert_eq!(bytes1, bytes2); + + let bytes3 = *hash1.as_bytes(); + assert_eq!(bytes1, bytes3); + + let hash2 = crate::Hash::from_bytes(bytes1); + assert_eq!(hash1, hash2); + + let hex = hash1.to_hex(); + let hash3 = crate::Hash::from_hex(hex.as_bytes()).unwrap(); + assert_eq!(hash1, hash3); +} + +#[test] +const fn test_hash_const_conversions() { + let bytes = [42; 32]; + let hash = crate::Hash::from_bytes(bytes); + _ = hash.as_bytes(); +} diff --git a/third-party/blake3/src/traits.rs b/third-party/blake3/src/traits.rs new file mode 100644 index 00000000..70b1c068 --- /dev/null +++ b/third-party/blake3/src/traits.rs @@ -0,0 +1,227 @@ +//! Implementations of commonly used traits like `Digest` and `Mac` from the +//! [`digest`](https://crates.io/crates/digest) crate. 
+ +pub use digest; + +use crate::{Hasher, OutputReader}; +use digest::crypto_common; +use digest::generic_array::{typenum::U32, typenum::U64, GenericArray}; + +impl digest::HashMarker for Hasher {} + +impl digest::Update for Hasher { + #[inline] + fn update(&mut self, data: &[u8]) { + self.update(data); + } +} + +impl digest::Reset for Hasher { + #[inline] + fn reset(&mut self) { + self.reset(); // the inherent method + } +} + +impl digest::OutputSizeUser for Hasher { + type OutputSize = U32; +} + +impl digest::FixedOutput for Hasher { + #[inline] + fn finalize_into(self, out: &mut GenericArray) { + out.copy_from_slice(self.finalize().as_bytes()); + } +} + +impl digest::FixedOutputReset for Hasher { + #[inline] + fn finalize_into_reset(&mut self, out: &mut GenericArray) { + out.copy_from_slice(self.finalize().as_bytes()); + self.reset(); + } +} + +impl digest::ExtendableOutput for Hasher { + type Reader = OutputReader; + + #[inline] + fn finalize_xof(self) -> Self::Reader { + Hasher::finalize_xof(&self) + } +} + +impl digest::ExtendableOutputReset for Hasher { + #[inline] + fn finalize_xof_reset(&mut self) -> Self::Reader { + let reader = Hasher::finalize_xof(self); + self.reset(); + reader + } +} + +impl digest::XofReader for OutputReader { + #[inline] + fn read(&mut self, buffer: &mut [u8]) { + self.fill(buffer); + } +} + +impl crypto_common::KeySizeUser for Hasher { + type KeySize = U32; +} + +impl crypto_common::BlockSizeUser for Hasher { + type BlockSize = U64; +} + +impl digest::MacMarker for Hasher {} + +impl digest::KeyInit for Hasher { + #[inline] + fn new(key: &digest::Key) -> Self { + let key_bytes: [u8; 32] = (*key).into(); + Hasher::new_keyed(&key_bytes) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_digest_traits() { + // Inherent methods. 
+ let mut hasher1 = crate::Hasher::new(); + hasher1.update(b"foo"); + hasher1.update(b"bar"); + hasher1.update(b"baz"); + let out1 = hasher1.finalize(); + let mut xof1 = [0; 301]; + hasher1.finalize_xof().fill(&mut xof1); + assert_eq!(out1.as_bytes(), &xof1[..32]); + + // Trait implementations. + let mut hasher2: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher2, b"xxx"); + digest::Digest::reset(&mut hasher2); + digest::Digest::update(&mut hasher2, b"foo"); + digest::Digest::update(&mut hasher2, b"bar"); + digest::Digest::update(&mut hasher2, b"baz"); + let out2 = digest::Digest::finalize(hasher2.clone()); + let mut xof2 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), + &mut xof2, + ); + assert_eq!(out1.as_bytes(), &out2[..]); + assert_eq!(xof1[..], xof2[..]); + + // Again with the resetting variants. + let mut hasher3: crate::Hasher = digest::Digest::new(); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out3 = [0; 32]; + digest::FixedOutputReset::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out3), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut out4 = [0; 32]; + digest::FixedOutputReset::finalize_into_reset( + &mut hasher3, + GenericArray::from_mut_slice(&mut out4), + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof3 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), + &mut xof3, + ); + digest::Digest::update(&mut hasher3, b"foobarbaz"); + let mut xof4 = [0; 301]; + digest::XofReader::read( + &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), + &mut xof4, + ); + assert_eq!(out1.as_bytes(), &out3[..]); + assert_eq!(out1.as_bytes(), &out4[..]); + assert_eq!(xof1[..], xof3[..]); + assert_eq!(xof1[..], xof4[..]); + } + + #[test] + fn test_mac_trait() { + // Inherent methods. 
+ let key = b"some super secret key bytes fooo"; + let mut hasher1 = crate::Hasher::new_keyed(key); + hasher1.update(b"foo"); + hasher1.update(b"bar"); + hasher1.update(b"baz"); + let out1 = hasher1.finalize(); + + // Trait implementation. + let generic_key = (*key).into(); + let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key); + digest::Mac::update(&mut hasher2, b"xxx"); + digest::Mac::reset(&mut hasher2); + digest::Mac::update(&mut hasher2, b"foo"); + digest::Mac::update(&mut hasher2, b"bar"); + digest::Mac::update(&mut hasher2, b"baz"); + let out2 = digest::Mac::finalize(hasher2); + assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); + } + + fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] { + // See https://en.wikipedia.org/wiki/HMAC. + let key_hash; + let key_prime = if key.len() <= 64 { + key + } else { + key_hash = *crate::hash(key).as_bytes(); + &key_hash + }; + let mut ipad = [0x36; 64]; + let mut opad = [0x5c; 64]; + for i in 0..key_prime.len() { + ipad[i] ^= key_prime[i]; + opad[i] ^= key_prime[i]; + } + let mut inner_state = crate::Hasher::new(); + inner_state.update(&ipad); + inner_state.update(input); + let mut outer_state = crate::Hasher::new(); + outer_state.update(&opad); + outer_state.update(inner_state.finalize().as_bytes()); + outer_state.finalize().into() + } + + #[test] + fn test_hmac_compatibility() { + use hmac::{Mac, SimpleHmac}; + + // Test a short key. + let mut x = SimpleHmac::<crate::Hasher>::new_from_slice(b"key").unwrap(); + hmac::digest::Update::update(&mut x, b"data"); + let output = x.finalize().into_bytes(); + assert_ne!(output.len(), 0); + let expected = expected_hmac_blake3(b"key", b"data"); + assert_eq!(expected, output.as_ref()); + + // Test a range of key and data lengths, particularly to exercise the long-key logic.
+ let mut input_bytes = [0; crate::test::TEST_CASES_MAX]; + crate::test::paint_test_input(&mut input_bytes); + for &input_len in crate::test::TEST_CASES { + #[cfg(feature = "std")] + dbg!(input_len); + let input = &input_bytes[..input_len]; + + let mut x = SimpleHmac::<crate::Hasher>::new_from_slice(input).unwrap(); + hmac::digest::Update::update(&mut x, input); + let output = x.finalize().into_bytes(); + assert_ne!(output.len(), 0); + + let expected = expected_hmac_blake3(input, input); + assert_eq!(expected, output.as_ref()); + } + } +} diff --git a/third-party/blake3/test_vectors/Cargo.toml b/third-party/blake3/test_vectors/Cargo.toml new file mode 100644 index 00000000..87a9eba1 --- /dev/null +++ b/third-party/blake3/test_vectors/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "test_vectors" +version = "0.0.0" +edition = "2021" + +[features] +neon = ["blake3/neon"] +prefer_intrinsics = ["blake3/prefer_intrinsics"] +pure = ["blake3/pure"] + +[dependencies] +# If you ever change these path dependencies, you'll probably need to update +# cross_test.sh, or CI will break. I'm sorry >.< +blake3 = { path = "../" } +hex = "0.4.0" +reference_impl = { path = "../reference_impl" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/third-party/blake3/test_vectors/cross_test.sh b/third-party/blake3/test_vectors/cross_test.sh new file mode 100755 index 00000000..c4d280c9 --- /dev/null +++ b/third-party/blake3/test_vectors/cross_test.sh @@ -0,0 +1,25 @@ +#! /usr/bin/env bash + +# This hacky script works around the fact that `cross test` does not support +# path dependencies. (It uses a docker shared folder to let the guest access +# project files, so parent directories aren't available.) Solve this problem by +# copying the entire project to a temp dir and rearranging paths to put +# "blake3" and "reference_impl" underneath "test_vectors", so that everything +# is accessible.
Hopefully this will just run on CI forever and no one will +# ever read this and discover my deep shame. + +set -e -u -o pipefail + +project_root="$(realpath "$(dirname "$BASH_SOURCE")/..")" +tmpdir="$(mktemp -d)" +echo "Running cross tests in $tmpdir" +cd "$tmpdir" +git clone "$project_root" blake3 +mv blake3/test_vectors . +mv blake3/reference_impl test_vectors +mv blake3 test_vectors +cd test_vectors +sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml +sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml + +cross test "$@" diff --git a/third-party/blake3/test_vectors/src/bin/generate.rs b/third-party/blake3/test_vectors/src/bin/generate.rs new file mode 100644 index 00000000..3c618c1d --- /dev/null +++ b/third-party/blake3/test_vectors/src/bin/generate.rs @@ -0,0 +1,4 @@ +fn main() { + // The trailing newline is included. + print!("{}", test_vectors::generate_json()); +} diff --git a/third-party/blake3/test_vectors/src/lib.rs b/third-party/blake3/test_vectors/src/lib.rs new file mode 100644 index 00000000..6a4c7984 --- /dev/null +++ b/third-party/blake3/test_vectors/src/lib.rs @@ -0,0 +1,352 @@ +use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; +use serde::{Deserialize, Serialize}; + +// A non-multiple of 4 is important, since one possible bug is to fail to emit +// partial words. 
+pub const OUTPUT_LEN: usize = 2 * BLOCK_LEN + 3; + +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_KEY: &[u8; blake3::KEY_LEN] = b"whats the Elvish word for friend"; +pub const TEST_CONTEXT: &str = "BLAKE3 2019-12-27 16:29:52 test vectors context"; + +const COMMENT: &str = r#" +Each test is an input length and three outputs, one for each of the hash, +keyed_hash, and derive_key modes. The input in each case is filled with a +repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. +The key used with keyed_hash is the 32-byte ASCII string "whats the Elvish word +for friend", also given in the `key` field below. The context string used with +derive_key is the ASCII string "BLAKE3 2019-12-27 16:29:52 test vectors +context", also given in the `context_string` field below. Outputs are encoded +as hexadecimal. Each case is an extended output, and implementations should +also check that the first 32 bytes match their default-length output. +"#; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largest prime number less than 256. This makes it +// unlikely that swapping any two adjacent input blocks or chunks will give the +// same answer.
+pub fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Cases { + pub _comment: String, + pub key: String, + pub context_string: String, + pub cases: Vec<Case>, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct Case { + pub input_len: usize, + pub hash: String, + pub keyed_hash: String, + pub derive_key: String, +} + +pub fn generate_json() -> String { + let mut cases = Vec::new(); + for &input_len in TEST_CASES { + let mut input = vec![0; input_len]; + paint_test_input(&mut input); + + let mut hash_out = [0; OUTPUT_LEN]; + blake3::Hasher::new() + .update(&input) + .finalize_xof() + .fill(&mut hash_out); + + let mut keyed_hash_out = [0; OUTPUT_LEN]; + blake3::Hasher::new_keyed(TEST_KEY) + .update(&input) + .finalize_xof() + .fill(&mut keyed_hash_out); + + let mut derive_key_out = [0; OUTPUT_LEN]; + blake3::Hasher::new_derive_key(TEST_CONTEXT) + .update(&input) + .finalize_xof() + .fill(&mut derive_key_out); + + cases.push(Case { + input_len, + hash: hex::encode(&hash_out[..]), + keyed_hash: hex::encode(&keyed_hash_out[..]), + derive_key: hex::encode(&derive_key_out[..]), + }); + } + + let mut json = serde_json::to_string_pretty(&Cases { + _comment: COMMENT.trim().replace("\n", " "), + key: std::str::from_utf8(TEST_KEY).unwrap().to_string(), + context_string: TEST_CONTEXT.to_string(), + cases, + }) + .unwrap(); + + // Add a trailing newline.
+ json.push('\n'); + json +} + +pub fn read_test_vectors_file() -> String { + let test_vectors_file_path = "./test_vectors.json"; + std::fs::read_to_string(test_vectors_file_path).expect("failed to read test_vectors.json") +} + +pub fn parse_test_cases() -> Cases { + let json = read_test_vectors_file(); + serde_json::from_str(&json).expect("failed to parse test_vectors.json") +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_reference_impl_all_at_once( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_hash, &out[..]); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = reference_impl::Hasher::new_keyed(key); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); + hasher.update(input); + hasher.finalize(&mut out); + assert_eq!(expected_derive_key, &out[..]); + } + + fn test_reference_impl_one_at_a_time( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = reference_impl::Hasher::new(); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_hash, &out[..]); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = reference_impl::Hasher::new_keyed(key); + for &b in input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); + for &b in 
input { + hasher.update(&[b]); + } + hasher.finalize(&mut out); + assert_eq!(expected_derive_key, &out[..]); + } + + fn test_incremental_all_at_once( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = blake3::Hasher::new(); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_hash, &out[..]); + assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = blake3::Hasher::new_keyed(key); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); + hasher.update(input); + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_derive_key, &out[..]); + assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); + } + + fn test_incremental_one_at_a_time( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + let mut out = vec![0; expected_hash.len()]; + let mut hasher = blake3::Hasher::new(); + for i in 0..input.len() { + hasher.update(&[input[i]]); + assert_eq!(i as u64 + 1, hasher.count()); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_hash, &out[..]); + assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_keyed_hash.len()]; + let mut hasher = blake3::Hasher::new_keyed(key); + for i in 0..input.len() { + hasher.update(&[input[i]]); + assert_eq!(i as u64 + 1, hasher.count()); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_keyed_hash, &out[..]); + assert_eq!(&expected_keyed_hash[..32], 
hasher.finalize().as_bytes()); + + let mut out = vec![0; expected_derive_key.len()]; + let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); + for i in 0..input.len() { + hasher.update(&[input[i]]); + assert_eq!(i as u64 + 1, hasher.count()); + } + hasher.finalize_xof().fill(&mut out); + assert_eq!(expected_derive_key, &out[..]); + assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); + } + + fn test_recursive( + key: &[u8; blake3::KEY_LEN], + input: &[u8], + expected_hash: &[u8], + expected_keyed_hash: &[u8], + expected_derive_key: &[u8], + ) { + assert_eq!(&expected_hash[..32], blake3::hash(input).as_bytes()); + assert_eq!( + &expected_keyed_hash[..32], + blake3::keyed_hash(key, input).as_bytes(), + ); + assert_eq!( + expected_derive_key[..32], + blake3::derive_key(TEST_CONTEXT, input) + ); + } + + #[test] + fn run_test_vectors() { + let cases = parse_test_cases(); + let key: &[u8; blake3::KEY_LEN] = cases.key.as_bytes().try_into().unwrap(); + for case in &cases.cases { + dbg!(case.input_len); + let mut input = vec![0; case.input_len]; + paint_test_input(&mut input); + let expected_hash = hex::decode(&case.hash).unwrap(); + let expected_keyed_hash = hex::decode(&case.keyed_hash).unwrap(); + let expected_derive_key = hex::decode(&case.derive_key).unwrap(); + + test_reference_impl_all_at_once( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_reference_impl_one_at_a_time( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_incremental_all_at_once( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_incremental_one_at_a_time( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + + test_recursive( + key, + &input, + &expected_hash, + &expected_keyed_hash, + &expected_derive_key, + ); + } + } + + #[test] + fn test_checked_in_vectors_up_to_date() { + // Replace Windows 
newlines, in case Git is configured to alter + // newlines when files are checked out. + let json = read_test_vectors_file().replace("\r\n", "\n"); + if generate_json() != json { + panic!("Checked-in test_vectors.json is not up to date. Regenerate with `cargo run --bin generate > ./test_vectors.json`."); + } + } +} diff --git a/third-party/blake3/test_vectors/test_vectors.json b/third-party/blake3/test_vectors/test_vectors.json new file mode 100644 index 00000000..f6da9179 --- /dev/null +++ b/third-party/blake3/test_vectors/test_vectors.json @@ -0,0 +1,217 @@ +{ + "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string \"whats the Elvish word for friend\", also given in the `key` field below. The context string used with derive_key is the ASCII string \"BLAKE3 2019-12-27 16:29:52 test vectors context\", also given in the `context_string` field below. Outputs are encoded as hexadecimal. 
Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.", + "key": "whats the Elvish word for friend", + "context_string": "BLAKE3 2019-12-27 16:29:52 test vectors context", + "cases": [ + { + "input_len": 0, + "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", + "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", + "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" + }, + { + "input_len": 1, + "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", + "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", + "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" + }, + { + 
"input_len": 2, + "hash": "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a432263a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", + "keyed_hash": "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9ffbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f658be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", + "derive_key": "1f166565a7df0098ee65922d7fea425fb18b9943f19d6161e2d17939356168e6daa59cae19892b2d54f6fc9f475d26031fd1c22ae0a3e8ef7bdb23f452a15e0027629d2e867b1bb1e6ab21c71297377750826c404dfccc2406bd57a83775f89e0b075e59a7732326715ef912078e213944f490ad68037557518b79c0086de6d6f6cdd2" + }, + { + "input_len": 3, + "hash": "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cdd0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", + "keyed_hash": "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", + "derive_key": "440aba35cb006b61fc17c0529255de438efc06a8c9ebf3f2ddac3b5a86705797f27e2e914574f4d87ec04c379e12789eccbfbc15892626042707802dbe4e97c3ff59dca80c1e54246b6d055154f7348a39b7d098b2b4824ebe90e104e763b2a447512132cede16243484a55a4e40a85790038bb0dcf762e8c053cabae41bbe22a5bff7" + }, + { + "input_len": 4, + "hash": 
"f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e65797211701dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", + "keyed_hash": "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070116c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", + "derive_key": "f46085c8190d69022369ce1a18880e9b369c135eb93f3c63550d3e7630e91060fbd7d8f4258bec9da4e05044f88b91944f7cab317a2f0c18279629a3867fad0662c9ad4d42c6f27e5b124da17c8c4f3a94a025ba5d1b623686c6099d202a7317a82e3d95dae46a87de0555d727a5df55de44dab799a20dffe239594d6e99ed17950910" + }, + { + "input_len": 5, + "hash": "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2ebcfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2ca748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c999904037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", + "keyed_hash": "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616ab199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", + "derive_key": "1f24eda69dbcb752847ec3ebb5dd42836d86e58500c7c98d906ecd82ed9ae47f6f48a3f67e4e43329c9a89b1ca526b9b35cbf7d25c1e353baffb590fd79be58ddb6c711f1a6b60e98620b851c688670412fcb0435657ba6b638d21f0f2a04f2f6b0bd8834837b10e438d5f4c7c2c71299cf7586ea9144ed09253d51f8f54dd6bff719d" + }, + { + "input_len": 6, + "hash": "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844611a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", + 
"keyed_hash": "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e8807800842a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", + "derive_key": "be96b30b37919fe4379dfbe752ae77b4f7e2ab92f7ff27435f76f2f065f6a5f435ae01a1d14bd5a6b3b69d8cbd35f0b01ef2173ff6f9b640ca0bd4748efa398bf9a9c0acd6a66d9332fdc9b47ffe28ba7ab6090c26747b85f4fab22f936b71eb3f64613d8bd9dfabe9bb68da19de78321b481e5297df9e40ec8a3d662f3e1479c65de0" + }, + { + "input_len": 7, + "hash": "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a941f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fef1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", + "keyed_hash": "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5fd6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", + "derive_key": "dc3b6485f9d94935329442916b0d059685ba815a1fa2a14107217453a7fc9f0e66266db2ea7c96843f9d8208e600a73f7f45b2f55b9e6d6a7ccf05daae63a3fdd10b25ac0bd2e224ce8291f88c05976d575df998477db86fb2cfbbf91725d62cb57acfeb3c2d973b89b503c2b60dde85a7802b69dc1ac2007d5623cbea8cbfb6b181f5" + }, + { + "input_len": 8, + "hash": "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb725d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a22e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", + "keyed_hash": 
"be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305abf86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", + "derive_key": "2b166978cef14d9d438046c720519d8b1cad707e199746f1562d0c87fbd32940f0e2545a96693a66654225ebbaac76d093bfa9cd8f525a53acb92a861a98c42e7d1c4ae82e68ab691d510012edd2a728f98cd4794ef757e94d6546961b4f280a51aac339cc95b64a92b83cc3f26d8af8dfb4c091c240acdb4d47728d23e7148720ef04" + }, + { + "input_len": 63, + "hash": "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b1197012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf4687093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", + "keyed_hash": "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea05a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847abb38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f11678377483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", + "derive_key": "b6451e30b953c206e34644c6803724e9d2725e0893039cfc49584f991f451af3b89e8ff572d3da4f4022199b9563b9d70ebb616efff0763e9abec71b550f1371e233319c4c4e74da936ba8e5bbb29a598e007a0bbfa929c99738ca2cc098d59134d11ff300c39f82e2fce9f7f0fa266459503f64ab9913befc65fddc474f6dc1c67669" + }, + { + "input_len": 64, + "hash": "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7fbb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", + "keyed_hash": "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e68244c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f77a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c9255306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", 
+ "derive_key": "a5c4a7053fa86b64746d4bb688d06ad1f02a18fce9afd3e818fefaa7126bf73e9b9493a9befebe0bf0c9509fb3105cfa0e262cde141aa8e3f2c2f77890bb64a4cca96922a21ead111f6338ad5244f2c15c44cb595443ac2ac294231e31be4a4307d0a91e874d36fc9852aeb1265c09b6e0cda7c37ef686fbbcab97e8ff66718be048bb" + }, + { + "input_len": 65, + "hash": "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", + "keyed_hash": "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b9041497de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", + "derive_key": "51fd05c3c1cfbc8ed67d139ad76f5cf8236cd2acd26627a30c104dfd9d3ff8a82b02e8bd36d8498a75ad8c8e9b15eb386970283d6dd42c8ae7911cc592887fdbe26a0a5f0bf821cd92986c60b2502c9be3f98a9c133a7e8045ea867e0828c7252e739321f7c2d65daee4468eb4429efae469a42763f1f94977435d10dccae3e3dce88d" + }, + { + "input_len": 127, + "hash": "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da47644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc44355b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", + "keyed_hash": "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd54663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", + "derive_key": 
"c91c090ceee3a3ac81902da31838012625bbcd73fcb92e7d7e56f78deba4f0c3feeb3974306966ccb3e3c69c337ef8a45660ad02526306fd685c88542ad00f759af6dd1adc2e50c2b8aac9f0c5221ff481565cf6455b772515a69463223202e5c371743e35210bbbbabd89651684107fd9fe493c937be16e39cfa7084a36207c99bea3" + }, + { + "input_len": 128, + "hash": "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa69faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ecba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f75e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", + "keyed_hash": "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd86bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", + "derive_key": "81720f34452f58a0120a58b6b4608384b5c51d11f39ce97161a0c0e442ca022550e7cd651e312f0b4c6afb3c348ae5dd17d2b29fab3b894d9a0034c7b04fd9190cbd90043ff65d1657bbc05bfdecf2897dd894c7a1b54656d59a50b51190a9da44db426266ad6ce7c173a8c0bbe091b75e734b4dadb59b2861cd2518b4e7591e4b83c9" + }, + { + "input_len": 129, + "hash": "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f96ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c7127bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", + "keyed_hash": "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aaee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412cd8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", + "derive_key": 
"938d2d4435be30eafdbb2b7031f7857c98b04881227391dc40db3c7b21f41fc18d72d0f9c1de5760e1941aebf3100b51d64644cb459eb5d20258e233892805eb98b07570ef2a1787cd48e117c8d6a63a68fd8fc8e59e79dbe63129e88352865721c8d5f0cf183f85e0609860472b0d6087cefdd186d984b21542c1c780684ed6832d8d" + }, + { + "input_len": 1023, + "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", + "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", + "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" + }, + { + "input_len": 1024, + "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", + "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", + "derive_key": 
"7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" + }, + { + "input_len": 1025, + "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", + "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", + "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" + }, + { + "input_len": 2048, + "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", + "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", + "derive_key": 
"7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" + }, + { + "input_len": 2049, + "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", + "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", + "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" + }, + { + "input_len": 3072, + "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", + "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", + "derive_key": 
"050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" + }, + { + "input_len": 3073, + "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", + "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", + "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" + }, + { + "input_len": 4096, + "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", + "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", + "derive_key": 
"1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" + }, + { + "input_len": 4097, + "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", + "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", + "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" + }, + { + "input_len": 5120, + "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", + "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", + "derive_key": 
"7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" + }, + { + "input_len": 5121, + "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", + "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", + "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" + }, + { + "input_len": 6144, + "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", + "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", + "derive_key": 
"2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" + }, + { + "input_len": 6145, + "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", + "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", + "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" + }, + { + "input_len": 7168, + "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", + "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", + "derive_key": 
"11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" + }, + { + "input_len": 7169, + "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", + "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", + "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" + }, + { + "input_len": 8192, + "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", + "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", + "derive_key": 
"ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" + }, + { + "input_len": 8193, + "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", + "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", + "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" + }, + { + "input_len": 16384, + "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", + "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", + "derive_key": 
"160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" + }, + { + "input_len": 31744, + "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", + "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", + "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" + }, + { + "input_len": 102400, + "hash": "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", + "keyed_hash": "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", + "derive_key": 
"4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560" + } + ] +} diff --git a/third-party/blake3/tools/compiler_version/Cargo.toml b/third-party/blake3/tools/compiler_version/Cargo.toml new file mode 100644 index 00000000..86250940 --- /dev/null +++ b/third-party/blake3/tools/compiler_version/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "compiler_version" +version = "0.0.0" +edition = "2021" + +[build-dependencies] +cc = "1.0.50" diff --git a/third-party/blake3/tools/compiler_version/build.rs b/third-party/blake3/tools/compiler_version/build.rs new file mode 100644 index 00000000..3e14ebe6 --- /dev/null +++ b/third-party/blake3/tools/compiler_version/build.rs @@ -0,0 +1,6 @@ +fn main() { + let build = cc::Build::new(); + let compiler = build.get_compiler(); + let compiler_path = compiler.path().to_string_lossy(); + println!("cargo:rustc-env=COMPILER_PATH={}", compiler_path); +} diff --git a/third-party/blake3/tools/compiler_version/src/main.rs b/third-party/blake3/tools/compiler_version/src/main.rs new file mode 100644 index 00000000..767cb31b --- /dev/null +++ b/third-party/blake3/tools/compiler_version/src/main.rs @@ -0,0 +1,27 @@ +use std::process::Command; + +fn main() { + // Print the rustc version. + Command::new(env!("CARGO")) + .args(&["rustc", "--quiet", "--", "--version"]) + .status() + .unwrap(); + println!(); + + // Print the Cargo version. + Command::new(env!("CARGO")) + .args(&["--version"]) + .status() + .unwrap(); + println!(); + + // Print the C compiler version. This relies on C compiler detection done + // in build.rs, which sets the COMPILER_PATH variable. + let compiler_path = env!("COMPILER_PATH"); + let mut compiler_command = Command::new(compiler_path); + // Use the --version flag on everything other than MSVC. 
+ if !cfg!(target_env = "msvc") { + compiler_command.arg("--version"); + } + let _ = compiler_command.status().unwrap(); +} diff --git a/third-party/blake3/tools/instruction_set_support/Cargo.toml b/third-party/blake3/tools/instruction_set_support/Cargo.toml new file mode 100644 index 00000000..eef8a80d --- /dev/null +++ b/third-party/blake3/tools/instruction_set_support/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "instruction_set_support" +version = "0.0.0" +edition = "2021" + +[dependencies] diff --git a/third-party/blake3/tools/instruction_set_support/src/main.rs b/third-party/blake3/tools/instruction_set_support/src/main.rs new file mode 100644 index 00000000..6b509b05 --- /dev/null +++ b/third-party/blake3/tools/instruction_set_support/src/main.rs @@ -0,0 +1,10 @@ +fn main() { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + dbg!(is_x86_feature_detected!("sse2")); + dbg!(is_x86_feature_detected!("sse4.1")); + dbg!(is_x86_feature_detected!("avx2")); + dbg!(is_x86_feature_detected!("avx512f")); + dbg!(is_x86_feature_detected!("avx512vl")); + } +} diff --git a/third-party/blake3/tools/release.md b/third-party/blake3/tools/release.md new file mode 100644 index 00000000..17a07b0f --- /dev/null +++ b/third-party/blake3/tools/release.md @@ -0,0 +1,16 @@ +# Release checklist + +- Make sure `cargo outdated -R` is clean in the root and in b3sum/. +- Bump the version in the root Cargo.toml. +- Bump the version in b3sum/Cargo.toml. +- Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. +- Update the `--help` output in b3sum/README.md if it's changed. +- Bump `BLAKE3_VERSION_STRING` in c/blake3.h. +- Bump `VERSION` in c/CMakeLists.txt. +- Make a version bump commit with change notes. +- `git push` and make sure CI is green. +- `git tag` the version bump commit with the new version number. +- `git push --tags` +- `cargo publish` in the root. 
+- `cargo publish --dry-run` in b3sum/ and make sure it fetches the just-published library version. +- `cargo publish` in b3sum/. diff --git a/third-party/mimalloc/src/prim/unix/prim.c b/third-party/mimalloc/src/prim/unix/prim.c index 314281fe..a9c0db60 100644 --- a/third-party/mimalloc/src/prim/unix/prim.c +++ b/third-party/mimalloc/src/prim/unix/prim.c @@ -27,6 +27,7 @@ terms of the MIT license. A copy of the license can be found in the file #include // mmap #include // sysconf +#include // getenv #if defined(__linux__) #include diff --git a/third-party/tbb/.github/labeler.yml b/third-party/tbb/.github/labeler.yml new file mode 100644 index 00000000..8d13d714 --- /dev/null +++ b/third-party/tbb/.github/labeler.yml @@ -0,0 +1,18 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +allocator: +- 'src/tbbmalloc/**/*' +- 'src/tbbmalloc_proxy/**/*' +- 'test/tbbmalloc/**/*' diff --git a/third-party/tbb/.github/workflows/ci.yml b/third-party/tbb/.github/workflows/ci.yml index 6d07b15f..da95c94a 100644 --- a/third-party/tbb/.github/workflows/ci.yml +++ b/third-party/tbb/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022 Intel Corporation +# Copyright (c) 2021-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -65,6 +65,7 @@ jobs: run: | pip3 install -U Jinja2 pip3 install git+https://github.com/executablebooks/sphinx-book-theme.git + pip3 install sphinx-tabs echo GITHUB_SHA_SHORT=${GITHUB_SHA::8} >> $GITHUB_ENV mkdir html - name: Build documentation @@ -126,8 +127,8 @@ jobs: cat years.diff exit 1 fi - python_module_test_ubuntu18-04: - runs-on: [ubuntu-18.04] + python_module_test_ubuntu_latest: + runs-on: [ubuntu-latest] timeout-minutes: 15 steps: - uses: actions/checkout@v2 @@ -146,7 +147,7 @@ jobs: fail-fast: false matrix: include: - - os: ubuntu-18.04 + - os: ubuntu-latest c_compiler: gcc cxx_compiler: g++ std: 14 @@ -246,7 +247,7 @@ jobs: fail-fast: false matrix: include: - - os: ubuntu-18.04 + - os: ubuntu-latest c_compiler: gcc cxx_compiler: g++ std: 14 diff --git a/third-party/tbb/.github/workflows/labeler.yml b/third-party/tbb/.github/workflows/labeler.yml new file mode 100644 index 00000000..8dbb0962 --- /dev/null +++ b/third-party/tbb/.github/workflows/labeler.yml @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: "Pull Request Labeler" +on: + - pull_request_target + +jobs: + triage: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v4 + diff --git a/third-party/tbb/CMakeLists.txt b/third-party/tbb/CMakeLists.txt index a24287b1..47872941 100644 --- a/third-party/tbb/CMakeLists.txt +++ b/third-party/tbb/CMakeLists.txt @@ -112,10 +112,17 @@ if (NOT DEFINED BUILD_SHARED_LIBS) endif() if (NOT BUILD_SHARED_LIBS) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + endif() message(WARNING "You are building oneTBB as a static library. This is highly discouraged and such configuration is not supported. Consider building a dynamic library to avoid unforeseen issues.") endif() +# Prevent searching HWLOC by pkg-config on macOS +if (APPLE) + set(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH ON) +endif() + if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Build type" FORCE) message(STATUS "CMAKE_BUILD_TYPE is not specified. Using default: ${CMAKE_BUILD_TYPE}") @@ -123,6 +130,14 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +if (CMAKE_BUILD_TYPE) + string(TOLOWER ${CMAKE_BUILD_TYPE} _tbb_build_type) + if (_tbb_build_type STREQUAL "debug") + set(TBB_ENABLE_IPO OFF) + endif() + unset(_tbb_build_type) +endif() + # ------------------------------------------------------------------- # Files and folders naming set(CMAKE_DEBUG_POSTFIX _debug) @@ -171,6 +186,13 @@ foreach(output_type LIBRARY ARCHIVE PDB RUNTIME) endif() endforeach() +if (CMAKE_CONFIGURATION_TYPES) + # We can't use generator expressions in a cmake variable name. 
+ set(TBB_TEST_WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/${TBB_OUTPUT_DIR_BASE}_$>) +else() + set(TBB_TEST_WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +endif() + # ------------------------------------------------------------------- # ------------------------------------------------------------------- diff --git a/third-party/tbb/README.md b/third-party/tbb/README.md index aec8d2aa..b96e1fb0 100644 --- a/third-party/tbb/README.md +++ b/third-party/tbb/README.md @@ -33,6 +33,7 @@ Here are [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQU * [oneTBB Testing Approach](https://oneapi-src.github.io/oneTBB/main/intro/testing_approach.html) * [Basic support for the Bazel build system](Bazel.md) * [oneTBB Discussions](https://github.com/oneapi-src/oneTBB/discussions) +* [WASM Support](WASM_Support.md) ## Installation See [Installation from Sources](INSTALL.md) to learn how to install oneTBB. diff --git a/third-party/tbb/RELEASE_NOTES.md b/third-party/tbb/RELEASE_NOTES.md index dcd9f732..57258416 100644 --- a/third-party/tbb/RELEASE_NOTES.md +++ b/third-party/tbb/RELEASE_NOTES.md @@ -18,25 +18,26 @@ This document contains changes of oneTBB compared to the last release. ## Table of Contents +- [New Features](#new-features) - [Known Limitations](#known-limitations) - [Fixed Issues](#fixed-issues) -- [Open-source Contributions Integrated](#open-source-contributions-integrated) + +## :tada: New Features +- Since C++17, parallel algorithms and Flow Graph nodes are allowed to accept pointers to the member functions and member objects as the user-provided callables. +- Added missed member functions, such as assignment operators and swap function, to the ``concurrent_queue`` and ``concurrent_bounded_queue`` containers. ## :rotating_light: Known Limitations -- A static assert causes compilation failures in oneTBB headers when compiling with Clang* 12.0.0 or newer if using the LLVM* standard library with -ffreestanding and C++11/14 compiler options. 
-- An application using Parallel STL algorithms in libstdc++ versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). Disable support for Parallel STL algorithms by defining PSTL_USE_PARALLEL_POLICIES (in libstdc++ 9) or _GLIBCXX_USE_TBB_PAR_BACKEND (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit. -- On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder like /usr/lib64, the application may fail to link due to the order in which the linker searches for libraries. Use the -L linker option to specify the correct location of oneTBB library. This issue does not affect the program execution. -- The oneapi::tbb::info namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc version lower than 2.5. -- Using a hwloc version other than 1.11, 2.0, or 2.5 may cause an undefined behavior on Windows* OS. See https://github.com/open-mpi/hwloc/issues/477 for details. -- The NUMA topology may be detected incorrectly on Windows OS machines where the number of NUMA node threads exceeds the size of 1 processor group. -- On Windows OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying /wd4324 to the compiler command line. -- oneTBB does not support fork(), to work-around the issue, consider using task_scheduler_handle to join oneTBB worker threads before using fork(). 
+- A static assert will cause compilation failures in oneTBB headers when compiling with clang 12.0.0 or newer if using the LLVM standard library with ``-ffreestanding`` and C++11/14 compiler options. +- An application using Parallel STL algorithms in libstdc++ versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). Disable support for Parallel STL algorithms by defining ``PSTL_USE_PARALLEL_POLICIES`` (in libstdc++ 9) or ``_GLIBCXX_USE_TBB_PAR_BACKEND`` (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit. +- On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder like ``/usr/lib64``, the application may fail to link due to the order in which the linker searches for libraries. Use the ``-L`` linker option to specify the correct location of oneTBB library. This issue does not affect the program execution. +- The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc* version lower than 2.5. +- Using a hwloc* version other than 1.11, 2.0, or 2.5 may cause an undefined behavior on Windows* OS. See https://github.com/open-mpi/hwloc/issues/477 for details. +- The NUMA* topology may be detected incorrectly on Windows* OS machines where the number of NUMA* node threads exceeds the size of 1 processor group. +- On Windows* OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying ``/wd4324`` to the compiler command line. 
+- oneTBB does not support ``fork()``, to work-around the issue, consider using task_scheduler_handle to join oneTBB worker threads before using fork(). - C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293). ## :hammer: Fixed Issues -- Memory allocator crash when allocating ~1TB on 64-bit systems (GitHub* [#838](https://github.com/oneapi-src/oneTBB/issues/838)). -- Fixed thread distribution over NUMA nodes on Windows* OS systems. -- For oneapi::tbb::suspend, it is guaranteed that the user-specified callable object is executed by the calling thread. - -## :octocat: Open-source Contributions Integrated -- Fix for full LTO* build, library and tests, on UNIX* OS systems. Contributed by Vladislav Shchapov (https://github.com/oneapi-src/oneTBB/pull/798). +- Fixed the hang in the reserve method of concurrent unordered containers ([GitHub* #1056](http://github.com/oneapi-src/oneTBB/issues/1056)). +- Fixed the C++20 three-way comparison feature detection ([GitHub* #1093](http://github.com/oneapi-src/oneTBB/issues/1093)). +- Fixed oneTBB integration with CMake* in the Conda* environment. diff --git a/third-party/tbb/SECURITY.md b/third-party/tbb/SECURITY.md new file mode 100644 index 00000000..c4a49dd5 --- /dev/null +++ b/third-party/tbb/SECURITY.md @@ -0,0 +1,7 @@ +# Security Policy +Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, +impact, severity and mitigation. + +## Reporting a Vulnerability +Please report any security vulnerabilities in this project +[utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). 
diff --git a/third-party/tbb/SYSTEM_REQUIREMENTS.md b/third-party/tbb/SYSTEM_REQUIREMENTS.md index 4e4e8e80..803041c6 100644 --- a/third-party/tbb/SYSTEM_REQUIREMENTS.md +++ b/third-party/tbb/SYSTEM_REQUIREMENTS.md @@ -23,13 +23,13 @@ This document provides details about hardware, operating system, and software pr - [Supported Operating Systems](#supported-operating-systems) - [Community-Supported Platforms](#community-supported-platforms) - [Supported Compilers](#supported-compilers) +- [Limitations](#limitations) ## Supported Hardware - Intel(R) Celeron(R) processor family - Intel(R) Core* processor family - Intel(R) Xeon(R) processor family -- Intel(R) Xeon Phi* processor family - Intel(R) Atom* processor family - Non-Intel(R) processors compatible with the processors listed above @@ -37,22 +37,23 @@ This document provides details about hardware, operating system, and software pr ## Software ### Supported Operating Systems -- Systems with Microsoft* Windows* operating systems +- Systems with Microsoft* Windows* operating systems: - Microsoft* Windows* 10 - - Microsoft* Windows* Server 2016 + - Microsoft* Windows* 11 - Microsoft* Windows* Server 2019 -- Systems with Linux* operating systems - - Clear Linux* - - Amazon* Linux 2 - - CentOS* 8 - - Debian* 10 - - Fedora* 34 - - Red Hat* Enterprise Linux* 7, 8 + - Microsoft* Windows* Server 2022 +- Systems with Linux* operating systems: + - Oracle Linux* 8 + - Amazon* Linux* 2 + - Debian* 9, 10, 11 + - Fedora* 36, 37 + - Rocky* Linux* 9 + - Red Hat* Enterprise Linux* 8, 9 - SuSE* Linux* Enterprise Server 15 - - Ubuntu* 18.04 LTS, 20.04, 21.04 -- Systems with macOS* operating systems - - macOS* 10.15, 11.x -- Systems with Android* operating systems + - Ubuntu* 20.04, 22.04 +- Systems with macOS* operating systems: + - macOS* 12.x, 13.x +- Systems with Android* operating systems: - Android* 9 ### Community-Supported Platforms @@ -64,9 +65,22 @@ This document provides details about hardware, operating system, and 
software pr ### Supported Compilers - Intel* oneAPI DPC++/C++ Compiler - Intel* C++ Compiler 19.0 and 19.1 version -- Microsoft* Visual C++ 14.1 (Microsoft* Visual Studio* 2017, Windows* OS only) - Microsoft* Visual C++ 14.2 (Microsoft* Visual Studio* 2019, Windows* OS only) -- For each supported Linux* operating system, the standard gcc version provided with that operating system is supported - - GNU Compilers (gcc) 4.8.5 - 11.1.1 - - GNU C Library (glibc) version 2.17 - 2.33 - - Clang* 6.0.0 - 12.0.0 +- Microsoft* Visual C++ 14.3 (Microsoft* Visual Studio* 2022, Windows* OS only) +- For each supported Linux* operating system, the standard gcc version provided with that operating system is supported: + - GNU Compilers (gcc) 4.8.5 - 11.2.1 + - GNU C Library (glibc) version 2.17 - 2.34 + - Clang* 6.0.0 - 13.0.0 + +## Limitations +There are some cases where we cannot provide support for your platforms. It includes: + +1. The platform is out of official support (met end of life). When you use an unsupported platform, you can face a security risk that can be difficult to resolve. +2. We do not have the infrastructure to test a platform. Therefore we cannot guarantee that oneTBB works correctly on that platform. +3. Changes affect more code than just platform-specific macros. +4. The platform is incompatible with oneTBB. Some platforms may have limitations that prevent oneTBB from working correctly. We cannot provide support in these cases as the issue is beyond our control. +5. The platform is modified or customized. If you made significant updates to your platform, it might be hard for us to find the root cause of the issue. Therefore, we may not be able to provide support as the modification could affect the oneTBB functionality. + + +We understand that these limitations can be frustrating. Thus, we suggest creating a branch specifically for the unsupported platform, allowing other users to contribute to or use your implementation. 
+ diff --git a/third-party/tbb/WASM_Support.md b/third-party/tbb/WASM_Support.md new file mode 100644 index 00000000..67925ee4 --- /dev/null +++ b/third-party/tbb/WASM_Support.md @@ -0,0 +1,31 @@ + + +# WASM Support + +``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers. +It is designed to be a portable target for compilers and to be efficient to parse and execute. + +WebAssembly aims to provide a fast, efficient, and safe way to run code in web browsers without needing plugins or other software. Code written in a variety of programming languages, including C, C++, Rust and others, can be compiled into WebAssembly format for use in web pages. This allows you to write high-performance applications that run directly in the browser. + +We currently have an [under development branch that provides you with WASM support](https://github.com/oneapi-src/oneTBB/tree/tbb_wasm). + +By using WASM, you can: +* Create highly performant and scalable applications that can meet the demands of modern web-based systems. +* Take advantage of oneTBB features to optimize the performance of your web-based applications. + + + diff --git a/third-party/tbb/cmake/compilers/Clang.cmake b/third-party/tbb/cmake/compilers/Clang.cmake index 69aa5193..a128e133 100644 --- a/third-party/tbb/cmake/compilers/Clang.cmake +++ b/third-party/tbb/cmake/compilers/Clang.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ elseif (MSVC) else() set(TBB_LINK_DEF_FILE_FLAG -Wl,--version-script=) set(TBB_DEF_FILE_PREFIX lin${TBB_ARCH}) + set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} $<$>:-ffp-model=precise>) endif() # Depfile options (e.g. -MD) are inserted automatically in some cases. 
@@ -48,11 +49,14 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>:-mwaitpkg>) endif() +# Clang flags to prevent compiler from optimizing out security checks +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security + -fstack-protector-strong -fPIC) +set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now) + set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS}) -if (ANDROID_PLATFORM) - set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) -endif() +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) if (MINGW) list(APPEND TBB_COMMON_COMPILE_FLAGS -U__STRICT_ANSI__) diff --git a/third-party/tbb/cmake/compilers/GNU.cmake b/third-party/tbb/cmake/compilers/GNU.cmake index 34c10db0..b60172c8 100644 --- a/third-party/tbb/cmake/compilers/GNU.cmake +++ b/third-party/tbb/cmake/compilers/GNU.cmake @@ -52,6 +52,7 @@ endif() if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Intel) # gcc 6.0 and later have -flifetime-dse option that controls elimination of stores done outside the object lifetime set(TBB_DSE_FLAG $<$>:-flifetime-dse=1>) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-fstack-clash-protection>) endif() # Workaround for heavy tests and too many symbols in debug (rellocation truncated to fit: R_MIPS_CALL16) @@ -70,6 +71,11 @@ endif () # Gnu flags to prevent compiler from optimizing out security checks set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv) +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security + -fstack-protector-strong ) +set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now,-z,noexecstack) +set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2> ) + # TBB malloc settings 
set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) diff --git a/third-party/tbb/cmake/compilers/Intel.cmake b/third-party/tbb/cmake/compilers/Intel.cmake index fdff9082..582f9a84 100644 --- a/third-party/tbb/cmake/compilers/Intel.cmake +++ b/third-party/tbb/cmake/compilers/Intel.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,10 +26,8 @@ elseif (APPLE) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) else() include(${CMAKE_CURRENT_LIST_DIR}/GNU.cmake) - set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,-z,relro,-z,now,) - set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security - $<$>:-qno-opt-report-embed -D_FORTIFY_SOURCE=2> - $<$:-falign-stack=maintain-16-byte>) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$:-falign-stack=maintain-16-byte>) + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel) set(TBB_OPENMP_FLAG -qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) endif() diff --git a/third-party/tbb/cmake/compilers/MSVC.cmake b/third-party/tbb/cmake/compilers/MSVC.cmake index c605fecb..0e0dfd31 100644 --- a/third-party/tbb/cmake/compilers/MSVC.cmake +++ b/third-party/tbb/cmake/compilers/MSVC.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -35,6 +35,12 @@ endif() set(TBB_LIB_COMPILE_FLAGS -D_CRT_SECURE_NO_WARNINGS /GS) set(TBB_COMMON_COMPILE_FLAGS /volatile:iso /FS /EHsc) +set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DYNAMICBASE /NXCOMPAT) + +if (TBB_ARCH EQUAL 32) + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /SAFESEH ) +endif() + # Ignore /WX set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled. if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag) tbb_remove_compile_flag(/WX) diff --git a/third-party/tbb/cmake/config_generation.cmake b/third-party/tbb/cmake/config_generation.cmake index 74d7e76f..0cbdd745 100644 --- a/third-party/tbb/cmake/config_generation.cmake +++ b/third-party/tbb/cmake/config_generation.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ function(tbb_generate_config) set(options HANDLE_SUBDIRS) set(oneValueArgs INSTALL_DIR SYSTEM_NAME - LIB_REL_PATH INC_REL_PATH DLL_REL_PATH + LIB_REL_PATH INC_REL_PATH VERSION TBB_BINARY_VERSION TBBMALLOC_BINARY_VERSION @@ -35,7 +35,6 @@ function(tbb_generate_config) file(MAKE_DIRECTORY ${config_install_dir}) file(TO_CMAKE_PATH "${tbb_gen_cfg_LIB_REL_PATH}" TBB_LIB_REL_PATH) - file(TO_CMAKE_PATH "${tbb_gen_cfg_DLL_REL_PATH}" TBB_DLL_REL_PATH) file(TO_CMAKE_PATH "${tbb_gen_cfg_INC_REL_PATH}" TBB_INC_REL_PATH) set(TBB_VERSION ${tbb_gen_cfg_VERSION}) @@ -54,45 +53,59 @@ set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION}) if (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Linux") set(TBB_LIB_PREFIX "lib") set(TBB_LIB_EXT "so.\${_\${_tbb_component}_bin_version}") - set(TBB_IMPLIB_RELEASE "") - set(TBB_IMPLIB_DEBUG "") - if (tbb_gen_cfg_HANDLE_SUBDIRS) - set(TBB_HANDLE_SUBDIRS " -if (CMAKE_SIZEOF_VOID_P STREQUAL \"8\") - set(_tbb_subdir intel64/gcc4.8) -else () - set(_tbb_subdir ia32/gcc4.8) 
-endif() + + set (TBB_HANDLE_IMPLIB " + set (_tbb_release_dll \${_tbb_release_lib}) + set (_tbb_debug_dll \${_tbb_debug_lib}) ") + if (tbb_gen_cfg_HANDLE_SUBDIRS) + set(TBB_HANDLE_SUBDIRS "set(_tbb_subdir gcc4.8)") set(_libdir_for_pc_file "\${prefix}/lib/intel64/gcc4.8") + set(_tbb_pc_extra_libdir "-L\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) set(_libdir_for_pc_file "\${prefix}/lib/ia32/gcc4.8") + set(_tbb_pc_extra_libdir "-L\${prefix}/lib32") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb32.pc @ONLY) endif() elseif (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Darwin") set(TBB_LIB_PREFIX "lib") set(TBB_LIB_EXT "\${_\${_tbb_component}_bin_version}.dylib") - set(TBB_IMPLIB_RELEASE "") - set(TBB_IMPLIB_DEBUG "") + + set (TBB_HANDLE_IMPLIB " + set (_tbb_release_dll \${_tbb_release_lib}) + set (_tbb_debug_dll \${_tbb_debug_lib}) +") set(_libdir_for_pc_file "\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) elseif (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Windows") set(TBB_LIB_PREFIX "") - set(TBB_LIB_EXT "dll") + set(TBB_LIB_EXT "lib") set(TBB_COMPILE_DEFINITIONS " INTERFACE_COMPILE_DEFINITIONS \"__TBB_NO_IMPLICIT_LINKAGE=1\"") - - # .lib files installed to TBB_LIB_REL_PATH (e.g. /lib); - # .dll files installed to TBB_DLL_REL_PATH (e.g. /bin); - # Expand TBB_LIB_REL_PATH here in IMPORTED_IMPLIB property and - # redefine it with TBB_DLL_REL_PATH value to properly fill IMPORTED_LOCATION property in TBBConfig.cmake.in template. 
- set(TBB_IMPLIB_RELEASE " - IMPORTED_IMPLIB_RELEASE \"\${_tbb_root}/${TBB_LIB_REL_PATH}/\${_tbb_subdir}/\${_tbb_component}\${_bin_version}.lib\"") - set(TBB_IMPLIB_DEBUG " - IMPORTED_IMPLIB_DEBUG \"\${_tbb_root}/${TBB_LIB_REL_PATH}/\${_tbb_subdir}/\${_tbb_component}\${_bin_version}_debug.lib\"") - set(TBB_LIB_REL_PATH ${TBB_DLL_REL_PATH}) + + # .lib - installed to TBB_LIB_REL_PATH (e.g. /lib) and are passed as IMPORTED_IMPLIB_ property to target + # .dll - installed to /bin or /redist and are passed as IMPORTED_LOCATION_ property to target + set (TBB_HANDLE_IMPLIB " + find_file(_tbb_release_dll + NAMES \${_tbb_component}\${_bin_version}.dll + PATHS \${_tbb_root} + PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" + ) + + if (EXISTS \"\${_tbb_debug_lib}\") + find_file(_tbb_debug_dll + NAMES \${_tbb_component}\${_bin_version}_debug.dll + PATHS \${_tbb_root} + PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" + ) + endif() +") + set(TBB_IMPLIB_RELEASE " + IMPORTED_IMPLIB_RELEASE \"\${_tbb_release_lib}\"") + set(TBB_IMPLIB_DEBUG " + IMPORTED_IMPLIB_DEBUG \"\${_tbb_debug_lib}\"") if (tbb_gen_cfg_HANDLE_SUBDIRS) set(TBB_HANDLE_SUBDIRS " @@ -100,19 +113,15 @@ set(_tbb_subdir vc14) if (WINDOWS_STORE) set(_tbb_subdir \${_tbb_subdir}_uwp) endif() - -if (CMAKE_SIZEOF_VOID_P STREQUAL \"8\") - set(_tbb_subdir intel64/\${_tbb_subdir}) -else () - set(_tbb_subdir ia32/\${_tbb_subdir}) -endif() ") set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION}) set(_libdir_for_pc_file "\${prefix}/lib/intel64/vc14") + set(_tbb_pc_extra_libdir "-L\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) set(_libdir_for_pc_file "\${prefix}/lib/ia32/vc14") + set(_tbb_pc_extra_libdir "-L\${prefix}/lib32") 
configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb32.pc @ONLY) endif() diff --git a/third-party/tbb/cmake/memcheck.cmake b/third-party/tbb/cmake/memcheck.cmake index 570f90fd..fd5e920c 100644 --- a/third-party/tbb/cmake/memcheck.cmake +++ b/third-party/tbb/cmake/memcheck.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,10 +31,18 @@ endif() file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/memcheck) -function(_tbb_run_memcheck test_target) +function(_tbb_run_memcheck test_target subdir) set(target_name memcheck-${test_target}) + if(${subdir} STREQUAL "tbbmalloc") + # Valgrind intercepts all allocation symbols with its own by default, + # so it disables using tbbmalloc. In case of tbbmalloc tests + # intercept allocation symbols only in the default system libraries, + # but not in any other shared library or the executable + # defining public malloc or operator new related functions. 
+ set(option "--soname-synonyms=somalloc=nouserintercepts") + endif() add_custom_target(${target_name} - COMMAND ${VALGRIND_EXE} --leak-check=full --show-leak-kinds=all --log-file=${CMAKE_BINARY_DIR}/memcheck/${target_name}.log -v $) + COMMAND ${VALGRIND_EXE} ${option} --leak-check=full --show-leak-kinds=all --log-file=${CMAKE_BINARY_DIR}/memcheck/${target_name}.log -v $) add_dependencies(memcheck-all ${target_name}) endfunction() diff --git a/third-party/tbb/cmake/packaging.cmake b/third-party/tbb/cmake/packaging.cmake index 7b713973..aa2acc4d 100644 --- a/third-party/tbb/cmake/packaging.cmake +++ b/third-party/tbb/cmake/packaging.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,4 +18,7 @@ set(CPACK_PACKAGE_NAME "${PROJECT_NAME}") set(CPACK_PACKAGE_VERSION "${TBB_VERSION}") string(TOLOWER ${CPACK_PACKAGE_NAME}-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}_${TBB_OUTPUT_DIR_BASE}_${CMAKE_BUILD_TYPE} CPACK_PACKAGE_FILE_NAME) set(CPACK_GENERATOR ZIP) -include(CPack) +# Note: this is an internal non-documented variable set by CPack +if (NOT CPack_CMake_INCLUDED) + include(CPack) +endif() diff --git a/third-party/tbb/cmake/scripts/cmake_gen_github_configs.cmake b/third-party/tbb/cmake/scripts/cmake_gen_github_configs.cmake index 447b805a..4d9eb73d 100644 --- a/third-party/tbb/cmake/scripts/cmake_gen_github_configs.cmake +++ b/third-party/tbb/cmake/scripts/cmake_gen_github_configs.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,6 @@ set(TBB_ROOT_REL_PATH "../../..") # Paths relative to TBB root directory set(INC_REL_PATH "include") set(LIB_REL_PATH "lib") -set(DLL_REL_PATH "redist") # Parse version info file(READ ${CMAKE_CURRENT_LIST_DIR}/../../include/oneapi/tbb/version.h _tbb_version_info) @@ -45,6 +44,6 @@ set(COMMON_ARGS ) tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/linux SYSTEM_NAME Linux HANDLE_SUBDIRS ${COMMON_ARGS}) -tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/windows SYSTEM_NAME Windows HANDLE_SUBDIRS DLL_REL_PATH ${DLL_REL_PATH} ${COMMON_ARGS}) -tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/darwin SYSTEM_NAME Darwin ${COMMON_ARGS}) +tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/windows SYSTEM_NAME Windows HANDLE_SUBDIRS ${COMMON_ARGS}) +tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/darwin SYSTEM_NAME Darwin ${COMMON_ARGS}) message(STATUS "TBBConfig files were created in ${INSTALL_DIR}") diff --git a/third-party/tbb/cmake/suppressions/tsan.suppressions b/third-party/tbb/cmake/suppressions/tsan.suppressions index d20fa8ca..1bbb833b 100644 --- a/third-party/tbb/cmake/suppressions/tsan.suppressions +++ b/third-party/tbb/cmake/suppressions/tsan.suppressions @@ -1,3 +1,4 @@ # TSAN suppression for known issues. # Possible data race during ittnotify initialization. Low impact. race:__itt_nullify_all_pointers +race:__itt_init_ittlib diff --git a/third-party/tbb/cmake/templates/TBBConfig.cmake.in b/third-party/tbb/cmake/templates/TBBConfig.cmake.in index 880a9330..18ac68d3 100644 --- a/third-party/tbb/cmake/templates/TBBConfig.cmake.in +++ b/third-party/tbb/cmake/templates/TBBConfig.cmake.in @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2021 Intel Corporation +# Copyright (c) 2017-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -43,14 +43,36 @@ if (NOT _tbbmalloc_proxy_ix EQUAL -1) set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) endif() unset(_tbbmalloc_proxy_ix) + +if (CMAKE_SIZEOF_VOID_P STREQUAL "8") + set(_tbb_intel_arch intel64) +else () + set(_tbb_intel_arch ia32) + set(_tbb_arch_suffix 32) +endif() + @TBB_HANDLE_SUBDIRS@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) + unset(_tbb_release_dll CACHE) + unset(_tbb_debug_dll CACHE) + unset(_tbb_release_lib CACHE) + unset(_tbb_debug_lib CACHE) + set(TBB_${_tbb_component}_FOUND 0) @TBB_HANDLE_BIN_VERSION@ - get_filename_component(_tbb_release_lib "${_tbb_root}/@TBB_LIB_REL_PATH@/${_tbb_subdir}/@TBB_LIB_PREFIX@${_tbb_component}${_bin_version}.@TBB_LIB_EXT@" ABSOLUTE) + + find_library(_tbb_release_lib + NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}.@TBB_LIB_EXT@ + PATHS ${_tbb_root} + PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" + ) if (NOT TBB_FIND_RELEASE_ONLY) - get_filename_component(_tbb_debug_lib "${_tbb_root}/@TBB_LIB_REL_PATH@/${_tbb_subdir}/@TBB_LIB_PREFIX@${_tbb_component}${_bin_version}_debug.@TBB_LIB_EXT@" ABSOLUTE) + find_library(_tbb_debug_lib + NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}_debug.@TBB_LIB_EXT@ + PATHS ${_tbb_root} + PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" + ) endif() if (EXISTS "${_tbb_release_lib}" OR EXISTS "${_tbb_debug_lib}") @@ -63,15 +85,17 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) unset(_tbb_current_realpath) unset(_tbb_include_dir) - if (EXISTS "${_tbb_release_lib}") + @TBB_HANDLE_IMPLIB@ + + if (EXISTS "${_tbb_release_dll}") set_target_properties(TBB::${_tbb_component} PROPERTIES - IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}"@TBB_IMPLIB_RELEASE@) + 
IMPORTED_LOCATION_RELEASE "${_tbb_release_dll}"@TBB_IMPLIB_RELEASE@) set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) endif() - if (EXISTS "${_tbb_debug_lib}") + if (EXISTS "${_tbb_debug_dll}") set_target_properties(TBB::${_tbb_component} PROPERTIES - IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}"@TBB_IMPLIB_DEBUG@) + IMPORTED_LOCATION_DEBUG "${_tbb_debug_dll}"@TBB_IMPLIB_DEBUG@) set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) endif() @@ -93,6 +117,10 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) endif() endforeach() list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS) +unset(_tbb_release_dll) +unset(_tbb_debug_dll) unset(_tbb_release_lib) unset(_tbb_debug_lib) unset(_tbb_root) +unset(_tbb_intel_arch) +unset(_tbb_arch_suffix) diff --git a/third-party/tbb/cmake/toolchains/riscv64.cmake b/third-party/tbb/cmake/toolchains/riscv64.cmake new file mode 100644 index 00000000..96c0014b --- /dev/null +++ b/third-party/tbb/cmake/toolchains/riscv64.cmake @@ -0,0 +1,34 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Prevent double invocation. 
+if (RISCV_TOOLCHAIN_INCLUDED) + return() +endif() +set(RISCV_TOOLCHAIN_INCLUDED TRUE) + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_VERSION 1) +set(CMAKE_SYSTEM_PROCESSOR riscv) + +# Users can pass -DCMAKE_FIND_ROOT_PATH to specify the toolchain path +set(CMAKE_C_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-clang) +set(CMAKE_CXX_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-clang++) +set(CMAKE_LINKER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld) + +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# Most Linux systems on riscv64 support the rv64imafd_zba_zbb extensions +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafd_zba_zbb -mabi=lp64d " CACHE INTERNAL "") diff --git a/third-party/tbb/cmake/utils.cmake b/third-party/tbb/cmake/utils.cmake index 0bb6bacd..254fe11e 100644 --- a/third-party/tbb/cmake/utils.cmake +++ b/third-party/tbb/cmake/utils.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -47,7 +47,10 @@ endmacro() macro(tbb_handle_ipo target) if (TBB_IPO_PROPERTY) - set_target_properties(${target} PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) + set_target_properties(${target} PROPERTIES + INTERPROCEDURAL_OPTIMIZATION TRUE + INTERPROCEDURAL_OPTIMIZATION_DEBUG FALSE + ) elseif (TBB_IPO_FLAGS) target_compile_options(${target} PRIVATE ${TBB_IPO_COMPILE_FLAGS}) if (COMMAND target_link_options) diff --git a/third-party/tbb/doc/GSG/get_started.rst b/third-party/tbb/doc/GSG/get_started.rst index ed69b022..d437ce89 100644 --- a/third-party/tbb/doc/GSG/get_started.rst +++ b/third-party/tbb/doc/GSG/get_started.rst @@ -1,17 +1,24 @@ .. _Get_Started_Guide: -Get Started with |short_name| -============================== +Get Started with |full_name| +============================= -.. 
include:: intro_gsg.rst +|short_name| Get Started Guide provides the information you need to begin working with oneTBB. +It is helpful for new users of parallel programming and experienced developers that want to improve code performance. -.. include:: system_requirements.rst +It is recommended for you to have a basic knowledge of C++ programming and some experience with parallel programming concepts. -.. include:: before_beginning_and_example.rst -.. include:: hybrid_cpu_support.rst +To start using oneTBB, follow the next steps: +********************************************* -Find more -********* +#. Learn what :ref:`oneTBB is` and see the :ref:`System Requirements`. +#. :ref:`Install oneTBB`. +#. Run your program using oneTBB following the :ref:`Next Steps `. +#. Learn how to :ref:`Integrate oneTBB into your project ` using CMake* and pkg-config tool. +#. See :ref:`oneTBB Samples `. + + + + -See our `documentation `_ to learn more about |short_name|. diff --git a/third-party/tbb/doc/GSG/integrate.rst b/third-party/tbb/doc/GSG/integrate.rst new file mode 100644 index 00000000..13fd679d --- /dev/null +++ b/third-party/tbb/doc/GSG/integrate.rst @@ -0,0 +1,68 @@ +.. _integrate: + +Integrate oneTBB +================ + +If you want to improve the performance and scalability of your application, you can integrate oneTBB into your project. +For example, you may want to integrate oneTBB if your application needs to process large amounts of data in parallel. + +To integrate oneTBB, you need to: + +* Link oneTBB with the project's source code. +* Provide the necessary compiler and linker flags. + +However, you can use CMake* and the pkg-config tool to simplify the process of integrating oneTBB into your project and handling its dependencies. +See the instructions below to learn how to use the tools. + +CMake* +******* + +CMake* is a cross-platform build tool that helps you manage dependencies and build systems. 
+Integrating oneTBB into your project using CMake*: + +* Simplifies the process of building and linking against the library. +* Ensures that your project can be built and run on multiple platforms. +* Lets you manage oneTBB dependencies. + +To add oneTBB to another project using CMake*, add the following commands to your ``CMakeLists.txt`` file: + +.. code-block:: + + find_package(TBB REQUIRED) + target_link_libraries(my_executable TBB::tbb) + +After that, configure your project with CMake* as usual. + + +Compile a Program Using pkg-config +*********************************** + +The pkg-config tool is used to simplify the compilation line by retrieving information about packages +from special metadata files. It helps avoid large hard-coded paths and makes compilation more portable. + +To compile a test program ``test.cpp`` with oneTBB on Linux* OS, +provide the full path to search for included files and libraries, or use a command line such as the following: + +.. code-block:: + + g++ -o test test.cpp $(pkg-config --libs --cflags tbb) + +Where: + +``--cflags`` provides oneTBB library include path: + +.. code-block:: + + $ pkg-config --cflags tbb + -I/tbb/latest/lib/pkgconfig/../..//include + +``--libs`` provides the Intel(R) oneTBB library name and the search path to find it: + +.. code-block:: + + $ pkg-config --libs tbb + -Ltbb/latest/lib/pkgconfig/../..//lib/intel64/gcc4.8 -ltbb + +.. note:: + + For Windows* OS, additionally use the ``--msvc-syntax`` option flag, which converts the compiling and linking flags to the appropriate form. diff --git a/third-party/tbb/doc/GSG/intro.rst b/third-party/tbb/doc/GSG/intro.rst new file mode 100644 index 00000000..da8c558d --- /dev/null +++ b/third-party/tbb/doc/GSG/intro.rst @@ -0,0 +1,29 @@ +.. _intro: + +What oneTBB Is +============== + +|full_name| is a runtime-based parallel programming model for C++ code that uses threads. +The template-based runtime library can help you harness the latent performance of multi-core processors. 
+ +oneTBB enables you to simplify parallel programming by breaking computation into parallel running tasks. Within a single process, +parallelism is carried out through threads, an operating system mechanism that allows the same or different sets of instructions +to be executed simultaneously. Using threads can make your program work faster and more efficiently. + +Here you can see one of the possible executions of tasks by threads. + +.. figure:: Images/how-oneTBB-works.png + :scale: 70% + :align: center + +Use oneTBB to write scalable applications that: + +* Specify logical parallel structure instead of threads. +* Emphasize data-parallel programming. +* Take advantage of concurrent collections and parallel algorithms. + +oneTBB supports nested parallelism and load balancing. It means that you can use the library without worrying about oversubscribing a system, which happens when more tasks are assigned to a system than it can handle efficiently. + +oneTBB is used in different areas, such as scientific simulations, gaming, data analysis, etc. + +It is available as a stand-alone product and as part of the |base_tk|. diff --git a/third-party/tbb/doc/GSG/next_steps.rst b/third-party/tbb/doc/GSG/next_steps.rst new file mode 100644 index 00000000..4974265d --- /dev/null +++ b/third-party/tbb/doc/GSG/next_steps.rst @@ -0,0 +1,151 @@ +.. _next_steps: + +Next Steps +=========== + +After installing oneTBB, complete the following steps to start working with the library. + +Set the Environment Variables +***************************** + +After installing |short_name|, set the environment variables: + +#. Go to the oneTBB installation directory. + +#. Set the environment variables using the script in ```` by running: + + * On Linux* OS: ``vars.{sh|csh} in /tbb/latest/env`` + * On Windows* OS: ``vars.bat in /tbb/latest/env`` + + +Build and Run a Sample +********************** + +.. tabs:: + + .. group-tab:: Windows* OS + + #. Create a new C++ project using your IDE. 
In this example, Microsoft* Visual Studio* Code is used. + #. Create an ``example.cpp`` file in the project. + #. Copy and paste the code below. It is a typical example of a |short_name| algorithm. The sample calculates a sum of all integer numbers from 1 to 100. + + .. code:: + + #include + + int main (){ + int sum = oneapi::tbb::parallel_reduce( + oneapi::tbb::blocked_range(1,101), 0, + [](oneapi::tbb::blocked_range const& r, int init) -> int { + for (int v = r.begin(); v != r.end(); v++) { + init += v; + } + return init; + }, + [](int lhs, int rhs) -> int { + return lhs + rhs; + } + ); + + printf("Sum: %d\n", sum); + return 0; + } + + #. Open the ``tasks.json`` file in the ``.vscode`` directory and paste the following lines to the args array: + + * ``-Ipath/to/oneTBB/include`` to add oneTBB include directory. + * ``path/to/oneTBB/`` to add oneTBB. + + For example: + + .. code-block:: + + { + "tasks": [ + { + "label": "build & run", + "type": "cppbuild", + "group": { + "args": [ + "/IC:\\Program Files (x86)\\Intel\\oneAPI\\tbb\\2021.9.0\\include", + "C:\\Program Files (x86)\\Intel\\oneAPI\\tbb\\2021.9.0\\lib\\ia32\\vc14\\tbb12.lib" + + + #. Build the project. + #. Run the example. + #. If oneTBB is configured correctly, the output displays ``Sum: 5050``. + + .. group-tab:: Linux* OS + + #. Create an ``example.cpp`` file in the project. + #. Copy and paste the code below. It is a typical example of a |short_name| algorithm. The sample calculates a sum of all integer numbers from 1 to 100. + + .. code:: + + #include + + int main(){ + int sum = oneapi::tbb::parallel_reduce( + oneapi::tbb::blocked_range(1,101), 0, + [](oneapi::tbb::blocked_range const& r, int init) -> int { + for (int v = r.begin(); v != r.end(); v++) { + init += v; + } + return init; + }, + [](int lhs, int rhs) -> int { + return lhs + rhs; + } + ); + + printf("Sum: %d\n", sum); + return 0; + } + + #. Compile the code using oneTBB. For example, + + .. 
code-block:: + + g++ -std=c++11 example.cpp -o example -ltbb + + + #. Run the executable: + + .. code-block:: + + ./example + + #. If oneTBB is configured correctly, the output displays ``Sum: 5050``. + + +Hybrid CPU and NUMA Support +**************************** + +If you need NUMA/Hybrid CPU support in oneTBB, you need to make sure that HWLOC* is installed on your system. + +HWLOC* (Hardware Locality) is a library that provides a portable abstraction of the hierarchical topology of modern architectures (NUMA, hybrid CPU systems, etc). oneTBB relies on HWLOC* to identify the underlying topology of the system to optimize thread scheduling and memory allocation. + +Without HWLOC*, oneTBB may not take advantage of NUMA/Hybrid CPU support. Therefore, it's important to make sure that HWLOC* is installed before using oneTBB on such systems. + +Check HWLOC* on the System +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To check if HWLOC* is already installed on your system, run ``hwloc-ls``: + +* For Linux* OS, in the command line. +* For Windows* OS, in the command prompt. + +If HWLOC* is installed, the command displays information about the hardware topology of your system. If it is not installed, you receive an error message saying that the command ``hwloc-ls`` could not be found. + +.. note:: For Hybrid CPU support, make sure that HWLOC* is version 2.5 or higher. For NUMA support, install HWLOC* version 1.11 or higher. + +Install HWLOC* +^^^^^^^^^^^^^^ + +To install HWLOC*, visit the official Portable Hardware Locality website (https://www-lb.open-mpi.org/projects/hwloc/). + +* For Windows* OS, binaries are available for download. +* For Linux* OS, only the source code is provided and binaries should be built. + +On Linux* OS, HWLOC* can also be installed with package managers, such as APT*, YUM*, etc. For example, with APT*, run ``sudo apt install hwloc``. + +.. note:: For Hybrid CPU support, make sure that HWLOC* is version 2.5 or higher. For NUMA support, install HWLOC* version 1.11 or higher. 
diff --git a/third-party/tbb/doc/GSG/samples.rst b/third-party/tbb/doc/GSG/samples.rst new file mode 100644 index 00000000..f19a2562 --- /dev/null +++ b/third-party/tbb/doc/GSG/samples.rst @@ -0,0 +1,49 @@ +.. _samples: + +oneTBB Samples +============== + +To become an expert in using oneTBB, explore its samples and examples to learn how +to properly utilize the features and functionality of oneTBB and avoid common mistakes that may impede your performance. + +The following samples are available: + +* **Containers** + + * `concurrent_hash_map `_ + * `concurrent_priority_queue `_ + +* `Flow Graph `_ + * `A solution to the binpacking problem using a queue_node, a buffer_node, and function_node. `_ + * `Cholesky Factorization algorithm `_ + * `An implementation of dining philosophers in graph using the reserving join_node `_ + * `A parallel implementation of bzip2 block-sorting file compressor `_ + * `An example of a collection of digital logic gates that can be easily composed into larger circuits `_ + * `An example of a Kohonen Self-Organizing Map using cancellation `_ + * `Split computational kernel for execution between CPU and GPU `_ + +* **Algorithms** + + * `parallel_for `_ + * `Game of life overlay `_ + * `Polygon overlay `_ + * `Parallel seismic wave simulation `_ + * `Parallel 2-D raytracer/renderer `_ + * `Find largest matching substrings `_ + * `Resumable task: Split computational kernel for execution between CPU and GPU `_ + * `parallel_for_each `_ + * `parallel_pipeline `_ + * `parallel_reduce `_ + +* **Task Scheduler** + + * `task_arena `_ + * `task_group `_ + * `Execute similar computational kernels, with one task executing the SYCL* code and the other task executing the oneTBB code `_ + +* **Other** + + * `Compute Fibonacci numbers in different ways `_ + + +.. note:: You can also refer to the `oneAPI Samples `_ to learn more about the ecosystem. 
\ No newline at end of file diff --git a/third-party/tbb/doc/conf.py b/third-party/tbb/doc/conf.py index 39a5ca90..87593ebf 100644 --- a/third-party/tbb/doc/conf.py +++ b/third-party/tbb/doc/conf.py @@ -56,7 +56,8 @@ 'sphinx.ext.imgmath', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages' + 'sphinx.ext.githubpages', + 'sphinx_tabs.tabs' ] # Add any paths that contain templates here, relative to this directory. @@ -77,7 +78,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'en' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -132,7 +133,7 @@ else: html_theme_options = { 'repository_url': 'https://github.com/oneapi-src/oneTBB', - 'path_to_docs': 'doc/main', + 'path_to_docs': 'doc', 'use_issues_button': True, 'use_edit_page_button': True, 'repository_branch': 'master', @@ -154,10 +155,16 @@ else: html_js_files = ['custom.js'] +html_theme_options = { + "logo": { + "text": "oneTBB Documentation", + } +} html_logo = '_static/oneAPI-rgb-rev-100.png' html_favicon = '_static/favicons.png' + # Custom sidebar templates, must be a dictionary that maps document names # to template names. # diff --git a/third-party/tbb/doc/index/toctree.rst b/third-party/tbb/doc/index/toctree.rst index 74534a4c..eda4497e 100644 --- a/third-party/tbb/doc/index/toctree.rst +++ b/third-party/tbb/doc/index/toctree.rst @@ -16,8 +16,12 @@ :maxdepth: 2 /GSG/get_started + /GSG/intro + /GSG/system_requirements /GSG/installation - /GSG/examples + /GSG/next_steps + /GSG/integrate + /GSG/samples .. 
toctree:: diff --git a/third-party/tbb/doc/main/_templates/layout.html b/third-party/tbb/doc/main/_templates/layout.html index f044be17..eb4d31dd 100644 --- a/third-party/tbb/doc/main/_templates/layout.html +++ b/third-party/tbb/doc/main/_templates/layout.html @@ -6,11 +6,10 @@ var wapLocalCode = 'us-en'; // Dynamically set per localized site, see mapping table for values var wapSection = "oneapi-tbb"; // WAP team will give you a unique section for your site // Load TMS - (function () { - var url = 'https://www.intel.com/content/dam/www/global/wap/tms-loader.js'; // WAP file URL - var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true; po.src = url; - var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s); + (function () { + var url = 'https://www.intel.com/content/dam/www/global/wap/tms-loader.js'; // WAP file URL + var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true; po.src = url; + var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s); })(); - {% endblock %} diff --git a/third-party/tbb/doc/main/reference/reference.rst b/third-party/tbb/doc/main/reference/reference.rst index 87d05c32..ec9fb1e1 100644 --- a/third-party/tbb/doc/main/reference/reference.rst +++ b/third-party/tbb/doc/main/reference/reference.rst @@ -16,7 +16,6 @@ It also describes features that are not included in the oneTBB specification. .. 
toctree:: :titlesonly: - info_namespace parallel_for_each_semantics parallel_sort_ranges_extension scalable_memory_pools/malloc_replacement_log diff --git a/third-party/tbb/doc/main/reference/task_group_extensions.rst b/third-party/tbb/doc/main/reference/task_group_extensions.rst index c2be6acc..10d39801 100644 --- a/third-party/tbb/doc/main/reference/task_group_extensions.rst +++ b/third-party/tbb/doc/main/reference/task_group_extensions.rst @@ -13,9 +13,7 @@ task_group extensions Description *********** -|full_name| implementation extends the `tbb::task_group specification `_ with the following members: - - - requirements for a user-provided function object +|full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object. API @@ -88,4 +86,4 @@ As an optimization hint, ``F`` might return a ``task_handle``, which task object * `oneapi::tbb::task_group specification `_ * `oneapi::tbb::task_group_context specification `_ * `oneapi::tbb::task_group_status specification `_ -* :doc:`oneapi::tbb::task_handle class ` +* `oneapi::tbb::task_handle class `_ diff --git a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Tips.rst b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Tips.rst index 96d95122..7cda5022 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Tips.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Tips.rst @@ -6,8 +6,8 @@ Flow Graph Tips and Tricks .. 
toctree:: :maxdepth: 4 - ../tbb_userguide/Flow_Graph_waiting_tips + ../tbb_userguide/Flow-Graph-waiting-tips ../tbb_userguide/Flow_Graph_making_edges_tips ../tbb_userguide/Flow_Graph_nested_parallelism_tips ../tbb_userguide/Flow_Graph_resource_tips - ../tbb_userguide/Flow_Graph_exception_tips \ No newline at end of file + ../tbb_userguide/Flow-Graph-exception-tips \ No newline at end of file diff --git a/third-party/tbb/doc/main/tbb_userguide/Graph_Object.rst b/third-party/tbb/doc/main/tbb_userguide/Graph_Object.rst index 8993e1a1..06fd5331 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Graph_Object.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Graph_Object.rst @@ -25,3 +25,10 @@ or edges, and therefore no tasks are spawned. graph g; g.wait_for_all(); +The graph object does not own the nodes associated with it. You need to make sure that the graph object's lifetime is longer than the lifetimes of all nodes added to the graph and any activity associated with the graph. + +.. tip:: Call ``wait_for_all`` on a graph object before destroying it to make sure all activities are complete. + + Even when using smart pointers, be aware of the order of destruction for nodes and the graph to make sure that nodes are not deleted before the graph. + + diff --git a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst index f1fafaea..aa8658ac 100644 --- a/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst +++ b/third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst @@ -140,7 +140,8 @@ to set the stack size for oneTBB worker threads: Terminating oneTBB scheduler --------------------------------------- -:ref:`task_scheduler_handle_reference` + +`task_scheduler_handle `_ allows waiting for oneTBB worker threads completion: .. 
code:: cpp diff --git a/third-party/tbb/doc/main/tbb_userguide/attach_flow_graph_to_arena.rst b/third-party/tbb/doc/main/tbb_userguide/attach_flow_graph_to_arena.rst index d887d4b2..c31387f9 100644 --- a/third-party/tbb/doc/main/tbb_userguide/attach_flow_graph_to_arena.rst +++ b/third-party/tbb/doc/main/tbb_userguide/attach_flow_graph_to_arena.rst @@ -1,7 +1,7 @@ .. _attach_flow_graph_to_arena: Attach Flow Graph to an Arbitrary Task Arena -====================== +============================================= |short_name| ``task_arena`` interface provides mechanisms to guide tasks diff --git a/third-party/tbb/doc/main/tbb_userguide/design_patterns/Lazy_Initialization.rst b/third-party/tbb/doc/main/tbb_userguide/design_patterns/Lazy_Initialization.rst index ccba0d24..6812e6c5 100644 --- a/third-party/tbb/doc/main/tbb_userguide/design_patterns/Lazy_Initialization.rst +++ b/third-party/tbb/doc/main/tbb_userguide/design_patterns/Lazy_Initialization.rst @@ -1,7 +1,7 @@ .. _Lazy_Initialization: Lazy Initialization -================== +==================== .. container:: section diff --git a/third-party/tbb/doc/main/tbb_userguide/std_invoke.rst b/third-party/tbb/doc/main/tbb_userguide/std_invoke.rst new file mode 100644 index 00000000..17ee7add --- /dev/null +++ b/third-party/tbb/doc/main/tbb_userguide/std_invoke.rst @@ -0,0 +1,217 @@ +.. _std_invoke: + +Invoke a Callable Object +========================== + +Starting from C++17, the requirements for callable objects passed to algorithms or Flow Graph nodes are relaxed. It allows using additional types of bodies. +Previously, the body of the algorithm or Flow Graph node needed to be a Function Object (see `C++ Standard Function Object `_) and provide an +``operator()`` that accepts input parameters. 
+ +Now the body needs to meet the more relaxed requirements of being Callable (see `C++ Standard Callable `_) that covers three types of objects: + +* **Function Objects that provide operator()(arg1, arg2, ...)**, which accept the input parameters +* **Pointers to member functions** that you can use as the body of the algorithm or the Flow Graph node +* **Pointers to member objects** work as the body of the algorithm or parallel construct + +You can use it not only for a Flow Graph but also for algorithms. See the example below: + +.. code:: + + // The class models oneTBB Range + class StrideRange { + public: + StrideRange(int* s, std::size_t sz, std::size_t str) + : start(s), size(sz), stride(str) {} + + // A copy constructor + StrideRange(const StrideRange&) = default; + + // A splitting constructor + StrideRange(StrideRange& other, oneapi::tbb::split) + : start(other.start), size(other.size / 2), stride(other.stride) + { + other.size -= size; + other.start += size; + } + + ~StrideRange() = default; + + // Indicate if the range is empty + bool empty() const { + return size == 0; + } + + // Indicate if the range can be divided + bool is_divisible() const { + return size >= stride; + } + + void iterate() const { + for (std::size_t i = 0; i < size; i += stride) { + // Perform an action for each element of the range; + // implement the code based on your requirements + } + } + + private: + int* start; + std::size_t size; + std::size_t stride; + }; + +Where: + +* The ``StrideRange`` class models a oneTBB Range that is iterated with the stride specified at its construction. +* The ``stride`` value is stored in a private field within the range. Therefore, the class provides the member function ``iterate() const`` that implements a loop with the specified stride. + +``range.iterate()`` +******************* + +Before C++17, to utilize a range in a parallel algorithm, such as ``parallel_for``, it was required to provide a ``Function Object`` as the algorithm's body.
This Function Object defined the operations to be executed on each iteration of the range: + +.. code:: + + int main() { + std::size_t array_size = 1000; + + int* array_to_iterate = new int[array_size]; + + StrideRange range(array_to_iterate, array_size, /* stride = */ 2); + + // Define a lambda function as the body of the parallel_for loop + auto pfor_body = [] (const StrideRange& range) { + range.iterate(); + }; + + // Perform parallel iteration + oneapi::tbb::parallel_for(range, pfor_body); + + delete[] array_to_iterate; + } + +An additional lambda function ``pfor_body`` was also required. This lambda function invoked the ``range.iterate()`` function. + +Now with C++17, you can directly utilize a pointer to ``range.iterate()`` as the body of the algorithm: + +.. code:: + + int main() { + std::size_t array_size = 1000; + + int* array_to_iterate = new int[array_size]; + + // Performs the iteration over the array elements with the specified stride + StrideRange range(array_to_iterate, array_size, /* stride = */ 2); + + // Parallelize the iteration over the range object + oneapi::tbb::parallel_for(range, &StrideRange::iterate); + + delete[] array_to_iterate; + } + +``std::invoke`` +**************** + +``std::invoke`` is a function template that provides a syntax for invoking different types of callable objects with a set of arguments. + +oneTBB implementation uses the C++ standard function ``std::invoke(&StrideRange::iterate, range)`` to execute the body. It is the equivalent of ``range.iterate()``. +Therefore, it allows you to invoke a callable object, such as a function object, with the provided arguments. + +.. tip:: Refer to `C++ Standard `_ to learn more about ``std::invoke``. + +Example +^^^^^^^^ + +Consider a specific scenario with ``function_node`` within a Flow Graph. + +In the example below, a ``function_node`` takes an object as an input to read a member object of that input and pass it to the next node in the graph: + +..
code:: + + struct Object { + int number; + }; + + int main() { + using namespace oneapi::tbb::flow; + + // Lambda function to read the member object of the input Object + auto number_reader = [] (const Object& obj) { + return obj.number; + }; + + // Lambda function to process the received integer + auto number_processor = [] (int i) { /* processing integer */ }; + + graph g; + + // Function node that takes an Object as input and produces an integer + function_node func1(g, unlimited, number_reader); + + // Function node that takes an integer as input and processes it + function_node func2(g, unlimited, number_processor); + + // Connect the function nodes + make_edge(func1, func2); + + // Provide produced input to the graph + func1.try_put(Object{1}); + + // Wait for the graph to complete + g.wait_for_all(); + } + + +Before C++17, the ``function_node`` in the Flow Graph required the body to be a Function Object. A lambda function was required to extract the number from the Object. + +With C++17, you can use ``std::invoke`` with a pointer to the member number directly as the body. + +You can update the previous example as follows: + +.. 
code:: + + struct Object { + int number; + }; + + int main() { + using namespace oneapi::tbb::flow; + + // The processing logic for the received integer + auto number_processor = [] (int i) { /* processing integer */ }; + + // Create a graph object g to hold the flow graph + graph g; + + // Use a pointer to the number data member of the Object struct as the body + function_node func1(g, unlimited, &Object::number); + + // Use the number_processor lambda function as the body + function_node func2(g, unlimited, number_processor); + + // Connect the function nodes + make_edge(func1, func2); + + // Provide produced input to the graph + func1.try_put(Object{1}); + + // Wait for the graph to complete + g.wait_for_all(); + } + +Find More +********* + +The following APIs support Callable objects as bodies: + +* `parallel_for `_ +* `parallel_reduce `_ +* `parallel_deterministic_reduce `_ +* `parallel_for_each `_ +* `parallel_scan `_ +* `parallel_pipeline `_ +* `function_node `_ +* `multifunction_node `_ +* `async_node `_ +* `sequencer_node `_ +* `join_node with key_matching policy `_ diff --git a/third-party/tbb/doc/main/tbb_userguide/title.rst b/third-party/tbb/doc/main/tbb_userguide/title.rst index b51c3294..c073acfc 100644 --- a/third-party/tbb/doc/main/tbb_userguide/title.rst +++ b/third-party/tbb/doc/main/tbb_userguide/title.rst @@ -23,6 +23,7 @@ ../tbb_userguide/design_patterns/Design_Patterns ../tbb_userguide/Migration_Guide ../tbb_userguide/Constraints + ../tbb_userguide/std_invoke ../tbb_userguide/appendix_A ../tbb_userguide/appendix_B ../tbb_userguide/References diff --git a/third-party/tbb/include/oneapi/tbb/concurrent_queue.h b/third-party/tbb/include/oneapi/tbb/concurrent_queue.h index 24659715..1e7ff50b 100644 --- a/third-party/tbb/include/oneapi/tbb/concurrent_queue.h +++ b/third-party/tbb/include/oneapi/tbb/concurrent_queue.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache
License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -91,6 +91,10 @@ class concurrent_queue { push(*begin); } + concurrent_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : + concurrent_queue(init.begin(), init.end(), alloc) + {} + concurrent_queue(const concurrent_queue& src, const allocator_type& a) : concurrent_queue(a) { @@ -132,6 +136,53 @@ class concurrent_queue { r1::cache_aligned_deallocate(my_queue_representation); } + concurrent_queue& operator=( const concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_queue& operator=( concurrent_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if (my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator 
== other.my_allocator, "unequal allocators"); + internal_swap(other); + } + // Enqueue an item at tail of queue. void push(const T& value) { internal_push(value); @@ -215,6 +266,20 @@ class concurrent_queue { queue_allocator_type my_allocator; queue_representation_type* my_queue_representation; + + friend void swap( concurrent_queue& lhs, concurrent_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return lhs.unsafe_size() == rhs.unsafe_size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_queue& lhs, const concurrent_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT }; // class concurrent_queue #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT @@ -304,6 +369,10 @@ class concurrent_bounded_queue { push(*begin); } + concurrent_bounded_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ): + concurrent_bounded_queue(init.begin(), init.end(), alloc) + {} + concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) : concurrent_bounded_queue(a) { @@ -346,6 +415,53 @@ class concurrent_bounded_queue { sizeof(queue_representation_type)); } + concurrent_bounded_queue& operator=( const concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + my_allocator = other.my_allocator; + my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); + } + return *this; + } + + concurrent_bounded_queue& operator=( concurrent_bounded_queue&& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment + if (my_queue_representation != other.my_queue_representation) { + clear(); + if 
(my_allocator == other.my_allocator) { + internal_swap(other); + } else { + my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); + other.clear(); + my_allocator = std::move(other.my_allocator); + } + } + return *this; + } + + concurrent_bounded_queue& operator=( std::initializer_list init ) { + assign(init); + return *this; + } + + template + void assign( InputIterator first, InputIterator last ) { + concurrent_bounded_queue src(first, last); + clear(); + my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); + } + + void assign( std::initializer_list init ) { + assign(init.begin(), init.end()); + } + + void swap ( concurrent_bounded_queue& other ) { + //TODO: implement support for std::allocator_traits::propagate_on_container_swap + __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); + internal_swap(other); + } + // Enqueue an item at tail of queue. void push( const T& value ) { internal_push(value); @@ -544,6 +660,20 @@ class concurrent_bounded_queue { queue_representation_type* my_queue_representation; r1::concurrent_monitor* my_monitors; + + friend void swap( concurrent_bounded_queue& lhs, concurrent_bounded_queue& rhs ) { + lhs.swap(rhs); + } + + friend bool operator==( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return lhs.size() == rhs.size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); + } + +#if !__TBB_CPP20_COMPARISONS_PRESENT + friend bool operator!=( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { + return !(lhs == rhs); + } +#endif // __TBB_CPP20_COMPARISONS_PRESENT }; // class concurrent_bounded_queue #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT diff --git a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h index 4cbf91ad..ade91c33 100644 --- 
a/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -677,13 +677,17 @@ class concurrent_unordered_base { size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); size_type necessary_bucket_count = current_bucket_count; - do { - // TODO: Log2 seems useful here - while (necessary_bucket_count * max_load_factor() < elements_count) { + // max_load_factor() is currently unsafe, so we can assume that my_max_load_factor + // would not be changed during the calculation + // TODO: Log2 seems useful here + while (necessary_bucket_count * max_load_factor() < elements_count) { necessary_bucket_count <<= 1; - } - } while (current_bucket_count >= necessary_bucket_count || - !my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count)); + } + + while (!my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count)) { + if (current_bucket_count >= necessary_bucket_count) + break; + } } // Observers diff --git a/third-party/tbb/include/oneapi/tbb/detail/_config.h b/third-party/tbb/include/oneapi/tbb/detail/_config.h index 1d0b9877..ad9f0f31 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_config.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_config.h @@ -28,6 +28,12 @@ /* Check which standard library we use. 
*/ #include +#ifdef __has_include +#if __has_include() +#include +#endif +#endif + #include "_export.h" #if _MSC_VER @@ -220,7 +226,7 @@ /** Library features presence macros **/ #define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L) -#define __TBB_CPP17_INVOKE_RESULT_PRESENT (__TBB_LANG >= 201703L) +#define __TBB_CPP17_INVOKE_PRESENT (__TBB_LANG >= 201703L) // TODO: Remove the condition(__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT // macro when this feature start working correctly on this compiler. @@ -265,7 +271,7 @@ #if defined(__cpp_impl_three_way_comparison) && defined(__cpp_lib_three_way_comparison) #define __TBB_CPP20_COMPARISONS_PRESENT ((__cpp_impl_three_way_comparison >= 201907L) && (__cpp_lib_three_way_comparison >= 201907L)) #else - #define __TBB_CPP20_COMPARISONS_PRESENT __TBB_CPP20_PRESENT + #define __TBB_CPP20_COMPARISONS_PRESENT 0 #endif #define __TBB_RESUMABLE_TASKS (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__QNXNTO__ && (!__linux__ || __GLIBC__)) diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h index 0b061c2f..8ac11211 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -109,7 +109,7 @@ template class function_body_leaf : public function_body< Input, Output > { public: function_body_leaf( const B &_body ) : body(_body) { } - Output operator()(const Input &i) override { return body(i); } + Output operator()(const Input &i) override { return tbb::detail::invoke(body,i); } B get_body() { return body; } function_body_leaf* clone() override { return new function_body_leaf< Input, Output, B >(body); @@ -184,7 +184,7 @@ class multifunction_body_leaf : public multifunction_body { public: multifunction_body_leaf(const B &_body) : body(_body) { } void operator()(const Input &input, OutputSet &oset) override { - body(input, oset); // body may explicitly put() to one or more of oset. + tbb::detail::invoke(body, input, oset); // body may explicitly put() to one or more of oset. } void* get_body_ptr() override { return &body; } multifunction_body_leaf* clone() override { @@ -218,7 +218,7 @@ template class type_to_key_function_body_leaf : public type_to_key_function_body { public: type_to_key_function_body_leaf( const B &_body ) : body(_body) { } - Output operator()(const Input &i) override { return body(i); } + Output operator()(const Input &i) override { return tbb::detail::invoke(body, i); } type_to_key_function_body_leaf* clone() override { return new type_to_key_function_body_leaf< Input, Output, B>(body); } @@ -231,7 +231,7 @@ class type_to_key_function_body_leaf : public type_to_key_funct public: type_to_key_function_body_leaf( const B &_body ) : body(_body) { } const Output& operator()(const Input &i) override { - return body(i); + return tbb::detail::invoke(body, i); } type_to_key_function_body_leaf* clone() override { return new type_to_key_function_body_leaf< Input, Output&, B>(body); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h index 9e2f9adf..b79c53dd 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h 
+++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -361,7 +361,7 @@ class function_input : public function_input_base function_input( graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority ) - : base_type(g, max_concurrency, a_priority, noexcept(body(input_type()))) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type()))) , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { } @@ -392,7 +392,7 @@ class function_input : public function_input_base multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority ) - : base_type(g, max_concurrency, a_priority, noexcept(body(input_type(), my_output_ports))) + : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type(), my_output_ports))) , my_body( new multifunction_body_leaf(body) ) , my_init_body( new multifunction_body_leaf(body) ) , my_output_ports(init_output_ports::call(g, my_output_ports)){ diff --git a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h index 6426da55..0d9de176 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -130,7 +130,7 @@ class hash_buffer : public HashCompare { const value_type &v) { size_t l_mask = p_sz-1; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - size_t h = this->hash((*my_key)(v)) & l_mask; + size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; __TBB_ASSERT(p_free_list, "Error: free list not set up."); element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); (void) new(&(my_elem->first)) value_type(v); @@ -200,7 +200,7 @@ class hash_buffer : public HashCompare { bool insert_with_key(const value_type &v) { pointer_type p = nullptr; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(find_ref_with_key((*my_key)(v), p)) { + if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { p->~value_type(); (void) new(p) value_type(v); // copy-construct into the space return false; @@ -217,7 +217,7 @@ class hash_buffer : public HashCompare { for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { pointer_type pv = reinterpret_cast(&(p->first)); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(this->equal((*my_key)(*pv), k)) { + if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { v = pv; return true; } @@ -241,7 +241,7 @@ class hash_buffer : public HashCompare { for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { value_type *vp = reinterpret_cast(&(p->first)); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(this->equal((*my_key)(*vp), k)) { + if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { vp->~value_type(); if(prev) prev->second = p->second; else pointer_array[h] = (element_type *)(p->second); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_machine.h b/third-party/tbb/include/oneapi/tbb/detail/_machine.h index 763bc65b..7a4a1e31 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_machine.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_machine.h @@ -1,5 
+1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -110,9 +110,9 @@ static inline void machine_pause(int32_t delay) { #if defined(__GNUC__) || defined(__clang__) namespace gnu_builtins { - inline uintptr_t clz(unsigned int x) { return __builtin_clz(x); } - inline uintptr_t clz(unsigned long int x) { return __builtin_clzl(x); } - inline uintptr_t clz(unsigned long long int x) { return __builtin_clzll(x); } + inline uintptr_t clz(unsigned int x) { return static_cast(__builtin_clz(x)); } + inline uintptr_t clz(unsigned long int x) { return static_cast(__builtin_clzl(x)); } + inline uintptr_t clz(unsigned long long int x) { return static_cast(__builtin_clzll(x)); } } #elif defined(_MSC_VER) #pragma intrinsic(__TBB_W(_BitScanReverse)) @@ -221,8 +221,8 @@ T machine_reverse_bits(T src) { return builtin_bitreverse(fixed_width_cast(src)); #else /* Generic */ T dst; - unsigned char *original = (unsigned char *) &src; - unsigned char *reversed = (unsigned char *) &dst; + unsigned char *original = reinterpret_cast(&src); + unsigned char *reversed = reinterpret_cast(&dst); for ( int i = sizeof(T) - 1; i >= 0; i-- ) { reversed[i] = reverse_byte( original[sizeof(T) - i - 1] ); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h index 149b7f46..46e7b95d 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -233,7 +233,7 @@ class concrete_filter: public base_filter { void* operator()(void* input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); - output_pointer temp_output = output_helper::create_token(my_body(std::move(input_helper::token(temp_input)))); + output_pointer temp_output = output_helper::create_token(tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input)))); input_helper::destroy_token(temp_input); return output_helper::cast_to_void_ptr(temp_output); } @@ -281,7 +281,7 @@ class concrete_filter: public base_filter { void* operator()(void* input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); - my_body(std::move(input_helper::token(temp_input))); + tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input))); input_helper::destroy_token(temp_input); return nullptr; } @@ -441,11 +441,11 @@ class filter_node_leaf: public filter_node { }; -template ::input_type> +template ::input_type> using filter_input = typename std::conditional::value, void, Input>::type; template -using filter_output = typename body_types::output_type; +using filter_output = typename filter_body_types::output_type; } // namespace d1 } // namespace detail diff --git a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h index 55f94dce..c1a6e8ae 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,18 +26,18 @@ namespace detail { namespace d1 { template -struct declare_fitler_types { +struct declare_filter_types { using input_type = typename std::remove_const::type>::type; using output_type = typename std::remove_const::type>::type; }; -template struct body_types; +template struct filter_body_types; template -struct body_types : declare_fitler_types {}; +struct filter_body_types : declare_filter_types {}; template -struct body_types : declare_fitler_types {}; +struct filter_body_types : declare_filter_types {}; } // namespace d1 } // namespace detail diff --git a/third-party/tbb/include/oneapi/tbb/detail/_task.h b/third-party/tbb/include/oneapi/tbb/detail/_task.h index 9c1a3550..636aea97 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_task.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_task.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -103,7 +103,7 @@ class wait_context { void add_reference(std::int64_t delta) { call_itt_task_notify(releasing, this); - std::uint64_t r = m_ref_count.fetch_add(delta) + delta; + std::uint64_t r = m_ref_count.fetch_add(static_cast(delta)) + static_cast(delta); __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); diff --git a/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h b/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h index 72211171..34913710 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -59,7 +59,7 @@ template struct select_size_t_constant { // Explicit cast is needed to avoid compiler warnings about possible truncation. // The value of the right size, which is selected by ?:, is anyway not truncated or promoted. - static const std::size_t value = (std::size_t)((sizeof(std::size_t)==sizeof(u)) ? u : ull); + static const std::size_t value = static_cast((sizeof(std::size_t)==sizeof(u)) ? u : ull); }; // TODO: do we really need it? diff --git a/third-party/tbb/include/oneapi/tbb/detail/_utils.h b/third-party/tbb/include/oneapi/tbb/detail/_utils.h index 28fe1a17..1ac2e3ba 100644 --- a/third-party/tbb/include/oneapi/tbb/detail/_utils.h +++ b/third-party/tbb/include/oneapi/tbb/detail/_utils.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include "_config.h" #include "_assert.h" @@ -177,7 +178,7 @@ inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType //! A function to check if passed in pointer is aligned on a specific border template constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) { - return 0 == ((std::uintptr_t)pointer & (alignment - 1)); + return 0 == (reinterpret_cast(pointer) & (alignment - 1)); } #if TBB_USE_ASSERT @@ -340,6 +341,22 @@ concept adaptive_same_as = #endif #endif // __TBB_CPP20_CONCEPTS_PRESENT +template +auto invoke(F&& f, Args&&... 
args) +#if __TBB_CPP17_INVOKE_PRESENT + noexcept(std::is_nothrow_invocable_v) + -> std::invoke_result_t +{ + return std::invoke(std::forward(f), std::forward(args)...); +} +#else // __TBB_CPP17_INVOKE_PRESENT + noexcept(noexcept(std::forward(f)(std::forward(args)...))) + -> decltype(std::forward(f)(std::forward(args)...)) +{ + return std::forward(f)(std::forward(args)...); +} +#endif // __TBB_CPP17_INVOKE_PRESENT + } // namespace d0 namespace d1 { diff --git a/third-party/tbb/include/oneapi/tbb/flow_graph.h b/third-party/tbb/include/oneapi/tbb/flow_graph.h index 60016d93..2df4b140 100644 --- a/third-party/tbb/include/oneapi/tbb/flow_graph.h +++ b/third-party/tbb/include/oneapi/tbb/flow_graph.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -88,8 +88,9 @@ namespace d0 { template concept node_body_return_type = std::same_as || - std::same_as; + std::convertible_to; +// TODO: consider using std::invocable here template concept continue_node_body = std::copy_constructible && requires( Body& body, const tbb::detail::d1::continue_msg& v ) { @@ -98,15 +99,13 @@ concept continue_node_body = std::copy_constructible && template concept function_node_body = std::copy_constructible && - requires( Body& body, const Input& v ) { - { body(v) } -> node_body_return_type; - }; + std::invocable && + node_body_return_type, Output>; template concept join_node_function_object = std::copy_constructible && - requires( FunctionObject& func, const Input& v ) { - { func(v) } -> adaptive_same_as; - }; + std::invocable && + std::convertible_to, Key>; template concept input_node_body = std::copy_constructible && @@ -116,21 +115,16 @@ concept input_node_body = std::copy_constructible && template concept multifunction_node_body = std::copy_constructible && - requires( Body& body, const Input& v, 
OutputPortsType& p ) { - body(v, p); - }; + std::invocable; template concept sequencer = std::copy_constructible && - requires( Sequencer& seq, const Value& value ) { - { seq(value) } -> adaptive_same_as; - }; + std::invocable && + std::convertible_to, std::size_t>; template concept async_node_body = std::copy_constructible && - requires( Body& body, const Input& v, GatewayType& gateway ) { - body(v, gateway); - }; + std::invocable; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT @@ -1892,7 +1886,7 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > size_t my_threshold; size_t my_count; // number of successful puts size_t my_tries; // number of active put attempts - size_t my_future_decrement; // number of active decrement + size_t my_future_decrement; // number of active decrement reservable_predecessor_cache< T, spin_mutex > my_predecessors; spin_mutex my_mutex; broadcast_cache< T > my_successors; @@ -2863,8 +2857,8 @@ class async_body: public async_body_base { async_body(const Body &body, gateway_type *gateway) : base_type(gateway), my_body(body) { } - void operator()( const Input &v, Ports & ) noexcept(noexcept(my_body(v, std::declval()))) { - my_body(v, *this->my_gateway); + void operator()( const Input &v, Ports & ) noexcept(noexcept(tbb::detail::invoke(my_body, v, std::declval()))) { + tbb::detail::invoke(my_body, v, *this->my_gateway); } Body get_body() { return my_body; } diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for.h b/third-party/tbb/include/oneapi/tbb/parallel_for.h index a9e9a1c2..91c7c44c 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_for.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_for.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -37,10 +37,7 @@ namespace detail { inline namespace d0 { template -concept parallel_for_body = std::copy_constructible && - requires( const std::remove_reference_t& body, Range& range ) { - body(range); - }; +concept parallel_for_body = std::copy_constructible && std::invocable&, Range&>; template concept parallel_for_index = std::constructible_from && @@ -52,9 +49,7 @@ concept parallel_for_index = std::constructible_from && }; template -concept parallel_for_function = requires( const std::remove_reference_t& func, Index index ) { - func(index); -}; +concept parallel_for_function = std::invocable&, Index>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT @@ -119,7 +114,7 @@ struct start_for : public task { } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { - my_body( r ); + tbb::detail::invoke(my_body, r); } //! spawn right task, serves as callback for partitioner @@ -208,7 +203,7 @@ class parallel_for_body_wrapper : detail::no_assign { #endif #endif for ( Index i = b; i < e; ++i, k += ms ) { - my_func( k ); + tbb::detail::invoke(my_func, k); } } }; @@ -313,7 +308,7 @@ void parallel_for_impl(Index first, Index last, Index step, const Function& f, P throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument else if (first < last) { // Above "else" avoids "potential divide by zero" warning on some platforms - Index end = (last - first - Index(1)) / step + Index(1); + Index end = Index(last - first - 1ul) / step + Index(1); blocked_range range(static_cast(0), end); parallel_for_body_wrapper body(f, first, step); parallel_for(range, body, partitioner); diff --git a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h index 795e7d03..56dbeb41 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_for_each.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_for_each.h @@ -23,6 +23,7 @@ #include "detail/_task.h" #include 
"detail/_aligned_space.h" #include "detail/_small_object_pool.h" +#include "detail/_utils.h" #include "parallel_for.h" #include "task_group.h" // task_group_context @@ -41,13 +42,8 @@ class feeder; inline namespace d0 { template -concept parallel_for_each_body = requires( const std::remove_reference_t& body, ItemType&& item ) { - body(std::forward(item)); - } || - requires( const std::remove_reference_t& body, ItemType&& item, - tbb::detail::d1::feeder& feeder ) { - body(std::forward(item), feeder); -}; +concept parallel_for_each_body = std::invocable&, ItemType&&> || + std::invocable&, ItemType&&, tbb::detail::d1::feeder&>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT @@ -85,14 +81,14 @@ struct parallel_for_each_operator_selector { public: template static auto call(const Body& body, ItemArg&& item, FeederArg*) - -> decltype(body(std::forward(item)), void()) { + -> decltype(tbb::detail::invoke(body, std::forward(item)), void()) { #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Suppression of Microsoft non-standard extension warnings #pragma warning (push) #pragma warning (disable: 4239) #endif - body(std::forward(item)); + tbb::detail::invoke(body, std::forward(item)); #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning (pop) @@ -101,7 +97,7 @@ struct parallel_for_each_operator_selector { template static auto call(const Body& body, ItemArg&& item, FeederArg* feeder) - -> decltype(body(std::forward(item), *feeder), void()) { + -> decltype(tbb::detail::invoke(body, std::forward(item), *feeder), void()) { #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Suppression of Microsoft non-standard extension warnings #pragma warning (push) @@ -109,7 +105,7 @@ struct parallel_for_each_operator_selector { #endif __TBB_ASSERT(feeder, "Feeder was not created but should be"); - body(std::forward(item), *feeder); + tbb::detail::invoke(body, std::forward(item), *feeder); #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning 
(pop) @@ -424,8 +420,9 @@ using iterator_tag_dispatch = typename >::type; template -using feeder_is_required = tbb::detail::void_t()(std::declval::reference>(), - std::declval&>()))>; +using feeder_is_required = tbb::detail::void_t(), + std::declval::reference>(), + std::declval&>()))>; // Creates feeder object only if the body can accept it template diff --git a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h index a1bc8f3d..401ad004 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_reduce.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_reduce.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -41,17 +41,18 @@ concept parallel_reduce_body = splittable && }; template -concept parallel_reduce_function = requires( const std::remove_reference_t& func, - const Range& range, - const Value& value ) { - { func(range, value) } -> std::convertible_to; -}; +concept parallel_reduce_function = std::invocable&, + const Range&, const Value&> && + std::convertible_to&, + const Range&, const Value&>, + Value>; template -concept parallel_reduce_combine = requires( const std::remove_reference_t& combine, - const Value& lhs, const Value& rhs ) { - { combine(lhs, rhs) } -> std::convertible_to; -}; +concept parallel_reduce_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT @@ -150,7 +151,7 @@ struct start_reduce : public task { } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { - (*my_body)(r); + tbb::detail::invoke(*my_body, r); } //! 
spawn right task, serves as callback for partitioner @@ -207,7 +208,7 @@ task* start_reduce::execute(execution_data& ed) { __TBB_ASSERT(my_parent, nullptr); if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) { tree_node_type* parent_ptr = static_cast(my_parent); - my_body = (Body*) new( parent_ptr->zombie_space.begin() ) Body(*my_body, split()); + my_body = static_cast(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split())); parent_ptr->has_right_zombie = true; } __TBB_ASSERT(my_body != nullptr, "Incorrect body value"); @@ -296,7 +297,7 @@ struct start_deterministic_reduce : public task { } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { - my_body( r ); + tbb::detail::invoke(my_body, r); } //! Spawn right task, serves as callback for partitioner void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { @@ -389,10 +390,11 @@ class lambda_reduce_body { , my_value(other.my_identity_element) { } void operator()(Range& range) { - my_value = my_real_body(range, const_cast(my_value)); + my_value = tbb::detail::invoke(my_real_body, range, const_cast(my_value)); } void join( lambda_reduce_body& rhs ) { - my_value = my_reduction(const_cast(my_value), const_cast(rhs.my_value)); + my_value = tbb::detail::invoke(my_reduction, const_cast(my_value), + const_cast(rhs.my_value)); } Value result() const { return my_value; diff --git a/third-party/tbb/include/oneapi/tbb/parallel_scan.h b/third-party/tbb/include/oneapi/tbb/parallel_scan.h index 51adcbd7..6d2a4d64 100644 --- a/third-party/tbb/include/oneapi/tbb/parallel_scan.h +++ b/third-party/tbb/include/oneapi/tbb/parallel_scan.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -64,16 +64,18 @@ concept parallel_scan_body = splittable && }; template -concept parallel_scan_function = requires( const std::remove_reference_t& func, - const Range& range, const Value& value ) { - { func(range, value, true) } -> std::convertible_to; -}; +concept parallel_scan_function = std::invocable&, + const Range&, const Value&, bool> && + std::convertible_to&, + const Range&, const Value&, bool>, + Value>; template -concept parallel_scan_combine = requires( const std::remove_reference_t& combine, - const Value& lhs, const Value& rhs ) { - { combine(lhs, rhs) } -> std::convertible_to; -}; +concept parallel_scan_combine = std::invocable&, + const Value&, const Value&> && + std::convertible_to&, + const Value&, const Value&>, + Value>; } // namespace d0 namespace d1 { @@ -519,11 +521,11 @@ class lambda_scan_body { template void operator()( const Range& r, Tag tag ) { - m_sum_slot = m_scan(r, m_sum_slot, tag); + m_sum_slot = tbb::detail::invoke(m_scan, r, m_sum_slot, tag); } void reverse_join( lambda_scan_body& a ) { - m_sum_slot = m_reverse_join(a.m_sum_slot, m_sum_slot); + m_sum_slot = tbb::detail::invoke(m_reverse_join, a.m_sum_slot, m_sum_slot); } void assign( lambda_scan_body& b ) { @@ -626,4 +628,3 @@ inline namespace v1 { } // namespace tbb #endif /* __TBB_parallel_scan_H */ - diff --git a/third-party/tbb/include/oneapi/tbb/partitioner.h b/third-party/tbb/include/oneapi/tbb/partitioner.h index cbbf5e98..98de0d42 100644 --- a/third-party/tbb/include/oneapi/tbb/partitioner.h +++ b/third-party/tbb/include/oneapi/tbb/partitioner.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -70,7 +70,7 @@ class affinity_partitioner_base; inline std::size_t get_initial_auto_partitioner_divisor() { const std::size_t factor = 4; - return factor * max_concurrency(); + return factor * static_cast(max_concurrency()); } //! Defines entry point for affinity partitioner into oneTBB run-time library. @@ -90,7 +90,7 @@ class affinity_partitioner_base: no_copy { /** Retains values if resulting size is the same. */ void resize(unsigned factor) { // Check factor to avoid asking for number of workers while there might be no arena. - unsigned max_threads_in_arena = max_concurrency(); + unsigned max_threads_in_arena = static_cast(max_concurrency()); std::size_t new_size = factor ? factor * max_threads_in_arena : 0; if (new_size != my_size) { if (my_array) { diff --git a/third-party/tbb/include/oneapi/tbb/profiling.h b/third-party/tbb/include/oneapi/tbb/profiling.h index 3bd2a426..412b5a35 100644 --- a/third-party/tbb/include/oneapi/tbb/profiling.h +++ b/third-party/tbb/include/oneapi/tbb/profiling.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -122,14 +122,14 @@ namespace d1 { // Distinguish notifications on task for reducing overheads #if TBB_USE_PROFILING_TOOLS == 2 inline void call_itt_task_notify(d1::notify_type t, void *ptr) { - r1::call_itt_notify((int)t, ptr); + r1::call_itt_notify(static_cast(t), ptr); } #else inline void call_itt_task_notify(d1::notify_type, void *) {} #endif // TBB_USE_PROFILING_TOOLS inline void call_itt_notify(d1::notify_type t, void *ptr) { - r1::call_itt_notify((int)t, ptr); + r1::call_itt_notify(static_cast(t), ptr); } #if (_WIN32||_WIN64) && !__MINGW32__ diff --git a/third-party/tbb/include/oneapi/tbb/version.h b/third-party/tbb/include/oneapi/tbb/version.h index 1396b85b..965af129 100644 --- a/third-party/tbb/include/oneapi/tbb/version.h +++ b/third-party/tbb/include/oneapi/tbb/version.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -29,7 +29,7 @@ // Product version #define TBB_VERSION_MAJOR 2021 // Update version -#define TBB_VERSION_MINOR 9 +#define TBB_VERSION_MINOR 10 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string @@ -40,7 +40,7 @@ // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION "1.0" // Full interface version -#define TBB_INTERFACE_VERSION 12090 +#define TBB_INTERFACE_VERSION 12100 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version diff --git a/third-party/tbb/integration/pkg-config/tbb.pc.in b/third-party/tbb/integration/pkg-config/tbb.pc.in index d87fcf56..34ea3bea 100644 --- a/third-party/tbb/integration/pkg-config/tbb.pc.in +++ b/third-party/tbb/integration/pkg-config/tbb.pc.in @@ -1,4 +1,4 @@ -# Copyright (c) 2021 Intel Corporation +# Copyright (c) 2021-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,5 +20,5 @@ Name: oneAPI Threading Building Blocks (oneTBB) Description: C++ library for parallel programming on multi-core processors. URL: https://github.com/oneapi-src/oneTBB Version: @TBB_VERSION@ -Libs: -L${libdir} -l@_tbb_pc_lib_name@ +Libs: -L${libdir} @_tbb_pc_extra_libdir@ -l@_tbb_pc_lib_name@ Cflags: -I${includedir} diff --git a/third-party/tbb/integration/windows/env/vars.bat b/third-party/tbb/integration/windows/env/vars.bat index 3618ac4e..78d99301 100644 --- a/third-party/tbb/integration/windows/env/vars.bat +++ b/third-party/tbb/integration/windows/env/vars.bat @@ -1,6 +1,6 @@ @echo off REM -REM Copyright (c) 2005-2021 Intel Corporation +REM Copyright (c) 2005-2023 Intel Corporation REM REM Licensed under the Apache License, Version 2.0 (the "License"); REM you may not use this file except in compliance with the License. 
@@ -22,9 +22,8 @@ REM ia32 : Set up for IA-32 architecture REM intel64 : Set up for Intel(R) 64 architecture REM if ^ is not set Intel(R) 64 architecture will be used REM ^ should be one of the following -REM vs2015 : Set to use with Microsoft Visual Studio 2015 runtime DLLs -REM vs2017 : Set to use with Microsoft Visual Studio 2017 runtime DLLs REM vs2019 : Set to use with Microsoft Visual Studio 2019 runtime DLLs +REM vs2022 : Set to use with Microsoft Visual Studio 2022 runtime DLLs REM all : Set to use TBB statically linked with Microsoft Visual C++ runtime REM if ^ is not set TBB statically linked with Microsoft Visual C++ runtime will be used. @@ -41,9 +40,8 @@ set TBB_TARGET_VS=vc_mt if /i "%1"=="" goto SetEnv if /i "%1"=="ia32" (set TBB_TARGET_ARCH=ia32) & shift & goto ParseArgs if /i "%1"=="intel64" (set TBB_TARGET_ARCH=intel64) & shift & goto ParseArgs -if /i "%1"=="vs2015" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs -if /i "%1"=="vs2017" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="vs2019" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs +if /i "%1"=="vs2022" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="all" (set TBB_TARGET_VS=vc_mt) & shift & goto ParseArgs :SetEnv diff --git a/third-party/tbb/python/TBB.py b/third-party/tbb/python/TBB.py index fbee4c24..3144560e 100644 --- a/third-party/tbb/python/TBB.py +++ b/third-party/tbb/python/TBB.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2021 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/third-party/tbb/python/setup.py b/third-party/tbb/python/setup.py index f084e705..7c050188 100644 --- a/third-party/tbb/python/setup.py +++ b/third-party/tbb/python/setup.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2021 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/third-party/tbb/python/tbb/__init__.py b/third-party/tbb/python/tbb/__init__.py index 318faa6b..f09c93aa 100644 --- a/third-party/tbb/python/tbb/__init__.py +++ b/third-party/tbb/python/tbb/__init__.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2022 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/third-party/tbb/python/tbb/__main__.py b/third-party/tbb/python/tbb/__main__.py index 2c071017..03c2efce 100644 --- a/third-party/tbb/python/tbb/__main__.py +++ b/third-party/tbb/python/tbb/__main__.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2021 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/third-party/tbb/python/tbb/pool.py b/third-party/tbb/python/tbb/pool.py index 75c9016c..a372324d 100644 --- a/third-party/tbb/python/tbb/pool.py +++ b/third-party/tbb/python/tbb/pool.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2021 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/third-party/tbb/python/tbb/test.py b/third-party/tbb/python/tbb/test.py index 7630d24f..690bfd4c 100644 --- a/third-party/tbb/python/tbb/test.py +++ b/third-party/tbb/python/tbb/test.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2016-2022 Intel Corporation +# Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/third-party/tbb/src/tbb/CMakeLists.txt b/third-party/tbb/src/tbb/CMakeLists.txt index 996bf6c1..6aade7db 100644 --- a/third-party/tbb/src/tbb/CMakeLists.txt +++ b/third-party/tbb/src/tbb/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ target_compile_definitions(tbb $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) -if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64)" OR +if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64|riscv)" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64" OR WINDOWS_STORE OR TBB_WINDOWS_DRIVER)) diff --git a/third-party/tbb/src/tbb/dynamic_link.cpp b/third-party/tbb/src/tbb/dynamic_link.cpp index 330415a8..2d88f8bc 100644 --- a/third-party/tbb/src/tbb/dynamic_link.cpp +++ b/third-party/tbb/src/tbb/dynamic_link.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -15,6 +15,7 @@ */ #include "dynamic_link.h" +#include "environment.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/detail/_utils.h" @@ -414,7 +415,9 @@ namespace r1 { if (local_binding) { flags = flags | RTLD_LOCAL; #if (__linux__ && __GLIBC__) && !__TBB_USE_SANITIZERS - flags = flags | RTLD_DEEPBIND; + if( !GetBoolEnvironmentVariable("TBB_ENABLE_SANITIZERS") ) { + flags = flags | RTLD_DEEPBIND; + } #endif } else { flags = flags | RTLD_GLOBAL; diff --git a/third-party/tbb/src/tbb/tbb.rc b/third-party/tbb/src/tbb/tbb.rc index be3e3011..6c8b99fc 100644 --- a/third-party/tbb/src/tbb/tbb.rc +++ b/third-party/tbb/src/tbb/tbb.rc @@ -31,7 +31,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL // // Version // -#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2022 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbb12.dll\0" diff --git a/third-party/tbb/src/tbbbind/CMakeLists.txt b/third-party/tbb/src/tbbbind/CMakeLists.txt index 3e850827..24cd3e5d 100644 --- a/third-party/tbb/src/tbbbind/CMakeLists.txt +++ b/third-party/tbb/src/tbbbind/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -24,13 +24,11 @@ function(tbbbind_build TBBBIND_NAME REQUIRED_HWLOC_TARGET) return() endif() add_library(${TBBBIND_NAME} tbb_bind.cpp) - + if (WIN32) - if(${TBBBIND_NAME} STREQUAL tbbbind) #adding resource info for default tbbbind - target_sources(${TBBBIND_NAME} PRIVATE tbb_bind.rc) - endif() + target_sources(${TBBBIND_NAME} PRIVATE tbb_bind.rc) endif() - + add_library(TBB::${TBBBIND_NAME} ALIAS ${TBBBIND_NAME}) target_compile_definitions(${TBBBIND_NAME} diff --git a/third-party/tbb/src/tbbbind/tbb_bind.rc b/third-party/tbb/src/tbbbind/tbb_bind.rc index 41b78ee4..bc060353 100644 --- a/third-party/tbb/src/tbbbind/tbb_bind.rc +++ b/third-party/tbb/src/tbbbind/tbb_bind.rc @@ -31,7 +31,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL // // Version // -#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2022 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbbind.dll\0" diff --git a/third-party/tbb/src/tbbmalloc/CMakeLists.txt b/third-party/tbb/src/tbbmalloc/CMakeLists.txt index 8c37a4f5..0386daa3 100644 --- a/third-party/tbb/src/tbbmalloc/CMakeLists.txt +++ b/third-party/tbb/src/tbbmalloc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ target_compile_definitions(tbbmalloc $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) -if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64)" OR +if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64|riscv)" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64" OR WINDOWS_STORE OR TBB_WINDOWS_DRIVER OR @@ -50,15 +50,15 @@ target_include_directories(tbbmalloc # TODO: fix warnings if (MSVC) # signed unsigned mismatch, declaration hides class member - set(TBB_WARNING_SUPPRESS ${TBB_WARNING_SUPPRESS} /wd4267 /wd4244 /wd4245 /wd4018 /wd4458) + set(TBB_WARNING_SUPPRESS ${TBB_WARNING_SUPPRESS} /wd4267 /wd4244 /wd4245 /wd4458) endif() -# TODO: add ${TBB_WARNING_LEVEL} and fix problems target_compile_options(tbbmalloc PRIVATE ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC. ${TBB_MMD_FLAG} ${TBB_DSE_FLAG} + ${TBB_WARNING_LEVEL} ${TBB_WARNING_SUPPRESS} ${TBB_LIB_COMPILE_FLAGS} ${TBBMALLOC_LIB_COMPILE_FLAGS} diff --git a/third-party/tbb/src/tbbmalloc/Customize.h b/third-party/tbb/src/tbbmalloc/Customize.h index 00341e88..fdb61643 100644 --- a/third-party/tbb/src/tbbmalloc/Customize.h +++ b/third-party/tbb/src/tbbmalloc/Customize.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -48,7 +48,7 @@ #endif inline intptr_t BitScanRev(uintptr_t x) { - return !x? -1 : tbb::detail::log2(x); + return x == 0 ? 
-1 : static_cast(tbb::detail::log2(x)); } template diff --git a/third-party/tbb/src/tbbmalloc/backend.cpp b/third-party/tbb/src/tbbmalloc/backend.cpp index 54a269f6..c240e030 100644 --- a/third-party/tbb/src/tbbmalloc/backend.cpp +++ b/third-party/tbb/src/tbbmalloc/backend.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -363,6 +363,7 @@ inline void CoalRequestQ::blockWasProcessed() { bkndSync->binsModified(); int prev = inFlyBlocks.fetch_sub(1); + tbb::detail::suppress_unused_warning(prev); MALLOC_ASSERT(prev > 0, ASSERT_TEXT); } @@ -748,7 +749,7 @@ int Backend::IndexedBins::getMinNonemptyBin(unsigned startBin) const FreeBlock *Backend::IndexedBins::findBlock(int nativeBin, BackendSync *sync, size_t size, bool needAlignedBlock, bool alignedBin, int *numOfLockedBins) { - for (int i=getMinNonemptyBin(nativeBin); isoftCachesCleanup())) { + bool retScanCoalescQ = scanCoalescQ(/*forceCoalescQDrop=*/true); + bool retSoftCachesCleanup = extMemPool->softCachesCleanup(); + if (!(retScanCoalescQ || retSoftCachesCleanup)) { // bins are not updated, // only remaining possibility is to ask for more memory block = askMemFromOS(totalReqSize, startModifiedCnt, &lockedBinsThreshold, @@ -1410,7 +1413,7 @@ bool Backend::clean() void Backend::IndexedBins::verify() { #if MALLOC_DEBUG - for (int i=0; inext) { uintptr_t mySz = fb->myL.value; MALLOC_ASSERT(mySz>GuardedSize::MAX_SPEC_VAL, ASSERT_TEXT); diff --git a/third-party/tbb/src/tbbmalloc/backref.cpp b/third-party/tbb/src/tbbmalloc/backref.cpp index 88386002..b0ea8306 100644 --- a/third-party/tbb/src/tbbmalloc/backref.cpp +++ b/third-party/tbb/src/tbbmalloc/backref.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you 
may not use this file except in compliance with the License. @@ -42,13 +42,13 @@ struct BackRefBlock : public BlockI { nextForUse(nullptr), bumpPtr((FreeObject*)((uintptr_t)blockToUse + slabSize - sizeof(void*))), freeList(nullptr), nextRawMemBlock(nullptr), allocatedCount(0), myNum(num), addedToForUse(false) { - memset(&blockMutex, 0, sizeof(MallocMutex)); + memset(static_cast(&blockMutex), 0, sizeof(MallocMutex)); MALLOC_ASSERT(!(num >> CHAR_BIT*sizeof(BackRefIdx::main_t)), "index in BackRefMain must fit to BackRefIdx::main"); } // clean all but header - void zeroSet() { memset(this+1, 0, BackRefBlock::bytes-sizeof(BackRefBlock)); } + void zeroSet() { memset(static_cast(this+1), 0, BackRefBlock::bytes-sizeof(BackRefBlock)); } static const int bytes = slabSize; }; @@ -106,7 +106,7 @@ bool initBackRefMain(Backend *backend) main->allRawMemBlocks = nullptr; main->rawMemUsed = rawMemUsed; main->lastUsed = -1; - memset(&main->requestNewSpaceMutex, 0, sizeof(MallocMutex)); + memset(static_cast(&main->requestNewSpaceMutex), 0, sizeof(MallocMutex)); for (int i=0; izeroSet(); diff --git a/third-party/tbb/src/tbbmalloc/frontend.cpp b/third-party/tbb/src/tbbmalloc/frontend.cpp index e32c240c..aa358313 100644 --- a/third-party/tbb/src/tbbmalloc/frontend.cpp +++ b/third-party/tbb/src/tbbmalloc/frontend.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -601,7 +601,9 @@ class TLSData : public TLSRemote { // should be called only for the current thread bool released = cleanBins ? 
cleanupBlockBins() : false; // both cleanups to be called, and the order is not important - return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup(); + bool lloc_cleaned = lloc.externalCleanup(&memPool->extMemPool); + bool free_slab_blocks_cleaned = freeSlabBlocks.externalCleanup(); + return released || lloc_cleaned || free_slab_blocks_cleaned; } bool cleanupBlockBins(); void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched @@ -802,7 +804,8 @@ static inline unsigned int highestBitPos(unsigned int n) unsigned int getSmallObjectIndex(unsigned int size) { unsigned int result = (size-1)>>3; - if (sizeof(void*)==8) { + constexpr bool is_64bit = (8 == sizeof(void*)); + if (is_64bit) { // For 64-bit malloc, 16 byte alignment is needed except for bin 0. if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes } @@ -927,7 +930,7 @@ static MallocMutex publicFreeListLock; // lock for changes of publicFreeList LifoList::LifoList( ) : top(nullptr) { // MallocMutex assumes zero initialization - memset(&lock, 0, sizeof(MallocMutex)); + memset(static_cast(&lock), 0, sizeof(MallocMutex)); } void LifoList::push(Block *block) @@ -1263,7 +1266,7 @@ Block* Bin::getPrivatizedFreeListBlock() Block* block; MALLOC_ASSERT( this, ASSERT_TEXT ); // if this method is called, active block usage must be unsuccessful - MALLOC_ASSERT( !activeBlk && !mailbox.load(std::memory_order_relaxed) || activeBlk && activeBlk->isFull, ASSERT_TEXT ); + MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT ); // the counter should be changed STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList); if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox @@ -1863,7 +1866,6 @@ FreeObject *StartupBlock::allocate(size_t size) { FreeObject *result; StartupBlock *newBlock = nullptr; - bool newBlockUnused = 
false; /* Objects must be aligned on their natural bounds, and objects bigger than word on word's bound. */ @@ -2718,7 +2720,7 @@ rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy, *pool = nullptr; return NO_MEMORY; } - memset(memPool, 0, sizeof(rml::internal::MemoryPool)); + memset(static_cast(memPool), 0, sizeof(rml::internal::MemoryPool)); if (!memPool->init(pool_id, policy)) { internalFree(memPool); *pool = nullptr; diff --git a/third-party/tbb/src/tbbmalloc/large_objects.cpp b/third-party/tbb/src/tbbmalloc/large_objects.cpp index 59e1177e..8b470ab5 100644 --- a/third-party/tbb/src/tbbmalloc/large_objects.cpp +++ b/third-party/tbb/src/tbbmalloc/large_objects.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -124,7 +124,7 @@ class CacheBinFunctor { public: OperationPreprocessor(typename LargeObjectCacheImpl::CacheBin *bin) : bin(bin), lclTime(0), opGet(nullptr), opClean(nullptr), cleanTime(0), - lastGetOpTime(0), updateUsedSize(0), head(nullptr), isCleanAll(false) {} + lastGetOpTime(0), lastGet(0), updateUsedSize(0), head(nullptr), tail(nullptr), putListNum(0), isCleanAll(false) {} void operator()(CacheBinOperation* opList); uintptr_t getTimeRange() const { return -lclTime; } @@ -225,8 +225,8 @@ std::atomic memAllocKB, memHitKB; #if MALLOC_DEBUG inline bool lessThanWithOverflow(intptr_t a, intptr_t b) { - return (a < b && (b - a < UINTPTR_MAX/2)) || - (a > b && (a - b > UINTPTR_MAX/2)); + return (a < b && (b - a < static_cast(UINTPTR_MAX/2))) || + (a > b && (a - b > static_cast(UINTPTR_MAX/2))); } #endif @@ -462,7 +462,7 @@ template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::get(ExtMemoryPool *extMemPool, size_t size, BinBitMask *bitMask, int idx) { LargeMemoryBlock *lmb=nullptr; - OpGet data = {&lmb, size}; + OpGet data = {&lmb, size, 
static_cast(0)}; CacheBinOperation op(data); ExecuteOperation( &op, extMemPool, bitMask, idx ); return lmb; @@ -543,6 +543,7 @@ template LargeMemoryBlock *LargeObjectCacheImpl:: MALLOC_ASSERT( !last.load(std::memory_order_relaxed) || (last.load(std::memory_order_relaxed)->age != 0 && last.load(std::memory_order_relaxed)->age != -1U), ASSERT_TEXT ); MALLOC_ASSERT( (tail==head && num==1) || (tail!=head && num>1), ASSERT_TEXT ); + MALLOC_ASSERT( tail, ASSERT_TEXT ); LargeMemoryBlock *toRelease = nullptr; if (size < hugeSizeThreshold && !lastCleanedAge) { // 1st object of such size was released. @@ -559,7 +560,6 @@ template LargeMemoryBlock *LargeObjectCacheImpl:: } if (num) { // add [head;tail] list to cache - MALLOC_ASSERT( tail, ASSERT_TEXT ); tail->next = first; if (first) first->prev = tail; @@ -611,9 +611,9 @@ template void LargeObjectCacheImpl:: intptr_t threshold = ageThreshold.load(std::memory_order_relaxed); if (threshold) - doCleanup = sinceLastGet > Props::LongWaitFactor * threshold; + doCleanup = sinceLastGet > static_cast(Props::LongWaitFactor * threshold); else if (lastCleanedAge) - doCleanup = sinceLastGet > Props::LongWaitFactor * (lastCleanedAge - lastGet); + doCleanup = sinceLastGet > static_cast(Props::LongWaitFactor * (lastCleanedAge - lastGet)); if (doCleanup) { lastCleanedAge = 0; @@ -804,8 +804,10 @@ bool LargeObjectCache::doCleanup(uintptr_t currTime, bool doThreshDecr) { if (!doThreshDecr) extMemPool->allLocalCaches.markUnused(); - return largeCache.regularCleanup(extMemPool, currTime, doThreshDecr) - | hugeCache.regularCleanup(extMemPool, currTime, doThreshDecr); + + bool large_cache_cleaned = largeCache.regularCleanup(extMemPool, currTime, doThreshDecr); + bool huge_cache_cleaned = hugeCache.regularCleanup(extMemPool, currTime, doThreshDecr); + return large_cache_cleaned || huge_cache_cleaned; } bool LargeObjectCache::decreasingCleanup() @@ -820,7 +822,9 @@ bool LargeObjectCache::regularCleanup() bool LargeObjectCache::cleanAll() { - return 
largeCache.cleanAll(extMemPool) | hugeCache.cleanAll(extMemPool); + bool large_cache_cleaned = largeCache.cleanAll(extMemPool); + bool huge_cache_cleaned = hugeCache.cleanAll(extMemPool); + return large_cache_cleaned || huge_cache_cleaned; } void LargeObjectCache::reset() @@ -847,7 +851,7 @@ template void LargeObjectCacheImpl::updateCacheState(ExtMemoryPool *extMemPool, DecreaseOrIncrease op, size_t size) { int idx = Props::sizeToIdx(size); - MALLOC_ASSERT(idx(numBins), ASSERT_TEXT); bin[idx].updateUsedSize(extMemPool, op==decrease? -size : size, &bitMask, idx); } @@ -1051,4 +1055,3 @@ void *ExtMemoryPool::remap(void *ptr, size_t oldSize, size_t newSize, size_t ali #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif - diff --git a/third-party/tbb/src/tbbmalloc/large_objects.h b/third-party/tbb/src/tbbmalloc/large_objects.h index 556d3fbf..ff205ccd 100644 --- a/third-party/tbb/src/tbbmalloc/large_objects.h +++ b/third-party/tbb/src/tbbmalloc/large_objects.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -174,7 +174,7 @@ class LargeObjectCacheImpl { public: void init() { - memset(this, 0, sizeof(CacheBin)); + memset(static_cast(this), 0, sizeof(CacheBin)); } /* ---------- Cache accessors ---------- */ diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp b/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp index 0a7efdb9..675726ea 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -67,6 +67,7 @@ void init_tbbmalloc() { |GET_MODULE_HANDLE_EX_FLAG_PIN, (LPCTSTR)&scalable_malloc, &lib); MALLOC_ASSERT(lib && ret, "Allocator can't find itself."); + tbb::detail::suppress_unused_warning(ret); SetErrorMode (prev_mode); #endif /* USE_PTHREAD && !__TBB_SOURCE_DIRECTLY_INCLUDED */ } diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc index b2cce517..77e87ff5 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc.rc +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc.rc @@ -31,7 +31,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL // // Version // -#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2022 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc.dll\0" diff --git a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h index 81fa188f..352d41a8 100644 --- a/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h +++ b/third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -339,12 +339,12 @@ class BackRefIdx { // composite index to backreference array // Block header is used during block coalescing // and must be preserved in used blocks. 
class BlockI { -#if __clang__ +#if __clang__ && !__INTEL_COMPILER #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-private-field" #endif intptr_t blockState[2]; -#if __clang__ +#if __clang__ && !__INTEL_COMPILER #pragma clang diagnostic pop // "-Wunused-private-field" #endif }; diff --git a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc index 71277e48..20b3b480 100644 --- a/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc +++ b/third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc @@ -31,7 +31,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL // // Version // -#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR +#define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO @@ -54,7 +54,7 @@ BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" - VALUE "LegalCopyright", "Copyright 2005-2022 Intel Corporation. All Rights Reserved.\0" + VALUE "LegalCopyright", "Copyright 2005-2023 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc_proxy.dll\0" diff --git a/third-party/tbb/test/CMakeLists.txt b/third-party/tbb/test/CMakeLists.txt index b789219f..05466970 100644 --- a/third-party/tbb/test/CMakeLists.txt +++ b/third-party/tbb/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -50,10 +50,10 @@ function(tbb_add_test) -DTEST_NAME=${_tbb_test_TARGET_NAME} -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake) else() - add_test(NAME ${_tbb_test_TARGET_NAME} COMMAND ${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + add_test(NAME ${_tbb_test_TARGET_NAME} COMMAND ${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) # Additional testing scenarios if Intel(R) Software Development Emulator is found if (UNIX AND ";test_mutex;conformance_mutex;" MATCHES ";${_tbb_test_TARGET_NAME};" AND SDE_EXE) - add_test(NAME ${_tbb_test_TARGET_NAME}_SDE COMMAND ${SDE_EXE} -nhm -rtm_mode disabled -- ./${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + add_test(NAME ${_tbb_test_TARGET_NAME}_SDE COMMAND ${SDE_EXE} -nhm -rtm_mode disabled -- ./${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) set_property(TEST ${_tbb_test_TARGET_NAME}_SDE PROPERTY ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} APPEND) endif() endif() @@ -77,7 +77,7 @@ function(tbb_add_test) target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE ${_tbb_test_DEPENDENCIES} Threads::Threads ${TBB_COMMON_LINK_LIBS}) if (COMMAND _tbb_run_memcheck) - _tbb_run_memcheck(${_tbb_test_NAME}) + _tbb_run_memcheck(${_tbb_test_NAME} ${_tbb_test_SUBDIR}) endif() endfunction() @@ -99,7 +99,7 @@ function(tbb_add_c_test) -DTEST_NAME=${_tbb_test_NAME} -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake) else() - add_test(NAME ${_tbb_test_NAME} COMMAND ${_tbb_test_NAME} --force-colors=1 WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + add_test(NAME ${_tbb_test_NAME} COMMAND ${_tbb_test_NAME} --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) endif() set_property(TEST ${_tbb_test_NAME} PROPERTY ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} APPEND) @@ -423,6 +423,7 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_indexer_node 
DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_join_node DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_join_node_key_matching DEPENDENCIES TBB::tbb) + tbb_add_test(SUBDIR tbb NAME test_join_node_key_matching_n_args DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_join_node_msg_key_matching DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_join_node_msg_key_matching_n_args DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR tbb NAME test_join_node_preview DEPENDENCIES TBB::tbb) diff --git a/third-party/tbb/test/common/concurrent_unordered_common.h b/third-party/tbb/test/common/concurrent_unordered_common.h index 895fbfb6..7f1a393d 100644 --- a/third-party/tbb/test/common/concurrent_unordered_common.h +++ b/third-party/tbb/test/common/concurrent_unordered_common.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -372,4 +372,22 @@ void test_set_comparisons() { test_two_way_comparable_container(); } +template +void test_reserve_regression() { + Container container; + + float lf = container.max_load_factor(); + std::size_t buckets = container.unsafe_bucket_count(); + std::size_t capacity = std::size_t(buckets * lf); + + for (std::size_t elements = 0; elements < capacity; ++elements) { + container.reserve(elements); + REQUIRE_MESSAGE(container.unsafe_bucket_count() == buckets, + "reserve() should not increase bucket count if the capacity is not reached"); + } + + container.reserve(capacity * 2); + REQUIRE_MESSAGE(container.unsafe_bucket_count() > buckets, "reserve() should increase bucket count if the capacity is reached"); +} + #endif // __TBB_test_common_concurrent_unordered_common diff --git a/third-party/tbb/test/common/doctest.h b/third-party/tbb/test/common/doctest.h index 3b906764..8714c5b2 100644 --- a/third-party/tbb/test/common/doctest.h +++ b/third-party/tbb/test/common/doctest.h @@ -1,5 +1,5 @@ /* - Modifications Copyright (c) 2020-2022 Intel Corporation + Modifications Copyright (c) 2020-2023 Intel Corporation Modifications Licensed under the Apache License, Version 2.0; You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 */ @@ -10,14 +10,14 @@ // // doctest.h - the lightest feature-rich C++ single-header testing framework for unit tests and TDD // -// Copyright (c) 2016-2021 Viktor Kirilov +// Copyright (c) 2016-2023 Viktor Kirilov // // Distributed under the MIT Software License // See accompanying file LICENSE.txt or copy at // https://opensource.org/licenses/MIT // // The documentation can be found at the library's page: -// https://github.com/onqtam/doctest/blob/master/doc/markdown/readme.md +// https://github.com/doctest/doctest/blob/master/doc/markdown/readme.md // // ================================================================================================= // 
================================================================================================= @@ -54,8 +54,16 @@ #define DOCTEST_VERSION_MAJOR 2 #define DOCTEST_VERSION_MINOR 4 -#define DOCTEST_VERSION_PATCH 7 -#define DOCTEST_VERSION_STR "2.4.7" +#define DOCTEST_VERSION_PATCH 11 + +// util we need here +#define DOCTEST_TOSTR_IMPL(x) #x +#define DOCTEST_TOSTR(x) DOCTEST_TOSTR_IMPL(x) + +#define DOCTEST_VERSION_STR \ + DOCTEST_TOSTR(DOCTEST_VERSION_MAJOR) "." \ + DOCTEST_TOSTR(DOCTEST_VERSION_MINOR) "." \ + DOCTEST_TOSTR(DOCTEST_VERSION_PATCH) #define DOCTEST_VERSION \ (DOCTEST_VERSION_MAJOR * 10000 + DOCTEST_VERSION_MINOR * 100 + DOCTEST_VERSION_PATCH) @@ -66,6 +74,12 @@ // ideas for the version stuff are taken from here: https://github.com/cxxstuff/cxx_detect +#ifdef _MSC_VER +#define DOCTEST_CPLUSPLUS _MSVC_LANG +#else +#define DOCTEST_CPLUSPLUS __cplusplus +#endif + #define DOCTEST_COMPILER(MAJOR, MINOR, PATCH) ((MAJOR)*10000000 + (MINOR)*100000 + (PATCH)) // GCC/Clang and GCC/MSVC are mutually exclusive, but Clang/MSVC are not because of clang-cl... 
@@ -77,15 +91,14 @@ DOCTEST_COMPILER(_MSC_VER / 100, (_MSC_FULL_VER / 100000) % 100, _MSC_FULL_VER % 100000) #endif // MSVC #endif // MSVC -#if defined(__clang__) && defined(__clang_minor__) +#if defined(__clang__) && defined(__clang_minor__) && defined(__clang_patchlevel__) #define DOCTEST_CLANG DOCTEST_COMPILER(__clang_major__, __clang_minor__, __clang_patchlevel__) #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) && \ !defined(__INTEL_COMPILER) #define DOCTEST_GCC DOCTEST_COMPILER(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) #endif // GCC -// TODO: upstream the change to doctest : Intel Compiler support #if defined(__INTEL_COMPILER) -#define DOCTEST_ICC DOCTEST_COMPILER(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER % 10000) +#define DOCTEST_ICC DOCTEST_COMPILER(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) #endif // ICC #ifndef DOCTEST_MSVC @@ -150,86 +163,94 @@ // == COMPILER WARNINGS ============================================================================ // ================================================================================================= +// both the header and the implementation suppress all of these, +// so it only makes sense to aggregate them like so +#define DOCTEST_SUPPRESS_COMMON_WARNINGS_PUSH \ + DOCTEST_CLANG_SUPPRESS_WARNING_PUSH \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wunknown-pragmas") \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wweak-vtables") \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wpadded") \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-prototypes") \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat") \ + DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic") \ + \ + DOCTEST_GCC_SUPPRESS_WARNING_PUSH \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wunknown-pragmas") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wpragmas") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Weffc++") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-overflow") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-aliasing") \ + 
DOCTEST_GCC_SUPPRESS_WARNING("-Wmissing-declarations") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wuseless-cast") \ + DOCTEST_GCC_SUPPRESS_WARNING("-Wnoexcept") \ + \ + DOCTEST_MSVC_SUPPRESS_WARNING_PUSH \ + /* these 4 also disabled globally via cmake: */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4514) /* unreferenced inline function has been removed */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4571) /* SEH related */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4710) /* function not inlined */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4711) /* function selected for inline expansion*/ \ + /* common ones */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4616) /* invalid compiler warning */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4619) /* invalid compiler warning */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4996) /* The compiler encountered a deprecated declaration */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4706) /* assignment within conditional expression */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4512) /* 'class' : assignment operator could not be generated */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4127) /* conditional expression is constant */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4820) /* padding */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4625) /* copy constructor was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4626) /* assignment operator was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5027) /* move assignment operator implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5026) /* move constructor was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4640) /* construction of local static object not thread-safe */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5045) /* Spectre mitigation for memory load */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5264) /* 'variable-name': 'const' variable is not used */ \ + /* static analysis */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(26439) /* Function may not throw. 
Declare it 'noexcept' */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(26495) /* Always initialize a member variable */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(26451) /* Arithmetic overflow ... */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(26444) /* Avoid unnamed objects with custom ctor and dtor... */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(26812) /* Prefer 'enum class' over 'enum' */ + +#define DOCTEST_SUPPRESS_COMMON_WARNINGS_POP \ + DOCTEST_CLANG_SUPPRESS_WARNING_POP \ + DOCTEST_GCC_SUPPRESS_WARNING_POP \ + DOCTEST_MSVC_SUPPRESS_WARNING_POP + +DOCTEST_SUPPRESS_COMMON_WARNINGS_PUSH + DOCTEST_CLANG_SUPPRESS_WARNING_PUSH -DOCTEST_CLANG_SUPPRESS_WARNING("-Wunknown-pragmas") DOCTEST_CLANG_SUPPRESS_WARNING("-Wnon-virtual-dtor") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wweak-vtables") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wpadded") DOCTEST_CLANG_SUPPRESS_WARNING("-Wdeprecated") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-prototypes") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wunused-local-typedef") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic") DOCTEST_GCC_SUPPRESS_WARNING_PUSH -DOCTEST_GCC_SUPPRESS_WARNING("-Wunknown-pragmas") -DOCTEST_GCC_SUPPRESS_WARNING("-Wpragmas") -DOCTEST_GCC_SUPPRESS_WARNING("-Weffc++") -DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-overflow") -DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-aliasing") DOCTEST_GCC_SUPPRESS_WARNING("-Wctor-dtor-privacy") -DOCTEST_GCC_SUPPRESS_WARNING("-Wmissing-declarations") DOCTEST_GCC_SUPPRESS_WARNING("-Wnon-virtual-dtor") -DOCTEST_GCC_SUPPRESS_WARNING("-Wunused-local-typedefs") -DOCTEST_GCC_SUPPRESS_WARNING("-Wuseless-cast") -DOCTEST_GCC_SUPPRESS_WARNING("-Wnoexcept") DOCTEST_GCC_SUPPRESS_WARNING("-Wsign-promo") DOCTEST_MSVC_SUPPRESS_WARNING_PUSH -DOCTEST_MSVC_SUPPRESS_WARNING(4616) // invalid compiler warning -DOCTEST_MSVC_SUPPRESS_WARNING(4619) // invalid compiler warning -DOCTEST_MSVC_SUPPRESS_WARNING(4996) // The compiler encountered a deprecated declaration -DOCTEST_MSVC_SUPPRESS_WARNING(4706) // assignment 
within conditional expression -DOCTEST_MSVC_SUPPRESS_WARNING(4512) // 'class' : assignment operator could not be generated -DOCTEST_MSVC_SUPPRESS_WARNING(4127) // conditional expression is constant -DOCTEST_MSVC_SUPPRESS_WARNING(4820) // padding -DOCTEST_MSVC_SUPPRESS_WARNING(4625) // copy constructor was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(4626) // assignment operator was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(5027) // move assignment operator was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(5026) // move constructor was implicitly defined as deleted DOCTEST_MSVC_SUPPRESS_WARNING(4623) // default constructor was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(4640) // construction of local static object is not thread-safe -DOCTEST_MSVC_SUPPRESS_WARNING(5045) // Spectre mitigation for memory load -// static analysis -DOCTEST_MSVC_SUPPRESS_WARNING(26439) // This kind of function may not throw. Declare it 'noexcept' -DOCTEST_MSVC_SUPPRESS_WARNING(26495) // Always initialize a member variable -DOCTEST_MSVC_SUPPRESS_WARNING(26451) // Arithmetic overflow ... -DOCTEST_MSVC_SUPPRESS_WARNING(26444) // Avoid unnamed objects with custom construction and dtr... 
-DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' - -// 4548 - expression before comma has no effect; expected expression with side - effect -// 4265 - class has virtual functions, but destructor is not virtual -// 4986 - exception specification does not match previous declaration -// 4350 - behavior change: 'member1' called instead of 'member2' -// 4668 - 'x' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' -// 4365 - conversion from 'int' to 'unsigned long', signed/unsigned mismatch -// 4774 - format string expected in argument 'x' is not a string literal -// 4820 - padding in structs - -// only 4 should be disabled globally: -// - 4514 # unreferenced inline function has been removed -// - 4571 # SEH related -// - 4710 # function not inlined -// - 4711 # function 'x' selected for automatic inline expansion #define DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN \ DOCTEST_MSVC_SUPPRESS_WARNING_PUSH \ - DOCTEST_MSVC_SUPPRESS_WARNING(4548) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4265) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4986) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4350) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4668) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4365) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4774) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4820) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4625) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4626) \ - DOCTEST_MSVC_SUPPRESS_WARNING(5027) \ - DOCTEST_MSVC_SUPPRESS_WARNING(5026) \ - DOCTEST_MSVC_SUPPRESS_WARNING(4623) \ - DOCTEST_MSVC_SUPPRESS_WARNING(5039) \ - DOCTEST_MSVC_SUPPRESS_WARNING(5045) \ - DOCTEST_MSVC_SUPPRESS_WARNING(5105) + DOCTEST_MSVC_SUPPRESS_WARNING(4548) /* before comma no effect; expected side - effect */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4265) /* virtual functions, but destructor is not virtual */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4986) /* exception specification does not match previous */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4350) /* 'member1' called instead of 'member2' */ \ + 
DOCTEST_MSVC_SUPPRESS_WARNING(4668) /* not defined as a preprocessor macro */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4365) /* signed/unsigned mismatch */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4774) /* format string not a string literal */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4820) /* padding */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4625) /* copy constructor was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4626) /* assignment operator was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5027) /* move assignment operator implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5026) /* move constructor was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4623) /* default constructor was implicitly deleted */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5039) /* pointer to pot. throwing function passed to extern C */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5045) /* Spectre mitigation for memory load */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5105) /* macro producing 'defined' has undefined behavior */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(4738) /* storing float result in memory, loss of performance */ \ + DOCTEST_MSVC_SUPPRESS_WARNING(5262) /* implicit fall-through */ #define DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_END DOCTEST_MSVC_SUPPRESS_WARNING_POP @@ -242,6 +263,7 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' // GCC C++11 feature support table: https://gcc.gnu.org/projects/cxx-status.html // MSVC version table: // https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B#Internal_version_numbering +// MSVC++ 14.3 (17) _MSC_VER == 1930 (Visual Studio 2022) // MSVC++ 14.2 (16) _MSC_VER == 1920 (Visual Studio 2019) // MSVC++ 14.1 (15) _MSC_VER == 1910 (Visual Studio 2017) // MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015) @@ -252,7 +274,6 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' // MSVC++ 8.0 _MSC_VER == 1400 (Visual Studio 2005) // Universal Windows Platform support -// TODO: upstream changes to origin 
repository #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP) #define DOCTEST_CONFIG_NO_WINDOWS_SEH #endif // WINAPI_FAMILY @@ -264,7 +285,7 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #endif // DOCTEST_CONFIG_NO_WINDOWS_SEH #if !defined(_WIN32) && !defined(__QNX__) && !defined(DOCTEST_CONFIG_POSIX_SIGNALS) && \ - !defined(__EMSCRIPTEN__) + !defined(__EMSCRIPTEN__) && !defined(__wasi__) #define DOCTEST_CONFIG_POSIX_SIGNALS #endif // _WIN32 #if defined(DOCTEST_CONFIG_NO_POSIX_SIGNALS) && defined(DOCTEST_CONFIG_POSIX_SIGNALS) @@ -272,7 +293,8 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #endif // DOCTEST_CONFIG_NO_POSIX_SIGNALS #ifndef DOCTEST_CONFIG_NO_EXCEPTIONS -#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) \ + || defined(__wasi__) #define DOCTEST_CONFIG_NO_EXCEPTIONS #endif // no exceptions #endif // DOCTEST_CONFIG_NO_EXCEPTIONS @@ -287,6 +309,10 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #define DOCTEST_CONFIG_NO_TRY_CATCH_IN_ASSERTS #endif // DOCTEST_CONFIG_NO_EXCEPTIONS && !DOCTEST_CONFIG_NO_TRY_CATCH_IN_ASSERTS +#ifdef __wasi__ +#define DOCTEST_CONFIG_NO_MULTITHREADING +#endif + #if defined(DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN) && !defined(DOCTEST_CONFIG_IMPLEMENT) #define DOCTEST_CONFIG_IMPLEMENT #endif // DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN @@ -314,6 +340,16 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #define DOCTEST_INTERFACE #endif // DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +// needed for extern template instantiations +// see https://github.com/fmtlib/fmt/issues/2228 +#if DOCTEST_MSVC +#define DOCTEST_INTERFACE_DECL +#define DOCTEST_INTERFACE_DEF DOCTEST_INTERFACE +#else // DOCTEST_MSVC +#define DOCTEST_INTERFACE_DECL DOCTEST_INTERFACE +#define DOCTEST_INTERFACE_DEF +#endif // DOCTEST_MSVC + #define 
DOCTEST_EMPTY #if DOCTEST_MSVC @@ -329,12 +365,19 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #define DOCTEST_UNUSED __attribute__((unused)) #define DOCTEST_ALIGNMENT(x) __attribute__((aligned(x))) #endif + // TODO: upstream the change to doctest : Work-around for the warning: 'routine is both "inline" and "noinline"' #if DOCTEST_ICC #undef DOCTEST_NOINLINE #define DOCTEST_NOINLINE #endif // ICC +#ifdef DOCTEST_CONFIG_NO_CONTRADICTING_INLINE +#define DOCTEST_INLINE_NOINLINE inline +#else +#define DOCTEST_INLINE_NOINLINE inline DOCTEST_NOINLINE +#endif + #ifndef DOCTEST_NORETURN #if DOCTEST_MSVC && (DOCTEST_MSVC < DOCTEST_COMPILER(19, 0, 0)) #define DOCTEST_NORETURN @@ -354,15 +397,37 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #ifndef DOCTEST_CONSTEXPR #if DOCTEST_MSVC && (DOCTEST_MSVC < DOCTEST_COMPILER(19, 0, 0)) #define DOCTEST_CONSTEXPR const +#define DOCTEST_CONSTEXPR_FUNC inline #else // DOCTEST_MSVC #define DOCTEST_CONSTEXPR constexpr +#define DOCTEST_CONSTEXPR_FUNC constexpr #endif // DOCTEST_MSVC #endif // DOCTEST_CONSTEXPR +#ifndef DOCTEST_NO_SANITIZE_INTEGER +// TODO: upstream the change to doctest : Work-around for the error on macOS with icpc: error #1292: unknown attribute "no_sanitize" +#if DOCTEST_CLANG >= DOCTEST_COMPILER(3, 7, 0) && !DOCTEST_ICC +#define DOCTEST_NO_SANITIZE_INTEGER __attribute__((no_sanitize("integer"))) +#else +#define DOCTEST_NO_SANITIZE_INTEGER +#endif +#endif // DOCTEST_NO_SANITIZE_INTEGER + // ================================================================================================= // == FEATURE DETECTION END ======================================================================== // ================================================================================================= +#define DOCTEST_DECLARE_INTERFACE(name) \ + virtual ~name(); \ + name() = default; \ + name(const name&) = delete; \ + name(name&&) = delete; \ + name& operator=(const name&) = 
delete; \ + name& operator=(name&&) = delete; + +#define DOCTEST_DEFINE_INTERFACE(name) \ + name::~name() = default; + // internal macros for string concatenation and anonymous variable name generation #define DOCTEST_CAT_IMPL(s1, s2) s1##s2 #define DOCTEST_CAT(s1, s2) DOCTEST_CAT_IMPL(s1, s2) @@ -372,8 +437,6 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #define DOCTEST_ANONYMOUS(x) DOCTEST_CAT(x, __LINE__) #endif // __COUNTER__ -#define DOCTEST_TOSTR(x) #x - #ifndef DOCTEST_CONFIG_ASSERTION_PARAMETERS_BY_VALUE #define DOCTEST_REF_WRAP(x) x& #else // DOCTEST_CONFIG_ASSERTION_PARAMETERS_BY_VALUE @@ -387,33 +450,39 @@ DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' #define DOCTEST_PLATFORM_IPHONE #elif defined(_WIN32) #define DOCTEST_PLATFORM_WINDOWS +#elif defined(__wasi__) +#define DOCTEST_PLATFORM_WASI #else // DOCTEST_PLATFORM #define DOCTEST_PLATFORM_LINUX #endif // DOCTEST_PLATFORM -// TODO: upstream the change to doctest : suppress unused variable warning -#define DOCTEST_GLOBAL_NO_WARNINGS(var) \ - DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wglobal-constructors") \ - DOCTEST_CLANG_SUPPRESS_WARNING("-Wunused-variable") \ - static volatile int var DOCTEST_UNUSED // NOLINT(fuchsia-statically-constructed-objects,cert-err58-cpp) -#define DOCTEST_GLOBAL_NO_WARNINGS_END() \ - DOCTEST_CLANG_SUPPRESS_WARNING_POP \ +namespace doctest { namespace detail { + static DOCTEST_CONSTEXPR int consume(const int*, int) noexcept { return 0; } +}} + +#define DOCTEST_GLOBAL_NO_WARNINGS(var, ...) 
\ + DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wglobal-constructors") \ + static const int var = doctest::detail::consume(&var, __VA_ARGS__); \ + DOCTEST_CLANG_SUPPRESS_WARNING_POP #ifndef DOCTEST_BREAK_INTO_DEBUGGER // should probably take a look at https://github.com/scottt/debugbreak #ifdef DOCTEST_PLATFORM_LINUX #if defined(__GNUC__) && (defined(__i386) || defined(__x86_64)) // Break at the location of the failing check if possible -#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("int $3\n" : :) // NOLINT (hicpp-no-assembler) +#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("int $3\n" : :) // NOLINT(hicpp-no-assembler) #else #include #define DOCTEST_BREAK_INTO_DEBUGGER() raise(SIGTRAP) #endif #elif defined(DOCTEST_PLATFORM_MAC) #if defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) || defined(__i386) -#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("int $3\n" : :) // NOLINT (hicpp-no-assembler) +#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("int $3\n" : :) // NOLINT(hicpp-no-assembler) +#elif defined(__ppc__) || defined(__ppc64__) +// https://www.cocoawithlove.com/2008/03/break-into-debugger.html +#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("li r0, 20\nsc\nnop\nli r0, 37\nli r4, 2\nsc\nnop\n": : : "memory","r0","r3","r4") // NOLINT(hicpp-no-assembler) #else -#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("brk #0"); // NOLINT (hicpp-no-assembler) +#define DOCTEST_BREAK_INTO_DEBUGGER() __asm__("brk #0"); // NOLINT(hicpp-no-assembler) #endif #elif DOCTEST_MSVC #define DOCTEST_BREAK_INTO_DEBUGGER() __debugbreak() @@ -429,54 +498,67 @@ DOCTEST_GCC_SUPPRESS_WARNING_POP // this is kept here for backwards compatibility since the config option was changed #ifdef DOCTEST_CONFIG_USE_IOSFWD +#ifndef DOCTEST_CONFIG_USE_STD_HEADERS #define DOCTEST_CONFIG_USE_STD_HEADERS +#endif #endif // DOCTEST_CONFIG_USE_IOSFWD +// for clang - always include ciso646 (which drags some std stuff) because +// we want to check if we are using libc++ with the _LIBCPP_VERSION macro in +// 
which case we don't want to forward declare stuff from std - for reference: +// https://github.com/doctest/doctest/issues/126 +// https://github.com/doctest/doctest/issues/356 +#if DOCTEST_CLANG +#include +#endif // clang + +#ifdef _LIBCPP_VERSION +#ifndef DOCTEST_CONFIG_USE_STD_HEADERS +#define DOCTEST_CONFIG_USE_STD_HEADERS +#endif +#endif // _LIBCPP_VERSION + #ifdef DOCTEST_CONFIG_USE_STD_HEADERS #ifndef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS #define DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS #endif // DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS -#include +DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN #include #include +#include +DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_END #else // DOCTEST_CONFIG_USE_STD_HEADERS -#if DOCTEST_CLANG -// to detect if libc++ is being used with clang (the _LIBCPP_VERSION identifier) -#include -#endif // clang - -#ifdef _LIBCPP_VERSION -#define DOCTEST_STD_NAMESPACE_BEGIN _LIBCPP_BEGIN_NAMESPACE_STD -#define DOCTEST_STD_NAMESPACE_END _LIBCPP_END_NAMESPACE_STD -#else // _LIBCPP_VERSION -#define DOCTEST_STD_NAMESPACE_BEGIN namespace std { -#define DOCTEST_STD_NAMESPACE_END } -#endif // _LIBCPP_VERSION - // Forward declaring 'X' in namespace std is not permitted by the C++ Standard. 
DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4643) -DOCTEST_STD_NAMESPACE_BEGIN // NOLINT (cert-dcl58-cpp) -typedef decltype(nullptr) nullptr_t; +namespace std { // NOLINT(cert-dcl58-cpp) +typedef decltype(nullptr) nullptr_t; // NOLINT(modernize-use-using) +typedef decltype(sizeof(void*)) size_t; // NOLINT(modernize-use-using) template struct char_traits; template <> struct char_traits; template -class basic_ostream; -typedef basic_ostream> ostream; +class basic_ostream; // NOLINT(fuchsia-virtual-inheritance) +typedef basic_ostream> ostream; // NOLINT(modernize-use-using) +template +// NOLINTNEXTLINE +basic_ostream& operator<<(basic_ostream&, const char*); +template +class basic_istream; +typedef basic_istream> istream; // NOLINT(modernize-use-using) template class tuple; #if DOCTEST_MSVC >= DOCTEST_COMPILER(19, 20, 0) -// see this issue on why this is needed: https://github.com/onqtam/doctest/issues/183 -template +// see this issue on why this is needed: https://github.com/doctest/doctest/issues/183 +template class allocator; -template +template class basic_string; using string = basic_string, allocator>; #endif // VS 2019 -DOCTEST_STD_NAMESPACE_END +} // namespace std DOCTEST_MSVC_SUPPRESS_WARNING_POP @@ -488,8 +570,14 @@ DOCTEST_MSVC_SUPPRESS_WARNING_POP namespace doctest { +using std::size_t; + DOCTEST_INTERFACE extern bool is_running_in_test; +#ifndef DOCTEST_CONFIG_STRING_SIZE_TYPE +#define DOCTEST_CONFIG_STRING_SIZE_TYPE unsigned +#endif + #if DOCTEST_MSVC // TODO: upstream the change to doctest : // Due to race between exiting the process and starting of a new detached thread in Windows, thread @@ -538,7 +626,6 @@ struct doctest_thread_local_wrapper { // TODO: // - optimizations - like not deleting memory unnecessarily in operator= and etc. 
// - resize/reserve/clear -// - substr // - replace // - back/front // - iterator stuff @@ -548,60 +635,80 @@ struct doctest_thread_local_wrapper { // - relational operators as free functions - taking const char* as one of the params class DOCTEST_INTERFACE String { - static const unsigned len = 24; //!OCLINT avoid private static members - static const unsigned last = len - 1; //!OCLINT avoid private static members +public: + using size_type = DOCTEST_CONFIG_STRING_SIZE_TYPE; + +private: + static DOCTEST_CONSTEXPR size_type len = 24; //!OCLINT avoid private static members + static DOCTEST_CONSTEXPR size_type last = len - 1; //!OCLINT avoid private static members struct view // len should be more than sizeof(view) - because of the final byte for flags { char* ptr; - unsigned size; - unsigned capacity; + size_type size; + size_type capacity; }; union { - char buf[len]; + char buf[len]; // NOLINT(*-avoid-c-arrays) view data; }; - bool isOnStack() const { return (buf[last] & 128) == 0; } - void setOnHeap(); - void setLast(unsigned in = last); + char* allocate(size_type sz); + + bool isOnStack() const noexcept { return (buf[last] & 128) == 0; } + void setOnHeap() noexcept; + void setLast(size_type in = last) noexcept; + void setSize(size_type sz) noexcept; void copy(const String& other); public: - String(); + static DOCTEST_CONSTEXPR size_type npos = static_cast(-1); + + String() noexcept; ~String(); // cppcheck-suppress noExplicitConstructor String(const char* in); - String(const char* in, unsigned in_size); + String(const char* in, size_type in_size); + + String(std::istream& in, size_type in_size); String(const String& other); String& operator=(const String& other); String& operator+=(const String& other); - String(String&& other); - String& operator=(String&& other); + String(String&& other) noexcept; + String& operator=(String&& other) noexcept; - char operator[](unsigned i) const; - char& operator[](unsigned i); + char operator[](size_type i) const; + char& 
operator[](size_type i); // the only functions I'm willing to leave in the interface - available for inlining const char* c_str() const { return const_cast(this)->c_str(); } // NOLINT char* c_str() { - if(isOnStack()) + if (isOnStack()) { return reinterpret_cast(buf); + } return data.ptr; } - unsigned size() const; - unsigned capacity() const; + size_type size() const; + size_type capacity() const; + + String substr(size_type pos, size_type cnt = npos) &&; + String substr(size_type pos, size_type cnt = npos) const &; + + size_type find(char ch, size_type pos = 0) const; + size_type rfind(char ch, size_type pos = npos) const; int compare(const char* other, bool no_case = false) const; int compare(const String& other, bool no_case = false) const; + +friend DOCTEST_INTERFACE std::ostream& operator<<(std::ostream& s, const String& in); }; DOCTEST_INTERFACE String operator+(const String& lhs, const String& rhs); @@ -613,7 +720,21 @@ DOCTEST_INTERFACE bool operator>(const String& lhs, const String& rhs); DOCTEST_INTERFACE bool operator<=(const String& lhs, const String& rhs); DOCTEST_INTERFACE bool operator>=(const String& lhs, const String& rhs); -DOCTEST_INTERFACE std::ostream& operator<<(std::ostream& s, const String& in); +class DOCTEST_INTERFACE Contains { +public: + explicit Contains(const String& string); + + bool checkWith(const String& other) const; + + String string; +}; + +DOCTEST_INTERFACE String toString(const Contains& in); + +DOCTEST_INTERFACE bool operator==(const String& lhs, const Contains& rhs); +DOCTEST_INTERFACE bool operator==(const Contains& lhs, const String& rhs); +DOCTEST_INTERFACE bool operator!=(const String& lhs, const Contains& rhs); +DOCTEST_INTERFACE bool operator!=(const Contains& lhs, const String& rhs); namespace Color { enum Enum @@ -767,9 +888,27 @@ struct DOCTEST_INTERFACE AssertData String m_decomp; // for specific exception-related asserts - bool m_threw_as; - const char* m_exception_type; - const char* m_exception_string; + bool 
m_threw_as; + const char* m_exception_type; + + class DOCTEST_INTERFACE StringContains { + private: + Contains content; + bool isContains; + + public: + StringContains(const String& str) : content(str), isContains(false) { } + StringContains(Contains cntn) : content(static_cast(cntn)), isContains(true) { } + + bool check(const String& str) { return isContains ? (content == str) : (content.string == str); } + + operator const String&() const { return content.string; } + + const char* c_str() const { return content.string.c_str(); } + } m_exception_string; + + AssertData(assertType::Enum at, const char* file, int line, const char* expr, + const char* exception_type, const StringContains& exception_string); }; struct DOCTEST_INTERFACE MessageData @@ -786,13 +925,13 @@ struct DOCTEST_INTERFACE SubcaseSignature const char* m_file; int m_line; + bool operator==(const SubcaseSignature& other) const; bool operator<(const SubcaseSignature& other) const; }; struct DOCTEST_INTERFACE IContextScope { - IContextScope(); - virtual ~IContextScope(); + DOCTEST_DECLARE_INTERFACE(IContextScope) virtual void stringify(std::ostream*) const = 0; }; @@ -849,199 +988,189 @@ struct ContextOptions //!OCLINT too many fields }; namespace detail { - template - struct enable_if - {}; + namespace types { +#ifdef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS + using namespace std; +#else + template + struct enable_if { }; - template - struct enable_if - { typedef TYPE type; }; + template + struct enable_if { using type = T; }; - // clang-format off - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; - template struct remove_reference { typedef T type; }; + struct true_type { static DOCTEST_CONSTEXPR bool value = true; }; + struct false_type { static DOCTEST_CONSTEXPR bool value = false; }; - template U declval(int); + template struct remove_reference { using type = T; }; + template struct remove_reference { using type = T; }; + template struct 
remove_reference { using type = T; }; - template T declval(long); + template struct is_rvalue_reference : false_type { }; + template struct is_rvalue_reference : true_type { }; - template auto declval() DOCTEST_NOEXCEPT -> decltype(declval(0)) ; + template struct remove_const { using type = T; }; + template struct remove_const { using type = T; }; - template struct is_lvalue_reference { const static bool value=false; }; - template struct is_lvalue_reference { const static bool value=true; }; + // Compiler intrinsics + template struct is_enum { static DOCTEST_CONSTEXPR bool value = __is_enum(T); }; + template struct underlying_type { using type = __underlying_type(T); }; + + template struct is_pointer : false_type { }; + template struct is_pointer : true_type { }; + + template struct is_array : false_type { }; + // NOLINTNEXTLINE(*-avoid-c-arrays) + template struct is_array : true_type { }; +#endif + } - template struct is_rvalue_reference { const static bool value=false; }; - template struct is_rvalue_reference { const static bool value=true; }; + // + template + T&& declval(); template - inline T&& forward(typename remove_reference::type& t) DOCTEST_NOEXCEPT - { + DOCTEST_CONSTEXPR_FUNC T&& forward(typename types::remove_reference::type& t) DOCTEST_NOEXCEPT { return static_cast(t); } template - inline T&& forward(typename remove_reference::type&& t) DOCTEST_NOEXCEPT - { - static_assert(!is_lvalue_reference::value, - "Can not forward an rvalue as an lvalue."); + DOCTEST_CONSTEXPR_FUNC T&& forward(typename types::remove_reference::type&& t) DOCTEST_NOEXCEPT { return static_cast(t); } - template struct remove_const { typedef T type; }; - template struct remove_const { typedef T type; }; -#ifdef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS - template struct is_enum : public std::is_enum {}; - template struct underlying_type : public std::underlying_type {}; -#else - // Use compiler intrinsics - template struct is_enum { DOCTEST_CONSTEXPR static bool value = __is_enum(T); }; - 
template struct underlying_type { typedef __underlying_type(T) type; }; -#endif - // clang-format on + template + struct deferred_false : types::false_type { }; + +// MSVS 2015 :( +#if !DOCTEST_CLANG && defined(_MSC_VER) && _MSC_VER <= 1900 + template + struct has_global_insertion_operator : types::false_type { }; template - struct deferred_false - // cppcheck-suppress unusedStructMember - { static const bool value = false; }; - - namespace has_insertion_operator_impl { - std::ostream &os(); - template - DOCTEST_REF_WRAP(T) val(); - - template - struct check { - static DOCTEST_CONSTEXPR bool value = false; - }; + struct has_global_insertion_operator(), declval()), void())> : types::true_type { }; - template - struct check(), void())> { - static DOCTEST_CONSTEXPR bool value = true; - }; - } // namespace has_insertion_operator_impl + template + struct has_insertion_operator { static DOCTEST_CONSTEXPR bool value = has_global_insertion_operator::value; }; + + template + struct insert_hack; + + template + struct insert_hack { + static void insert(std::ostream& os, const T& t) { ::operator<<(os, t); } + }; - template - using has_insertion_operator = has_insertion_operator_impl::check; + template + struct insert_hack { + static void insert(std::ostream& os, const T& t) { operator<<(os, t); } + }; + + template + using insert_hack_t = insert_hack::value>; +#else + template + struct has_insertion_operator : types::false_type { }; +#endif - DOCTEST_INTERFACE void my_memcpy(void* dest, const void* src, unsigned num); + template + struct has_insertion_operator(), declval()), void())> : types::true_type { }; - DOCTEST_INTERFACE std::ostream* getTlsOss(bool reset=true); // returns a thread-local ostringstream - DOCTEST_INTERFACE String getTlsOssResult(); + template + struct should_stringify_as_underlying_type { + static DOCTEST_CONSTEXPR bool value = detail::types::is_enum::value && !doctest::detail::has_insertion_operator::value; + }; + DOCTEST_INTERFACE std::ostream* 
tlssPush(); + DOCTEST_INTERFACE String tlssPop(); template - struct StringMakerBase - { + struct StringMakerBase { template static String convert(const DOCTEST_REF_WRAP(T)) { +#ifdef DOCTEST_CONFIG_REQUIRE_STRINGIFICATION_FOR_ALL_USED_TYPES + static_assert(deferred_false::value, "No stringification detected for type T. See string conversion manual"); +#endif return "{?}"; } }; - // Vector and various type other than pointer or array. - template - struct filldata - { - static void fill(const T &in) { - *getTlsOss() << in; - } - }; + template + struct filldata; - /* This method can be chained */ - template - void fillstream(const T (&in)[N] ) { - for(unsigned long i = 0; i < N; i++) { - *getTlsOss(false) << in[i]; - } + template + void filloss(std::ostream* stream, const T& in) { + filldata::fill(stream, in); } - template - struct filldata - { - static void fill(const T (&in)[N]) { - fillstream(in); - *getTlsOss(false)<<""; - } - }; - - template - void filloss(const T& in){ - filldata::fill(in); + template + void filloss(std::ostream* stream, const T (&in)[N]) { // NOLINT(*-avoid-c-arrays) + // T[N], T(&)[N], T(&&)[N] have same behaviour. + // Hence remove reference. + filloss::type>(stream, in); } - template - void filloss(const T (&in)[N]) { - // T[N], T(&)[N], T(&&)[N] have same behaviour. - // Hence remove reference. - filldata::type >::fill(in); + template + String toStream(const T& in) { + std::ostream* stream = tlssPush(); + filloss(stream, in); + return tlssPop(); } template <> - struct StringMakerBase - { + struct StringMakerBase { template static String convert(const DOCTEST_REF_WRAP(T) in) { - /* When parameter "in" is a null terminated const char* it works. - * When parameter "in" is a T arr[N] without '\0' we can fill the - * stringstream with N objects (T=char).If in is char pointer * - * without '\0' , it would cause segfault - * stepping over unaccessible memory. 
- */ - - filloss(in); - return getTlsOssResult(); + return toStream(in); } }; - - DOCTEST_INTERFACE String rawMemoryToString(const void* object, unsigned size); - - template - String rawMemoryToString(const DOCTEST_REF_WRAP(T) object) { - return rawMemoryToString(&object, sizeof(object)); - } - - template - const char* type_to_string() { - return "<>"; - } } // namespace detail template -struct StringMaker : public detail::StringMakerBase::value> +struct StringMaker : public detail::StringMakerBase< + detail::has_insertion_operator::value || detail::types::is_pointer::value || detail::types::is_array::value> {}; -template -struct StringMaker -{ - template - static String convert(U* p) { - if(p) - return detail::rawMemoryToString(p); - return "NULL"; - } -}; +#ifndef DOCTEST_STRINGIFY +#ifdef DOCTEST_CONFIG_DOUBLE_STRINGIFY +#define DOCTEST_STRINGIFY(...) toString(toString(__VA_ARGS__)) +#else +#define DOCTEST_STRINGIFY(...) toString(__VA_ARGS__) +#endif +#endif -template -struct StringMaker -{ - static String convert(R C::*p) { - if(p) - return detail::rawMemoryToString(p); - return "NULL"; - } -}; +template +String toString() { +#if DOCTEST_CLANG == 0 && DOCTEST_GCC == 0 && DOCTEST_ICC == 0 + String ret = __FUNCSIG__; // class doctest::String __cdecl doctest::toString(void) + String::size_type beginPos = ret.find('<'); + return ret.substr(beginPos + 1, ret.size() - beginPos - static_cast(sizeof(">(void)"))); +#else + String ret = __PRETTY_FUNCTION__; // doctest::String toString() [with T = TYPE] + String::size_type begin = ret.find('=') + 2; + return ret.substr(begin, ret.size() - begin - 1); +#endif +} -template ::value, bool>::type = true> +template ::value, bool>::type = true> String toString(const DOCTEST_REF_WRAP(T) value) { return StringMaker::convert(value); } #ifdef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING -DOCTEST_INTERFACE String toString(char* in); DOCTEST_INTERFACE String toString(const char* in); #endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING + 
+#if DOCTEST_MSVC >= DOCTEST_COMPILER(19, 20, 0) +// see this issue on why this is needed: https://github.com/doctest/doctest/issues/183 +DOCTEST_INTERFACE String toString(const std::string& in); +#endif // VS 2019 + +DOCTEST_INTERFACE String toString(String in); + +DOCTEST_INTERFACE String toString(std::nullptr_t); + DOCTEST_INTERFACE String toString(bool in); + DOCTEST_INTERFACE String toString(float in); DOCTEST_INTERFACE String toString(double in); DOCTEST_INTERFACE String toString(double long in); @@ -1049,40 +1178,95 @@ DOCTEST_INTERFACE String toString(double long in); DOCTEST_INTERFACE String toString(char in); DOCTEST_INTERFACE String toString(char signed in); DOCTEST_INTERFACE String toString(char unsigned in); -DOCTEST_INTERFACE String toString(int short in); -DOCTEST_INTERFACE String toString(int short unsigned in); -DOCTEST_INTERFACE String toString(int in); -DOCTEST_INTERFACE String toString(int unsigned in); -DOCTEST_INTERFACE String toString(int long in); -DOCTEST_INTERFACE String toString(int long unsigned in); -DOCTEST_INTERFACE String toString(int long long in); -DOCTEST_INTERFACE String toString(int long long unsigned in); -DOCTEST_INTERFACE String toString(std::nullptr_t in); - -template ::value, bool>::type = true> +DOCTEST_INTERFACE String toString(short in); +DOCTEST_INTERFACE String toString(short unsigned in); +DOCTEST_INTERFACE String toString(signed in); +DOCTEST_INTERFACE String toString(unsigned in); +DOCTEST_INTERFACE String toString(long in); +DOCTEST_INTERFACE String toString(long unsigned in); +DOCTEST_INTERFACE String toString(long long in); +DOCTEST_INTERFACE String toString(long long unsigned in); + +template ::value, bool>::type = true> String toString(const DOCTEST_REF_WRAP(T) value) { - typedef typename detail::underlying_type::type UT; - return toString(static_cast(value)); + using UT = typename detail::types::underlying_type::type; + return (DOCTEST_STRINGIFY(static_cast(value))); } -#if DOCTEST_MSVC >= DOCTEST_COMPILER(19, 
20, 0) -// see this issue on why this is needed: https://github.com/onqtam/doctest/issues/183 -DOCTEST_INTERFACE String toString(const std::string& in); -#endif // VS 2019 +namespace detail { + template + struct filldata + { + static void fill(std::ostream* stream, const T& in) { +#if defined(_MSC_VER) && _MSC_VER <= 1900 + insert_hack_t::insert(*stream, in); +#else + operator<<(*stream, in); +#endif + } + }; + +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4866) +// NOLINTBEGIN(*-avoid-c-arrays) + template + struct filldata { + static void fill(std::ostream* stream, const T(&in)[N]) { + *stream << "["; + for (size_t i = 0; i < N; i++) { + if (i != 0) { *stream << ", "; } + *stream << (DOCTEST_STRINGIFY(in[i])); + } + *stream << "]"; + } + }; +// NOLINTEND(*-avoid-c-arrays) +DOCTEST_MSVC_SUPPRESS_WARNING_POP + + // Specialized since we don't want the terminating null byte! +// NOLINTBEGIN(*-avoid-c-arrays) + template + struct filldata { + static void fill(std::ostream* stream, const char (&in)[N]) { + *stream << String(in, in[N - 1] ? 
N : N - 1); + } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) + }; +// NOLINTEND(*-avoid-c-arrays) + + template <> + struct filldata { + static void fill(std::ostream* stream, const void* in); + }; + + template + struct filldata { +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4180) + static void fill(std::ostream* stream, const T* in) { +DOCTEST_MSVC_SUPPRESS_WARNING_POP +DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wmicrosoft-cast") + filldata::fill(stream, +#if DOCTEST_GCC == 0 || DOCTEST_GCC >= DOCTEST_COMPILER(4, 9, 0) + reinterpret_cast(in) +#else + *reinterpret_cast(&in) +#endif + ); +DOCTEST_CLANG_SUPPRESS_WARNING_POP + } + }; +} -class DOCTEST_INTERFACE Approx +struct DOCTEST_INTERFACE Approx { -public: - explicit Approx(double value); + Approx(double value); Approx operator()(double value) const; #ifdef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS template explicit Approx(const T& value, - typename detail::enable_if::value>::type* = + typename detail::types::enable_if::value>::type* = static_cast(nullptr)) { - *this = Approx(static_cast(value)); + *this = static_cast(value); } #endif // DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS @@ -1090,7 +1274,7 @@ class DOCTEST_INTERFACE Approx #ifdef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS template - typename detail::enable_if::value, Approx&>::type epsilon( + typename std::enable_if::value, Approx&>::type epsilon( const T& newEpsilon) { m_epsilon = static_cast(newEpsilon); return *this; @@ -1101,7 +1285,7 @@ class DOCTEST_INTERFACE Approx #ifdef DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS template - typename detail::enable_if::value, Approx&>::type scale( + typename std::enable_if::value, Approx&>::type scale( const T& newScale) { m_scale = static_cast(newScale); return *this; @@ -1122,30 +1306,27 @@ class DOCTEST_INTERFACE Approx DOCTEST_INTERFACE friend bool operator> (double lhs, const Approx & rhs); DOCTEST_INTERFACE friend bool operator> (const Approx & lhs, double rhs); - DOCTEST_INTERFACE friend String toString(const Approx& in); - #ifdef 
DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS #define DOCTEST_APPROX_PREFIX \ - template friend typename detail::enable_if::value, bool>::type + template friend typename std::enable_if::value, bool>::type - DOCTEST_APPROX_PREFIX operator==(const T& lhs, const Approx& rhs) { return operator==(double(lhs), rhs); } + DOCTEST_APPROX_PREFIX operator==(const T& lhs, const Approx& rhs) { return operator==(static_cast(lhs), rhs); } DOCTEST_APPROX_PREFIX operator==(const Approx& lhs, const T& rhs) { return operator==(rhs, lhs); } DOCTEST_APPROX_PREFIX operator!=(const T& lhs, const Approx& rhs) { return !operator==(lhs, rhs); } DOCTEST_APPROX_PREFIX operator!=(const Approx& lhs, const T& rhs) { return !operator==(rhs, lhs); } - DOCTEST_APPROX_PREFIX operator<=(const T& lhs, const Approx& rhs) { return double(lhs) < rhs.m_value || lhs == rhs; } - DOCTEST_APPROX_PREFIX operator<=(const Approx& lhs, const T& rhs) { return lhs.m_value < double(rhs) || lhs == rhs; } - DOCTEST_APPROX_PREFIX operator>=(const T& lhs, const Approx& rhs) { return double(lhs) > rhs.m_value || lhs == rhs; } - DOCTEST_APPROX_PREFIX operator>=(const Approx& lhs, const T& rhs) { return lhs.m_value > double(rhs) || lhs == rhs; } - DOCTEST_APPROX_PREFIX operator< (const T& lhs, const Approx& rhs) { return double(lhs) < rhs.m_value && lhs != rhs; } - DOCTEST_APPROX_PREFIX operator< (const Approx& lhs, const T& rhs) { return lhs.m_value < double(rhs) && lhs != rhs; } - DOCTEST_APPROX_PREFIX operator> (const T& lhs, const Approx& rhs) { return double(lhs) > rhs.m_value && lhs != rhs; } - DOCTEST_APPROX_PREFIX operator> (const Approx& lhs, const T& rhs) { return lhs.m_value > double(rhs) && lhs != rhs; } + DOCTEST_APPROX_PREFIX operator<=(const T& lhs, const Approx& rhs) { return static_cast(lhs) < rhs.m_value || lhs == rhs; } + DOCTEST_APPROX_PREFIX operator<=(const Approx& lhs, const T& rhs) { return lhs.m_value < static_cast(rhs) || lhs == rhs; } + DOCTEST_APPROX_PREFIX operator>=(const T& lhs, const Approx& rhs) { 
return static_cast(lhs) > rhs.m_value || lhs == rhs; } + DOCTEST_APPROX_PREFIX operator>=(const Approx& lhs, const T& rhs) { return lhs.m_value > static_cast(rhs) || lhs == rhs; } + DOCTEST_APPROX_PREFIX operator< (const T& lhs, const Approx& rhs) { return static_cast(lhs) < rhs.m_value && lhs != rhs; } + DOCTEST_APPROX_PREFIX operator< (const Approx& lhs, const T& rhs) { return lhs.m_value < static_cast(rhs) && lhs != rhs; } + DOCTEST_APPROX_PREFIX operator> (const T& lhs, const Approx& rhs) { return static_cast(lhs) > rhs.m_value && lhs != rhs; } + DOCTEST_APPROX_PREFIX operator> (const Approx& lhs, const T& rhs) { return lhs.m_value > static_cast(rhs) && lhs != rhs; } #undef DOCTEST_APPROX_PREFIX #endif // DOCTEST_CONFIG_INCLUDE_TYPE_TRAITS // clang-format on -private: double m_epsilon; double m_scale; double m_value; @@ -1155,18 +1336,35 @@ DOCTEST_INTERFACE String toString(const Approx& in); DOCTEST_INTERFACE const ContextOptions* getContextOptions(); -#if !defined(DOCTEST_CONFIG_DISABLE) +template +struct DOCTEST_INTERFACE_DECL IsNaN +{ + F value; bool flipped; + IsNaN(F f, bool flip = false) : value(f), flipped(flip) { } + IsNaN operator!() const { return { value, !flipped }; } + operator bool() const; +}; +#ifndef __MINGW32__ +extern template struct DOCTEST_INTERFACE_DECL IsNaN; +extern template struct DOCTEST_INTERFACE_DECL IsNaN; +extern template struct DOCTEST_INTERFACE_DECL IsNaN; +#endif +DOCTEST_INTERFACE String toString(IsNaN in); +DOCTEST_INTERFACE String toString(IsNaN in); +DOCTEST_INTERFACE String toString(IsNaN in); + +#ifndef DOCTEST_CONFIG_DISABLE namespace detail { // clang-format off #ifdef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING - template struct decay_array { typedef T type; }; - template struct decay_array { typedef T* type; }; - template struct decay_array { typedef T* type; }; + template struct decay_array { using type = T; }; + template struct decay_array { using type = T*; }; + template struct decay_array { using type = T*; }; - 
template struct not_char_pointer { enum { value = 1 }; }; - template<> struct not_char_pointer { enum { value = 0 }; }; - template<> struct not_char_pointer { enum { value = 0 }; }; + template struct not_char_pointer { static DOCTEST_CONSTEXPR int value = 1; }; + template<> struct not_char_pointer { static DOCTEST_CONSTEXPR int value = 0; }; + template<> struct not_char_pointer { static DOCTEST_CONSTEXPR int value = 0; }; template struct can_use_op : public not_char_pointer::type> {}; #endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING @@ -1189,16 +1387,22 @@ namespace detail { bool m_entered = false; Subcase(const String& name, const char* file, int line); + Subcase(const Subcase&) = delete; + Subcase(Subcase&&) = delete; + Subcase& operator=(const Subcase&) = delete; + Subcase& operator=(Subcase&&) = delete; ~Subcase(); operator bool() const; + + private: + bool checkFilters(); }; template String stringifyBinaryExpr(const DOCTEST_REF_WRAP(L) lhs, const char* op, const DOCTEST_REF_WRAP(R) rhs) { - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - return toString(lhs) + op + toString(rhs); + return (DOCTEST_STRINGIFY(lhs)) + op + (DOCTEST_STRINGIFY(rhs)); } #if DOCTEST_CLANG && DOCTEST_CLANG < DOCTEST_COMPILER(3, 6, 0) @@ -1209,25 +1413,16 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") // If not it doesn't find the operator or if the operator at global scope is defined after // this template, the template won't be instantiated due to SFINAE. Once the template is not // instantiated it can look for global operator using normal conversions. 
-#if !DOCTEST_ICC -#define SFINAE_OP(ret,op) decltype((void)(doctest::detail::declval() op doctest::detail::declval()),ret{}) -#else +#ifdef __NVCC__ #define SFINAE_OP(ret,op) ret +#else +#define SFINAE_OP(ret,op) decltype((void)(doctest::detail::declval() op doctest::detail::declval()),ret{}) #endif #define DOCTEST_DO_BINARY_EXPRESSION_COMPARISON(op, op_str, op_macro) \ template \ - DOCTEST_NOINLINE SFINAE_OP(Result,op) operator op(const R&& rhs) { \ - bool res = op_macro(doctest::detail::forward(lhs), doctest::detail::forward(rhs)); \ - if(m_at & assertType::is_false) \ - res = !res; \ - if(!res || doctest::getContextOptions()->success) \ - return Result(res, stringifyBinaryExpr(lhs, op_str, rhs)); \ - return Result(res); \ - } \ - template ::value , void >::type* = nullptr> \ - DOCTEST_NOINLINE SFINAE_OP(Result,op) operator op(const R& rhs) { \ - bool res = op_macro(doctest::detail::forward(lhs), doctest::detail::forward(rhs)); \ + DOCTEST_NOINLINE SFINAE_OP(Result,op) operator op(R&& rhs) { \ + bool res = op_macro(doctest::detail::forward(lhs), doctest::detail::forward(rhs)); \ if(m_at & assertType::is_false) \ res = !res; \ if(!res || doctest::getContextOptions()->success) \ @@ -1235,7 +1430,6 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") return Result(res); \ } - // more checks could be added - like in Catch: // https://github.com/catchorg/Catch2/pull/1480/files // https://github.com/catchorg/Catch2/pull/1481/files @@ -1247,12 +1441,12 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") return *this; \ } - struct DOCTEST_INTERFACE Result + struct DOCTEST_INTERFACE Result // NOLINT(*-member-init) { bool m_passed; String m_decomp; - Result() = default; + Result() = default; // TODO: Why do we need this? 
(To remove NOLINT) Result(bool passed, const String& decomposition = String()); // forbidding some expressions based on this table: https://en.cppreference.com/w/cpp/language/operator_precedence @@ -1309,8 +1503,7 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") #ifndef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING #define DOCTEST_COMPARISON_RETURN_TYPE bool #else // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING -#define DOCTEST_COMPARISON_RETURN_TYPE typename enable_if::value || can_use_op::value, bool>::type - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) +#define DOCTEST_COMPARISON_RETURN_TYPE typename types::enable_if::value || can_use_op::value, bool>::type inline bool eq(const char* lhs, const char* rhs) { return String(lhs) == String(rhs); } inline bool ne(const char* lhs, const char* rhs) { return String(lhs) != String(rhs); } inline bool lt(const char* lhs, const char* rhs) { return String(lhs) < String(rhs); } @@ -1358,28 +1551,26 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-comparison") assertType::Enum m_at; explicit Expression_lhs(L&& in, assertType::Enum at) - : lhs(doctest::detail::forward(in)) + : lhs(static_cast(in)) , m_at(at) {} DOCTEST_NOINLINE operator Result() { -DOCTEST_GCC_SUPPRESS_WARNING_WITH_PUSH("-Waddress") -// this is needed only for MSVC 2015: -// https://ci.appveyor.com/project/onqtam/doctest/builds/38181202 +// this is needed only for MSVC 2015 DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4800) // 'int': forcing value to bool bool res = static_cast(lhs); DOCTEST_MSVC_SUPPRESS_WARNING_POP -DOCTEST_GCC_SUPPRESS_WARNING_POP - if(m_at & assertType::is_false) //!OCLINT bitwise operator in conditional + if(m_at & assertType::is_false) { //!OCLINT bitwise operator in conditional res = !res; + } - if(!res || getContextOptions()->success) - return Result(res, toString(lhs)); - return Result(res); + if(!res || getContextOptions()->success) { + return { res, (DOCTEST_STRINGIFY(lhs)) }; + } + return { res }; } - /* This 
is required for user-defined conversions from Expression_lhs to L */ - //operator L() const { return lhs; } - operator L() const { return lhs; } + /* This is required for user-defined conversions from Expression_lhs to L */ + operator L() const { return lhs; } // clang-format off DOCTEST_DO_BINARY_EXPRESSION_COMPARISON(==, " == ", DOCTEST_CMP_EQ) //!OCLINT bitwise operator in conditional @@ -1436,12 +1627,12 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP // https://github.com/catchorg/Catch2/issues/870 // https://github.com/catchorg/Catch2/issues/565 template - Expression_lhs operator<<(const L &&operand) { - return Expression_lhs(doctest::detail::forward(operand), m_at); + Expression_lhs operator<<(L&& operand) { + return Expression_lhs(static_cast(operand), m_at); } - template ::value,void >::type* = nullptr> - Expression_lhs operator<<(const L &operand) { + template ::value,void >::type* = nullptr> + Expression_lhs operator<<(const L &operand) { return Expression_lhs(operand, m_at); } }; @@ -1467,25 +1658,28 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP } }; - typedef void (*funcType)(); + using funcType = void (*)(); struct DOCTEST_INTERFACE TestCase : public TestCaseData { funcType m_test; // a function pointer to the test case - const char* m_type; // for templated test cases - gets appended to the real name + String m_type; // for templated test cases - gets appended to the real name int m_template_id; // an ID used to distinguish between the different versions of a templated test case String m_full_name; // contains the name (only for templated test cases!) 
+ the template type TestCase(funcType test, const char* file, unsigned line, const TestSuite& test_suite, - const char* type = "", int template_id = -1); + const String& type = String(), int template_id = -1); TestCase(const TestCase& other); + TestCase(TestCase&&) = delete; DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(26434) // hides a non-virtual function TestCase& operator=(const TestCase& other); DOCTEST_MSVC_SUPPRESS_WARNING_POP + TestCase& operator=(TestCase&&) = delete; + TestCase& operator*(const char* in); template @@ -1495,6 +1689,8 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP } bool operator<(const TestCase& other) const; + + ~TestCase() = default; }; // forward declarations of functions used by the macros @@ -1534,27 +1730,36 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP struct DOCTEST_INTERFACE ResultBuilder : public AssertData { ResultBuilder(assertType::Enum at, const char* file, int line, const char* expr, - const char* exception_type = "", const char* exception_string = ""); + const char* exception_type = "", const String& exception_string = ""); + + ResultBuilder(assertType::Enum at, const char* file, int line, const char* expr, + const char* exception_type, const Contains& exception_string); void setResult(const Result& res); template - DOCTEST_NOINLINE void binary_assert(const DOCTEST_REF_WRAP(L) lhs, + DOCTEST_NOINLINE bool binary_assert(const DOCTEST_REF_WRAP(L) lhs, const DOCTEST_REF_WRAP(R) rhs) { m_failed = !RelationalComparator()(lhs, rhs); - if(m_failed || getContextOptions()->success) + if (m_failed || getContextOptions()->success) { m_decomp = stringifyBinaryExpr(lhs, ", ", rhs); + } + return !m_failed; } template - DOCTEST_NOINLINE void unary_assert(const DOCTEST_REF_WRAP(L) val) { + DOCTEST_NOINLINE bool unary_assert(const DOCTEST_REF_WRAP(L) val) { m_failed = !val; - if(m_at & assertType::is_false) //!OCLINT bitwise operator in conditional + if (m_at & assertType::is_false) { //!OCLINT bitwise operator in conditional m_failed = !m_failed; + } - 
if(m_failed || getContextOptions()->success) - m_decomp = toString(val); + if (m_failed || getContextOptions()->success) { + m_decomp = (DOCTEST_STRINGIFY(val)); + } + + return !m_failed; } void translateException(); @@ -1574,8 +1779,8 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP DOCTEST_INTERFACE void failed_out_of_a_testing_context(const AssertData& ad); - DOCTEST_INTERFACE void decomp_assert(assertType::Enum at, const char* file, int line, - const char* expr, Result result); + DOCTEST_INTERFACE bool decomp_assert(assertType::Enum at, const char* file, int line, + const char* expr, const Result& result); #define DOCTEST_ASSERT_OUT_OF_TESTS(decomp) \ do { \ @@ -1590,7 +1795,7 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP if(checkIfShouldThrow(at)) \ throwException(); \ } \ - return; \ + return !failed; \ } \ } while(false) @@ -1605,7 +1810,7 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP throwException() template - DOCTEST_NOINLINE void binary_assert(assertType::Enum at, const char* file, int line, + DOCTEST_NOINLINE bool binary_assert(assertType::Enum at, const char* file, int line, const char* expr, const DOCTEST_REF_WRAP(L) lhs, const DOCTEST_REF_WRAP(R) rhs) { bool failed = !RelationalComparator()(lhs, rhs); @@ -1616,10 +1821,11 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP // ################################################################################### DOCTEST_ASSERT_OUT_OF_TESTS(stringifyBinaryExpr(lhs, ", ", rhs)); DOCTEST_ASSERT_IN_TESTS(stringifyBinaryExpr(lhs, ", ", rhs)); + return !failed; } template - DOCTEST_NOINLINE void unary_assert(assertType::Enum at, const char* file, int line, + DOCTEST_NOINLINE bool unary_assert(assertType::Enum at, const char* file, int line, const char* expr, const DOCTEST_REF_WRAP(L) val) { bool failed = !val; @@ -1630,14 +1836,14 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP // IF THE DEBUGGER BREAKS HERE - GO 1 LEVEL UP IN THE CALLSTACK FOR THE FAILING ASSERT // THIS IS THE EFFECT OF HAVING 'DOCTEST_CONFIG_SUPER_FAST_ASSERTS' DEFINED // 
################################################################################### - DOCTEST_ASSERT_OUT_OF_TESTS(toString(val)); - DOCTEST_ASSERT_IN_TESTS(toString(val)); + DOCTEST_ASSERT_OUT_OF_TESTS((DOCTEST_STRINGIFY(val))); + DOCTEST_ASSERT_IN_TESTS((DOCTEST_STRINGIFY(val))); + return !failed; } struct DOCTEST_INTERFACE IExceptionTranslator { - IExceptionTranslator(); - virtual ~IExceptionTranslator(); + DOCTEST_DECLARE_INTERFACE(IExceptionTranslator) virtual bool translate(String&) const = 0; }; @@ -1653,7 +1859,7 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP try { throw; // lgtm [cpp/rethrow-no-exception] // cppcheck-suppress catchExceptionByValue - } catch(T ex) { // NOLINT + } catch(const T& ex) { res = m_translateFunction(ex); //!OCLINT parameter reassignment return true; } catch(...) {} //!OCLINT - empty catch statement @@ -1668,64 +1874,19 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP DOCTEST_INTERFACE void registerExceptionTranslatorImpl(const IExceptionTranslator* et); - template - struct StringStreamBase - { - template - static void convert(std::ostream* s, const T& in) { - *s << toString(in); - } - - // always treat char* as a string in this context - no matter - // if DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING is defined - static void convert(std::ostream* s, const char* in) { *s << String(in); } - }; - - template <> - struct StringStreamBase - { - template - static void convert(std::ostream* s, const T& in) { - *s << in; - } - }; - - template - struct StringStream : public StringStreamBase::value> - {}; + // ContextScope base class used to allow implementing methods of ContextScope + // that don't depend on the template parameter in doctest.cpp. 
+ struct DOCTEST_INTERFACE ContextScopeBase : public IContextScope { + ContextScopeBase(const ContextScopeBase&) = delete; - template - void toStream(std::ostream* s, const T& value) { - StringStream::convert(s, value); - } + ContextScopeBase& operator=(const ContextScopeBase&) = delete; + ContextScopeBase& operator=(ContextScopeBase&&) = delete; -#ifdef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING - DOCTEST_INTERFACE void toStream(std::ostream* s, char* in); - DOCTEST_INTERFACE void toStream(std::ostream* s, const char* in); -#endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING - DOCTEST_INTERFACE void toStream(std::ostream* s, bool in); - DOCTEST_INTERFACE void toStream(std::ostream* s, float in); - DOCTEST_INTERFACE void toStream(std::ostream* s, double in); - DOCTEST_INTERFACE void toStream(std::ostream* s, double long in); - - DOCTEST_INTERFACE void toStream(std::ostream* s, char in); - DOCTEST_INTERFACE void toStream(std::ostream* s, char signed in); - DOCTEST_INTERFACE void toStream(std::ostream* s, char unsigned in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int short in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int short unsigned in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int unsigned in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int long in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int long unsigned in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int long long in); - DOCTEST_INTERFACE void toStream(std::ostream* s, int long long unsigned in); + ~ContextScopeBase() override = default; - // ContextScope base class used to allow implementing methods of ContextScope - // that don't depend on the template parameter in doctest.cpp. 
- class DOCTEST_INTERFACE ContextScopeBase : public IContextScope { protected: ContextScopeBase(); - ContextScopeBase(ContextScopeBase&& other); + ContextScopeBase(ContextScopeBase&& other) noexcept; void destroy(); bool need_to_destroy{true}; @@ -1733,12 +1894,17 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP template class ContextScope : public ContextScopeBase { - const L lambda_; + L lambda_; public: explicit ContextScope(const L &lambda) : lambda_(lambda) {} + explicit ContextScope(L&& lambda) : lambda_(static_cast(lambda)) { } - ContextScope(ContextScope &&other) : ContextScopeBase(static_cast(other)), lambda_(other.lambda_) {} + ContextScope(const ContextScope&) = delete; + ContextScope(ContextScope&&) noexcept = default; + + ContextScope& operator=(const ContextScope&) = delete; + ContextScope& operator=(ContextScope&&) = delete; void stringify(std::ostream* s) const override { lambda_(s); } @@ -1752,17 +1918,26 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP struct DOCTEST_INTERFACE MessageBuilder : public MessageData { std::ostream* m_stream; + bool logged = false; MessageBuilder(const char* file, int line, assertType::Enum severity); - MessageBuilder() = delete; + + MessageBuilder(const MessageBuilder&) = delete; + MessageBuilder(MessageBuilder&&) = delete; + + MessageBuilder& operator=(const MessageBuilder&) = delete; + MessageBuilder& operator=(MessageBuilder&&) = delete; + ~MessageBuilder(); // the preferred way of chaining parameters for stringification +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4866) template MessageBuilder& operator,(const T& in) { - toStream(m_stream, in); + *m_stream << (DOCTEST_STRINGIFY(in)); return *this; } +DOCTEST_MSVC_SUPPRESS_WARNING_POP // kept here just for backwards-compatibility - the comma operator should be preferred now template @@ -1831,7 +2006,7 @@ int registerExceptionTranslator(String (*)(T)) { #endif // DOCTEST_CONFIG_DISABLE namespace detail { - typedef void (*assert_handler)(const AssertData&); + using assert_handler = void 
(*)(const AssertData&); struct ContextState; } // namespace detail @@ -1844,7 +2019,13 @@ class DOCTEST_INTERFACE Context public: explicit Context(int argc = 0, const char* const* argv = nullptr); - ~Context(); + Context(const Context&) = delete; + Context(Context&&) = delete; + + Context& operator=(const Context&) = delete; + Context& operator=(Context&&) = delete; + + ~Context(); // NOLINT(performance-trivially-destructible) void applyCommandLine(int argc, const char* const* argv); @@ -1952,8 +2133,7 @@ struct DOCTEST_INTERFACE IReporter // or isn't in the execution range (between first and last) (safe to cache a pointer to the input) virtual void test_case_skipped(const TestCaseData&) = 0; - // doctest will not be managing the lifetimes of reporters given to it but this would still be nice to have - virtual ~IReporter(); + DOCTEST_DECLARE_INTERFACE(IReporter) // can obtain all currently active contexts and stringify them if one wishes to do so static int get_num_active_contexts(); @@ -1965,7 +2145,7 @@ struct DOCTEST_INTERFACE IReporter }; namespace detail { - typedef IReporter* (*reporterCreatorFunc)(const ContextOptions&); + using reporterCreatorFunc = IReporter* (*)(const ContextOptions&); DOCTEST_INTERFACE void registerReporterImpl(const char* name, int prio, reporterCreatorFunc c, bool isReporter); @@ -1982,14 +2162,30 @@ int registerReporter(const char* name, int priority, bool isReporter) { } } // namespace doctest +#ifdef DOCTEST_CONFIG_ASSERTS_RETURN_VALUES +#define DOCTEST_FUNC_EMPTY [] { return false; }() +#else +#define DOCTEST_FUNC_EMPTY (void)0 +#endif + // if registering is not disabled -#if !defined(DOCTEST_CONFIG_DISABLE) +#ifndef DOCTEST_CONFIG_DISABLE + +#ifdef DOCTEST_CONFIG_ASSERTS_RETURN_VALUES +#define DOCTEST_FUNC_SCOPE_BEGIN [&] +#define DOCTEST_FUNC_SCOPE_END () +#define DOCTEST_FUNC_SCOPE_RET(v) return v +#else +#define DOCTEST_FUNC_SCOPE_BEGIN do +#define DOCTEST_FUNC_SCOPE_END while(false) +#define DOCTEST_FUNC_SCOPE_RET(v) (void)0 
+#endif // common code in asserts - for convenience -#define DOCTEST_ASSERT_LOG_AND_REACT(b) \ - if(b.log()) \ - DOCTEST_BREAK_INTO_DEBUGGER(); \ - b.react() +#define DOCTEST_ASSERT_LOG_REACT_RETURN(b) \ + if(b.log()) DOCTEST_BREAK_INTO_DEBUGGER(); \ + b.react(); \ + DOCTEST_FUNC_SCOPE_RET(!b.m_failed) #ifdef DOCTEST_CONFIG_NO_TRY_CATCH_IN_ASSERTS #define DOCTEST_WRAP_IN_TRY(x) x; @@ -2011,27 +2207,26 @@ int registerReporter(const char* name, int priority, bool isReporter) { // registers the test by initializing a dummy var with a function #define DOCTEST_REGISTER_FUNCTION(global_prefix, f, decorators) \ - global_prefix DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_)) = \ + global_prefix DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_), /* NOLINT */ \ doctest::detail::regTest( \ doctest::detail::TestCase( \ f, __FILE__, __LINE__, \ doctest_detail_test_suite_ns::getCurrentTestSuite()) * \ - decorators); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() + decorators)) #define DOCTEST_IMPLEMENT_FIXTURE(der, base, func, decorators) \ - namespace { \ + namespace { /* NOLINT */ \ struct der : public base \ { \ void f(); \ }; \ - static void func() { \ + static DOCTEST_INLINE_NOINLINE void func() { \ der v; \ v.f(); \ } \ DOCTEST_REGISTER_FUNCTION(DOCTEST_EMPTY, func, decorators) \ } \ - inline DOCTEST_NOINLINE void der::f() + DOCTEST_INLINE_NOINLINE void der::f() // NOLINT(misc-definitions-in-headers) #define DOCTEST_CREATE_AND_REGISTER_FUNCTION(f, decorators) \ static void f(); \ @@ -2040,7 +2235,7 @@ int registerReporter(const char* name, int priority, bool isReporter) { #define DOCTEST_CREATE_AND_REGISTER_FUNCTION_IN_CLASS(f, proxy, decorators) \ static doctest::detail::funcType proxy() { return f; } \ - DOCTEST_REGISTER_FUNCTION(inline, proxy(), decorators) \ + DOCTEST_REGISTER_FUNCTION(inline, proxy(), decorators) \ static void f() // for registering tests @@ -2048,7 +2243,7 @@ int registerReporter(const char* name, int priority, bool isReporter) { 
DOCTEST_CREATE_AND_REGISTER_FUNCTION(DOCTEST_ANONYMOUS(DOCTEST_ANON_FUNC_), decorators) // for registering tests in classes - requires C++17 for inline variables! -#if __cplusplus >= 201703L || (DOCTEST_MSVC >= DOCTEST_COMPILER(19, 12, 0) && _MSVC_LANG >= 201703L) +#if DOCTEST_CPLUSPLUS >= 201703L #define DOCTEST_TEST_CASE_CLASS(decorators) \ DOCTEST_CREATE_AND_REGISTER_FUNCTION_IN_CLASS(DOCTEST_ANONYMOUS(DOCTEST_ANON_FUNC_), \ DOCTEST_ANONYMOUS(DOCTEST_ANON_PROXY_), \ @@ -2064,22 +2259,21 @@ int registerReporter(const char* name, int priority, bool isReporter) { DOCTEST_ANONYMOUS(DOCTEST_ANON_FUNC_), decorators) // for converting types to strings without the header and demangling -#define DOCTEST_TYPE_TO_STRING_IMPL(...) \ - template <> \ - inline const char* type_to_string<__VA_ARGS__>() { \ - return "<" #__VA_ARGS__ ">"; \ - } -#define DOCTEST_TYPE_TO_STRING(...) \ - namespace doctest { namespace detail { \ - DOCTEST_TYPE_TO_STRING_IMPL(__VA_ARGS__) \ +#define DOCTEST_TYPE_TO_STRING_AS(str, ...) \ + namespace doctest { \ + template <> \ + inline String toString<__VA_ARGS__>() { \ + return str; \ } \ } \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + static_assert(true, "") + +#define DOCTEST_TYPE_TO_STRING(...) 
DOCTEST_TYPE_TO_STRING_AS(#__VA_ARGS__, __VA_ARGS__) #define DOCTEST_TEST_CASE_TEMPLATE_DEFINE_IMPL(dec, T, iter, func) \ template \ static void func(); \ - namespace { \ + namespace { /* NOLINT */ \ template \ struct iter; \ template \ @@ -2088,7 +2282,7 @@ int registerReporter(const char* name, int priority, bool isReporter) { iter(const char* file, unsigned line, int index) { \ doctest::detail::regTest(doctest::detail::TestCase(func, file, line, \ doctest_detail_test_suite_ns::getCurrentTestSuite(), \ - doctest::detail::type_to_string(), \ + doctest::toString(), \ int(line) * 1000 + index) \ * dec); \ iter>(file, line, index + 1); \ @@ -2108,17 +2302,17 @@ int registerReporter(const char* name, int priority, bool isReporter) { DOCTEST_ANONYMOUS(DOCTEST_ANON_TMP_)) #define DOCTEST_TEST_CASE_TEMPLATE_INSTANTIATE_IMPL(id, anon, ...) \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_CAT(anon, DUMMY)) = \ - doctest::detail::instantiationHelper(DOCTEST_CAT(id, ITERATOR)<__VA_ARGS__>(__FILE__, __LINE__, 0));\ - DOCTEST_GLOBAL_NO_WARNINGS_END() + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_CAT(anon, DUMMY), /* NOLINT(cert-err58-cpp, fuchsia-statically-constructed-objects) */ \ + doctest::detail::instantiationHelper( \ + DOCTEST_CAT(id, ITERATOR)<__VA_ARGS__>(__FILE__, __LINE__, 0))) #define DOCTEST_TEST_CASE_TEMPLATE_INVOKE(id, ...) \ DOCTEST_TEST_CASE_TEMPLATE_INSTANTIATE_IMPL(id, DOCTEST_ANONYMOUS(DOCTEST_ANON_TMP_), std::tuple<__VA_ARGS__>) \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + static_assert(true, "") #define DOCTEST_TEST_CASE_TEMPLATE_APPLY(id, ...) \ DOCTEST_TEST_CASE_TEMPLATE_INSTANTIATE_IMPL(id, DOCTEST_ANONYMOUS(DOCTEST_ANON_TMP_), __VA_ARGS__) \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + static_assert(true, "") #define DOCTEST_TEST_CASE_TEMPLATE_IMPL(dec, T, anon, ...) 
\ DOCTEST_TEST_CASE_TEMPLATE_DEFINE_IMPL(dec, T, DOCTEST_CAT(anon, ITERATOR), anon); \ @@ -2137,7 +2331,7 @@ int registerReporter(const char* name, int priority, bool isReporter) { // for grouping tests in test suites by using code blocks #define DOCTEST_TEST_SUITE_IMPL(decorators, ns_name) \ namespace ns_name { namespace doctest_detail_test_suite_ns { \ - static DOCTEST_NOINLINE doctest::detail::TestSuite& getCurrentTestSuite() { \ + static DOCTEST_NOINLINE doctest::detail::TestSuite& getCurrentTestSuite() noexcept { \ DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4640) \ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wexit-time-destructors") \ DOCTEST_GCC_SUPPRESS_WARNING_WITH_PUSH("-Wmissing-field-initializers") \ @@ -2161,24 +2355,21 @@ int registerReporter(const char* name, int priority, bool isReporter) { // for starting a testsuite block #define DOCTEST_TEST_SUITE_BEGIN(decorators) \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_)) = \ - doctest::detail::setTestSuite(doctest::detail::TestSuite() * decorators); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_), /* NOLINT(cert-err58-cpp) */ \ + doctest::detail::setTestSuite(doctest::detail::TestSuite() * decorators)) \ + static_assert(true, "") // for ending a testsuite block #define DOCTEST_TEST_SUITE_END \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_)) = \ - doctest::detail::setTestSuite(doctest::detail::TestSuite() * ""); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_VAR_), /* NOLINT(cert-err58-cpp) */ \ + doctest::detail::setTestSuite(doctest::detail::TestSuite() * "")) \ + using DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) = int // for registering exception translators #define DOCTEST_REGISTER_EXCEPTION_TRANSLATOR_IMPL(translatorName, signature) \ 
inline doctest::String translatorName(signature); \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_TRANSLATOR_)) = \ - doctest::registerExceptionTranslator(translatorName); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() \ + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_TRANSLATOR_), /* NOLINT(cert-err58-cpp) */ \ + doctest::registerExceptionTranslator(translatorName)) \ doctest::String translatorName(signature) #define DOCTEST_REGISTER_EXCEPTION_TRANSLATOR(signature) \ @@ -2187,15 +2378,15 @@ int registerReporter(const char* name, int priority, bool isReporter) { // for registering reporters #define DOCTEST_REGISTER_REPORTER(name, priority, reporter) \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_REPORTER_)) = \ - doctest::registerReporter(name, priority, true); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_REPORTER_), /* NOLINT(cert-err58-cpp) */ \ + doctest::registerReporter(name, priority, true)) \ + static_assert(true, "") // for registering listeners #define DOCTEST_REGISTER_LISTENER(name, priority, reporter) \ - DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_REPORTER_)) = \ - doctest::registerReporter(name, priority, false); \ - DOCTEST_GLOBAL_NO_WARNINGS_END() typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) + DOCTEST_GLOBAL_NO_WARNINGS(DOCTEST_ANONYMOUS(DOCTEST_ANON_REPORTER_), /* NOLINT(cert-err58-cpp) */ \ + doctest::registerReporter(name, priority, false)) \ + static_assert(true, "") // clang-format off // for logging - disabling formatting because it's important to have these on 2 separate lines - see PR #557 @@ -2216,11 +2407,13 @@ int registerReporter(const char* name, int priority, bool isReporter) { #define DOCTEST_CAPTURE(x) DOCTEST_INFO(#x " := ", x) #define DOCTEST_ADD_AT_IMPL(type, file, line, mb, ...) 
\ - do { \ + DOCTEST_FUNC_SCOPE_BEGIN { \ doctest::detail::MessageBuilder mb(file, line, doctest::assertType::type); \ mb * __VA_ARGS__; \ - DOCTEST_ASSERT_LOG_AND_REACT(mb); \ - } while(false) + if(mb.log()) \ + DOCTEST_BREAK_INTO_DEBUGGER(); \ + mb.react(); \ + } DOCTEST_FUNC_SCOPE_END // clang-format off #define DOCTEST_ADD_MESSAGE_AT(file, line, ...) DOCTEST_ADD_AT_IMPL(is_warn, file, line, DOCTEST_ANONYMOUS(DOCTEST_MESSAGE_), __VA_ARGS__) @@ -2238,18 +2431,37 @@ int registerReporter(const char* name, int priority, bool isReporter) { #define DOCTEST_ASSERT_IMPLEMENT_2(assert_type, ...) \ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Woverloaded-shift-op-parentheses") \ + /* NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) */ \ doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ __LINE__, #__VA_ARGS__); \ DOCTEST_WRAP_IN_TRY(DOCTEST_RB.setResult( \ doctest::detail::ExpressionDecomposer(doctest::assertType::assert_type) \ - << __VA_ARGS__)) \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB) \ + << __VA_ARGS__)) /* NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) */ \ + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB) \ DOCTEST_CLANG_SUPPRESS_WARNING_POP #define DOCTEST_ASSERT_IMPLEMENT_1(assert_type, ...) \ - do { \ + DOCTEST_FUNC_SCOPE_BEGIN { \ DOCTEST_ASSERT_IMPLEMENT_2(assert_type, __VA_ARGS__); \ - } while(false) + } DOCTEST_FUNC_SCOPE_END // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) + +#define DOCTEST_BINARY_ASSERT(assert_type, comp, ...) \ + DOCTEST_FUNC_SCOPE_BEGIN { \ + doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ + __LINE__, #__VA_ARGS__); \ + DOCTEST_WRAP_IN_TRY( \ + DOCTEST_RB.binary_assert( \ + __VA_ARGS__)) \ + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB); \ + } DOCTEST_FUNC_SCOPE_END + +#define DOCTEST_UNARY_ASSERT(assert_type, ...) 
\ + DOCTEST_FUNC_SCOPE_BEGIN { \ + doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ + __LINE__, #__VA_ARGS__); \ + DOCTEST_WRAP_IN_TRY(DOCTEST_RB.unary_assert(__VA_ARGS__)) \ + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB); \ + } DOCTEST_FUNC_SCOPE_END #else // DOCTEST_CONFIG_SUPER_FAST_ASSERTS @@ -2263,6 +2475,14 @@ int registerReporter(const char* name, int priority, bool isReporter) { doctest::detail::ExpressionDecomposer(doctest::assertType::assert_type) \ << __VA_ARGS__) DOCTEST_CLANG_SUPPRESS_WARNING_POP +#define DOCTEST_BINARY_ASSERT(assert_type, comparison, ...) \ + doctest::detail::binary_assert( \ + doctest::assertType::assert_type, __FILE__, __LINE__, #__VA_ARGS__, __VA_ARGS__) + +#define DOCTEST_UNARY_ASSERT(assert_type, ...) \ + doctest::detail::unary_assert(doctest::assertType::assert_type, __FILE__, __LINE__, \ + #__VA_ARGS__, __VA_ARGS__) + #endif // DOCTEST_CONFIG_SUPER_FAST_ASSERTS #define DOCTEST_WARN(...) DOCTEST_ASSERT_IMPLEMENT_1(DT_WARN, __VA_ARGS__) @@ -2273,51 +2493,83 @@ int registerReporter(const char* name, int priority, bool isReporter) { #define DOCTEST_REQUIRE_FALSE(...) DOCTEST_ASSERT_IMPLEMENT_1(DT_REQUIRE_FALSE, __VA_ARGS__) // clang-format off -#define DOCTEST_WARN_MESSAGE(cond, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_WARN, cond); } while(false) -#define DOCTEST_CHECK_MESSAGE(cond, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_CHECK, cond); } while(false) -#define DOCTEST_REQUIRE_MESSAGE(cond, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_REQUIRE, cond); } while(false) -#define DOCTEST_WARN_FALSE_MESSAGE(cond, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_WARN_FALSE, cond); } while(false) -#define DOCTEST_CHECK_FALSE_MESSAGE(cond, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_CHECK_FALSE, cond); } while(false) -#define DOCTEST_REQUIRE_FALSE_MESSAGE(cond, ...) 
do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_REQUIRE_FALSE, cond); } while(false) +#define DOCTEST_WARN_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_WARN, cond); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_CHECK, cond); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_REQUIRE, cond); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_WARN_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_WARN_FALSE, cond); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_CHECK_FALSE, cond); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_ASSERT_IMPLEMENT_2(DT_REQUIRE_FALSE, cond); } DOCTEST_FUNC_SCOPE_END // clang-format on +#define DOCTEST_WARN_EQ(...) DOCTEST_BINARY_ASSERT(DT_WARN_EQ, eq, __VA_ARGS__) +#define DOCTEST_CHECK_EQ(...) DOCTEST_BINARY_ASSERT(DT_CHECK_EQ, eq, __VA_ARGS__) +#define DOCTEST_REQUIRE_EQ(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_EQ, eq, __VA_ARGS__) +#define DOCTEST_WARN_NE(...) DOCTEST_BINARY_ASSERT(DT_WARN_NE, ne, __VA_ARGS__) +#define DOCTEST_CHECK_NE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_NE, ne, __VA_ARGS__) +#define DOCTEST_REQUIRE_NE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_NE, ne, __VA_ARGS__) +#define DOCTEST_WARN_GT(...) DOCTEST_BINARY_ASSERT(DT_WARN_GT, gt, __VA_ARGS__) +#define DOCTEST_CHECK_GT(...) DOCTEST_BINARY_ASSERT(DT_CHECK_GT, gt, __VA_ARGS__) +#define DOCTEST_REQUIRE_GT(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_GT, gt, __VA_ARGS__) +#define DOCTEST_WARN_LT(...) 
DOCTEST_BINARY_ASSERT(DT_WARN_LT, lt, __VA_ARGS__) +#define DOCTEST_CHECK_LT(...) DOCTEST_BINARY_ASSERT(DT_CHECK_LT, lt, __VA_ARGS__) +#define DOCTEST_REQUIRE_LT(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_LT, lt, __VA_ARGS__) +#define DOCTEST_WARN_GE(...) DOCTEST_BINARY_ASSERT(DT_WARN_GE, ge, __VA_ARGS__) +#define DOCTEST_CHECK_GE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_GE, ge, __VA_ARGS__) +#define DOCTEST_REQUIRE_GE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_GE, ge, __VA_ARGS__) +#define DOCTEST_WARN_LE(...) DOCTEST_BINARY_ASSERT(DT_WARN_LE, le, __VA_ARGS__) +#define DOCTEST_CHECK_LE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_LE, le, __VA_ARGS__) +#define DOCTEST_REQUIRE_LE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_LE, le, __VA_ARGS__) + +#define DOCTEST_WARN_UNARY(...) DOCTEST_UNARY_ASSERT(DT_WARN_UNARY, __VA_ARGS__) +#define DOCTEST_CHECK_UNARY(...) DOCTEST_UNARY_ASSERT(DT_CHECK_UNARY, __VA_ARGS__) +#define DOCTEST_REQUIRE_UNARY(...) DOCTEST_UNARY_ASSERT(DT_REQUIRE_UNARY, __VA_ARGS__) +#define DOCTEST_WARN_UNARY_FALSE(...) DOCTEST_UNARY_ASSERT(DT_WARN_UNARY_FALSE, __VA_ARGS__) +#define DOCTEST_CHECK_UNARY_FALSE(...) DOCTEST_UNARY_ASSERT(DT_CHECK_UNARY_FALSE, __VA_ARGS__) +#define DOCTEST_REQUIRE_UNARY_FALSE(...) DOCTEST_UNARY_ASSERT(DT_REQUIRE_UNARY_FALSE, __VA_ARGS__) + +#ifndef DOCTEST_CONFIG_NO_EXCEPTIONS + #define DOCTEST_ASSERT_THROWS_AS(expr, assert_type, message, ...) \ - do { \ + DOCTEST_FUNC_SCOPE_BEGIN { \ if(!doctest::getContextOptions()->no_throw) { \ doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ __LINE__, #expr, #__VA_ARGS__, message); \ try { \ DOCTEST_CAST_TO_VOID(expr) \ - } catch(const typename doctest::detail::remove_const< \ - typename doctest::detail::remove_reference<__VA_ARGS__>::type>::type&) { \ + } catch(const typename doctest::detail::types::remove_const< \ + typename doctest::detail::types::remove_reference<__VA_ARGS__>::type>::type&) {\ DOCTEST_RB.translateException(); \ DOCTEST_RB.m_threw_as = true; \ } catch(...) 
{ DOCTEST_RB.translateException(); } \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB); \ + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB); \ + } else { /* NOLINT(*-else-after-return) */ \ + DOCTEST_FUNC_SCOPE_RET(false); \ } \ - } while(false) + } DOCTEST_FUNC_SCOPE_END #define DOCTEST_ASSERT_THROWS_WITH(expr, expr_str, assert_type, ...) \ - do { \ + DOCTEST_FUNC_SCOPE_BEGIN { \ if(!doctest::getContextOptions()->no_throw) { \ doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ __LINE__, expr_str, "", __VA_ARGS__); \ try { \ DOCTEST_CAST_TO_VOID(expr) \ } catch(...) { DOCTEST_RB.translateException(); } \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB); \ + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB); \ + } else { /* NOLINT(*-else-after-return) */ \ + DOCTEST_FUNC_SCOPE_RET(false); \ } \ - } while(false) + } DOCTEST_FUNC_SCOPE_END #define DOCTEST_ASSERT_NOTHROW(assert_type, ...) \ - do { \ + DOCTEST_FUNC_SCOPE_BEGIN { \ doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ __LINE__, #__VA_ARGS__); \ try { \ DOCTEST_CAST_TO_VOID(__VA_ARGS__) \ } catch(...) { DOCTEST_RB.translateException(); } \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB); \ - } while(false) + DOCTEST_ASSERT_LOG_REACT_RETURN(DOCTEST_RB); \ + } DOCTEST_FUNC_SCOPE_END // clang-format off #define DOCTEST_WARN_THROWS(...) DOCTEST_ASSERT_THROWS_WITH((__VA_ARGS__), #__VA_ARGS__, DT_WARN_THROWS, "") @@ -2340,166 +2592,23 @@ int registerReporter(const char* name, int priority, bool isReporter) { #define DOCTEST_CHECK_NOTHROW(...) DOCTEST_ASSERT_NOTHROW(DT_CHECK_NOTHROW, __VA_ARGS__) #define DOCTEST_REQUIRE_NOTHROW(...) DOCTEST_ASSERT_NOTHROW(DT_REQUIRE_NOTHROW, __VA_ARGS__) -#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS(expr); } while(false) -#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) 
do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS(expr); } while(false) -#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS(expr); } while(false) -#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_AS(expr, ex); } while(false) -#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_AS(expr, ex); } while(false) -#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_AS(expr, ex); } while(false) -#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_WITH(expr, with); } while(false) -#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_WITH(expr, with); } while(false) -#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_WITH(expr, with); } while(false) -#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_WITH_AS(expr, with, ex); } while(false) -#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ex); } while(false) -#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ex); } while(false) -#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_NOTHROW(expr); } while(false) -#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_NOTHROW(expr); } while(false) -#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) do { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_NOTHROW(expr); } while(false) +#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) 
DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS(expr); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS(expr); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS(expr); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_AS(expr, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_AS(expr, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_AS(expr, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_WITH(expr, with); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_WITH(expr, with); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_WITH(expr, with); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_THROWS_WITH_AS(expr, with, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) 
DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ex); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_WARN_NOTHROW(expr); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_CHECK_NOTHROW(expr); } DOCTEST_FUNC_SCOPE_END +#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_SCOPE_BEGIN { DOCTEST_INFO(__VA_ARGS__); DOCTEST_REQUIRE_NOTHROW(expr); } DOCTEST_FUNC_SCOPE_END // clang-format on -#ifndef DOCTEST_CONFIG_SUPER_FAST_ASSERTS - -#define DOCTEST_BINARY_ASSERT(assert_type, comp, ...) \ - do { \ - doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ - __LINE__, #__VA_ARGS__); \ - DOCTEST_WRAP_IN_TRY( \ - DOCTEST_RB.binary_assert( \ - __VA_ARGS__)) \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB); \ - } while(false) - -#define DOCTEST_UNARY_ASSERT(assert_type, ...) \ - do { \ - doctest::detail::ResultBuilder DOCTEST_RB(doctest::assertType::assert_type, __FILE__, \ - __LINE__, #__VA_ARGS__); \ - DOCTEST_WRAP_IN_TRY(DOCTEST_RB.unary_assert(__VA_ARGS__)) \ - DOCTEST_ASSERT_LOG_AND_REACT(DOCTEST_RB); \ - } while(false) - -#else // DOCTEST_CONFIG_SUPER_FAST_ASSERTS - -#define DOCTEST_BINARY_ASSERT(assert_type, comparison, ...) \ - doctest::detail::binary_assert( \ - doctest::assertType::assert_type, __FILE__, __LINE__, #__VA_ARGS__, __VA_ARGS__) - -#define DOCTEST_UNARY_ASSERT(assert_type, ...) \ - doctest::detail::unary_assert(doctest::assertType::assert_type, __FILE__, __LINE__, \ - #__VA_ARGS__, __VA_ARGS__) - -#endif // DOCTEST_CONFIG_SUPER_FAST_ASSERTS - -#define DOCTEST_WARN_EQ(...) DOCTEST_BINARY_ASSERT(DT_WARN_EQ, eq, __VA_ARGS__) -#define DOCTEST_CHECK_EQ(...) DOCTEST_BINARY_ASSERT(DT_CHECK_EQ, eq, __VA_ARGS__) -#define DOCTEST_REQUIRE_EQ(...) 
DOCTEST_BINARY_ASSERT(DT_REQUIRE_EQ, eq, __VA_ARGS__) -#define DOCTEST_WARN_NE(...) DOCTEST_BINARY_ASSERT(DT_WARN_NE, ne, __VA_ARGS__) -#define DOCTEST_CHECK_NE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_NE, ne, __VA_ARGS__) -#define DOCTEST_REQUIRE_NE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_NE, ne, __VA_ARGS__) -#define DOCTEST_WARN_GT(...) DOCTEST_BINARY_ASSERT(DT_WARN_GT, gt, __VA_ARGS__) -#define DOCTEST_CHECK_GT(...) DOCTEST_BINARY_ASSERT(DT_CHECK_GT, gt, __VA_ARGS__) -#define DOCTEST_REQUIRE_GT(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_GT, gt, __VA_ARGS__) -#define DOCTEST_WARN_LT(...) DOCTEST_BINARY_ASSERT(DT_WARN_LT, lt, __VA_ARGS__) -#define DOCTEST_CHECK_LT(...) DOCTEST_BINARY_ASSERT(DT_CHECK_LT, lt, __VA_ARGS__) -#define DOCTEST_REQUIRE_LT(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_LT, lt, __VA_ARGS__) -#define DOCTEST_WARN_GE(...) DOCTEST_BINARY_ASSERT(DT_WARN_GE, ge, __VA_ARGS__) -#define DOCTEST_CHECK_GE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_GE, ge, __VA_ARGS__) -#define DOCTEST_REQUIRE_GE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_GE, ge, __VA_ARGS__) -#define DOCTEST_WARN_LE(...) DOCTEST_BINARY_ASSERT(DT_WARN_LE, le, __VA_ARGS__) -#define DOCTEST_CHECK_LE(...) DOCTEST_BINARY_ASSERT(DT_CHECK_LE, le, __VA_ARGS__) -#define DOCTEST_REQUIRE_LE(...) DOCTEST_BINARY_ASSERT(DT_REQUIRE_LE, le, __VA_ARGS__) - -#define DOCTEST_WARN_UNARY(...) DOCTEST_UNARY_ASSERT(DT_WARN_UNARY, __VA_ARGS__) -#define DOCTEST_CHECK_UNARY(...) DOCTEST_UNARY_ASSERT(DT_CHECK_UNARY, __VA_ARGS__) -#define DOCTEST_REQUIRE_UNARY(...) DOCTEST_UNARY_ASSERT(DT_REQUIRE_UNARY, __VA_ARGS__) -#define DOCTEST_WARN_UNARY_FALSE(...) DOCTEST_UNARY_ASSERT(DT_WARN_UNARY_FALSE, __VA_ARGS__) -#define DOCTEST_CHECK_UNARY_FALSE(...) DOCTEST_UNARY_ASSERT(DT_CHECK_UNARY_FALSE, __VA_ARGS__) -#define DOCTEST_REQUIRE_UNARY_FALSE(...) 
DOCTEST_UNARY_ASSERT(DT_REQUIRE_UNARY_FALSE, __VA_ARGS__) - -#ifdef DOCTEST_CONFIG_NO_EXCEPTIONS - -#undef DOCTEST_WARN_THROWS -#undef DOCTEST_CHECK_THROWS -#undef DOCTEST_REQUIRE_THROWS -#undef DOCTEST_WARN_THROWS_AS -#undef DOCTEST_CHECK_THROWS_AS -#undef DOCTEST_REQUIRE_THROWS_AS -#undef DOCTEST_WARN_THROWS_WITH -#undef DOCTEST_CHECK_THROWS_WITH -#undef DOCTEST_REQUIRE_THROWS_WITH -#undef DOCTEST_WARN_THROWS_WITH_AS -#undef DOCTEST_CHECK_THROWS_WITH_AS -#undef DOCTEST_REQUIRE_THROWS_WITH_AS -#undef DOCTEST_WARN_NOTHROW -#undef DOCTEST_CHECK_NOTHROW -#undef DOCTEST_REQUIRE_NOTHROW - -#undef DOCTEST_WARN_THROWS_MESSAGE -#undef DOCTEST_CHECK_THROWS_MESSAGE -#undef DOCTEST_REQUIRE_THROWS_MESSAGE -#undef DOCTEST_WARN_THROWS_AS_MESSAGE -#undef DOCTEST_CHECK_THROWS_AS_MESSAGE -#undef DOCTEST_REQUIRE_THROWS_AS_MESSAGE -#undef DOCTEST_WARN_THROWS_WITH_MESSAGE -#undef DOCTEST_CHECK_THROWS_WITH_MESSAGE -#undef DOCTEST_REQUIRE_THROWS_WITH_MESSAGE -#undef DOCTEST_WARN_THROWS_WITH_AS_MESSAGE -#undef DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE -#undef DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE -#undef DOCTEST_WARN_NOTHROW_MESSAGE -#undef DOCTEST_CHECK_NOTHROW_MESSAGE -#undef DOCTEST_REQUIRE_NOTHROW_MESSAGE - -#ifdef DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS - -#define DOCTEST_WARN_THROWS(...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS(...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS(...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_AS(expr, with, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ...) 
(static_cast(0)) -#define DOCTEST_WARN_NOTHROW(...) (static_cast(0)) -#define DOCTEST_CHECK_NOTHROW(...) (static_cast(0)) -#define DOCTEST_REQUIRE_NOTHROW(...) (static_cast(0)) - -#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) 
(static_cast(0)) - -#else // DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS - -#undef DOCTEST_REQUIRE -#undef DOCTEST_REQUIRE_FALSE -#undef DOCTEST_REQUIRE_MESSAGE -#undef DOCTEST_REQUIRE_FALSE_MESSAGE -#undef DOCTEST_REQUIRE_EQ -#undef DOCTEST_REQUIRE_NE -#undef DOCTEST_REQUIRE_GT -#undef DOCTEST_REQUIRE_LT -#undef DOCTEST_REQUIRE_GE -#undef DOCTEST_REQUIRE_LE -#undef DOCTEST_REQUIRE_UNARY -#undef DOCTEST_REQUIRE_UNARY_FALSE - -#endif // DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS - #endif // DOCTEST_CONFIG_NO_EXCEPTIONS // ================================================================================================= @@ -2509,7 +2618,7 @@ int registerReporter(const char* name, int priority, bool isReporter) { #else // DOCTEST_CONFIG_DISABLE #define DOCTEST_IMPLEMENT_FIXTURE(der, base, func, name) \ - namespace { \ + namespace /* NOLINT */ { \ template \ struct der : public base \ { void f(); }; \ @@ -2535,8 +2644,8 @@ int registerReporter(const char* name, int priority, bool isReporter) { DOCTEST_ANONYMOUS(DOCTEST_ANON_FUNC_), name) // for converting types to strings without the header and demangling -#define DOCTEST_TYPE_TO_STRING(...) typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) -#define DOCTEST_TYPE_TO_STRING_IMPL(...) +#define DOCTEST_TYPE_TO_STRING_AS(str, ...) static_assert(true, "") +#define DOCTEST_TYPE_TO_STRING(...) static_assert(true, "") // for typed tests #define DOCTEST_TEST_CASE_TEMPLATE(name, type, ...) \ @@ -2547,113 +2656,283 @@ int registerReporter(const char* name, int priority, bool isReporter) { template \ inline void DOCTEST_ANONYMOUS(DOCTEST_ANON_TMP_)() -#define DOCTEST_TEST_CASE_TEMPLATE_INVOKE(id, ...) \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) +#define DOCTEST_TEST_CASE_TEMPLATE_INVOKE(id, ...) static_assert(true, "") +#define DOCTEST_TEST_CASE_TEMPLATE_APPLY(id, ...) 
static_assert(true, "") + +// for subcases +#define DOCTEST_SUBCASE(name) + +// for a testsuite block +#define DOCTEST_TEST_SUITE(name) namespace // NOLINT + +// for starting a testsuite block +#define DOCTEST_TEST_SUITE_BEGIN(name) static_assert(true, "") + +// for ending a testsuite block +#define DOCTEST_TEST_SUITE_END using DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) = int + +#define DOCTEST_REGISTER_EXCEPTION_TRANSLATOR(signature) \ + template \ + static inline doctest::String DOCTEST_ANONYMOUS(DOCTEST_ANON_TRANSLATOR_)(signature) + +#define DOCTEST_REGISTER_REPORTER(name, priority, reporter) +#define DOCTEST_REGISTER_LISTENER(name, priority, reporter) + +#define DOCTEST_INFO(...) (static_cast(0)) +#define DOCTEST_CAPTURE(x) (static_cast(0)) +#define DOCTEST_ADD_MESSAGE_AT(file, line, ...) (static_cast(0)) +#define DOCTEST_ADD_FAIL_CHECK_AT(file, line, ...) (static_cast(0)) +#define DOCTEST_ADD_FAIL_AT(file, line, ...) (static_cast(0)) +#define DOCTEST_MESSAGE(...) (static_cast(0)) +#define DOCTEST_FAIL_CHECK(...) (static_cast(0)) +#define DOCTEST_FAIL(...) (static_cast(0)) + +#if defined(DOCTEST_CONFIG_EVALUATE_ASSERTS_EVEN_WHEN_DISABLED) \ + && defined(DOCTEST_CONFIG_ASSERTS_RETURN_VALUES) + +#define DOCTEST_WARN(...) [&] { return __VA_ARGS__; }() +#define DOCTEST_CHECK(...) [&] { return __VA_ARGS__; }() +#define DOCTEST_REQUIRE(...) [&] { return __VA_ARGS__; }() +#define DOCTEST_WARN_FALSE(...) [&] { return !(__VA_ARGS__); }() +#define DOCTEST_CHECK_FALSE(...) [&] { return !(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_FALSE(...) [&] { return !(__VA_ARGS__); }() + +#define DOCTEST_WARN_MESSAGE(cond, ...) [&] { return cond; }() +#define DOCTEST_CHECK_MESSAGE(cond, ...) [&] { return cond; }() +#define DOCTEST_REQUIRE_MESSAGE(cond, ...) [&] { return cond; }() +#define DOCTEST_WARN_FALSE_MESSAGE(cond, ...) [&] { return !(cond); }() +#define DOCTEST_CHECK_FALSE_MESSAGE(cond, ...) [&] { return !(cond); }() +#define DOCTEST_REQUIRE_FALSE_MESSAGE(cond, ...) 
[&] { return !(cond); }() + +namespace doctest { +namespace detail { +#define DOCTEST_RELATIONAL_OP(name, op) \ + template \ + bool name(const DOCTEST_REF_WRAP(L) lhs, const DOCTEST_REF_WRAP(R) rhs) { return lhs op rhs; } + + DOCTEST_RELATIONAL_OP(eq, ==) + DOCTEST_RELATIONAL_OP(ne, !=) + DOCTEST_RELATIONAL_OP(lt, <) + DOCTEST_RELATIONAL_OP(gt, >) + DOCTEST_RELATIONAL_OP(le, <=) + DOCTEST_RELATIONAL_OP(ge, >=) +} // namespace detail +} // namespace doctest + +#define DOCTEST_WARN_EQ(...) [&] { return doctest::detail::eq(__VA_ARGS__); }() +#define DOCTEST_CHECK_EQ(...) [&] { return doctest::detail::eq(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_EQ(...) [&] { return doctest::detail::eq(__VA_ARGS__); }() +#define DOCTEST_WARN_NE(...) [&] { return doctest::detail::ne(__VA_ARGS__); }() +#define DOCTEST_CHECK_NE(...) [&] { return doctest::detail::ne(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_NE(...) [&] { return doctest::detail::ne(__VA_ARGS__); }() +#define DOCTEST_WARN_LT(...) [&] { return doctest::detail::lt(__VA_ARGS__); }() +#define DOCTEST_CHECK_LT(...) [&] { return doctest::detail::lt(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_LT(...) [&] { return doctest::detail::lt(__VA_ARGS__); }() +#define DOCTEST_WARN_GT(...) [&] { return doctest::detail::gt(__VA_ARGS__); }() +#define DOCTEST_CHECK_GT(...) [&] { return doctest::detail::gt(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_GT(...) [&] { return doctest::detail::gt(__VA_ARGS__); }() +#define DOCTEST_WARN_LE(...) [&] { return doctest::detail::le(__VA_ARGS__); }() +#define DOCTEST_CHECK_LE(...) [&] { return doctest::detail::le(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_LE(...) [&] { return doctest::detail::le(__VA_ARGS__); }() +#define DOCTEST_WARN_GE(...) [&] { return doctest::detail::ge(__VA_ARGS__); }() +#define DOCTEST_CHECK_GE(...) [&] { return doctest::detail::ge(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_GE(...) [&] { return doctest::detail::ge(__VA_ARGS__); }() +#define DOCTEST_WARN_UNARY(...) 
[&] { return __VA_ARGS__; }() +#define DOCTEST_CHECK_UNARY(...) [&] { return __VA_ARGS__; }() +#define DOCTEST_REQUIRE_UNARY(...) [&] { return __VA_ARGS__; }() +#define DOCTEST_WARN_UNARY_FALSE(...) [&] { return !(__VA_ARGS__); }() +#define DOCTEST_CHECK_UNARY_FALSE(...) [&] { return !(__VA_ARGS__); }() +#define DOCTEST_REQUIRE_UNARY_FALSE(...) [&] { return !(__VA_ARGS__); }() + +#ifndef DOCTEST_CONFIG_NO_EXCEPTIONS + +#define DOCTEST_WARN_THROWS_WITH(expr, with, ...) [] { static_assert(false, "Exception translation is not available when doctest is disabled."); return false; }() +#define DOCTEST_CHECK_THROWS_WITH(expr, with, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_REQUIRE_THROWS_WITH(expr, with, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_WARN_THROWS_WITH_AS(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) + +#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) +#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_WARN_THROWS_WITH(,,) + +#define DOCTEST_WARN_THROWS(...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_CHECK_THROWS(...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_REQUIRE_THROWS(...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_WARN_THROWS_AS(expr, ...) 
[&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_CHECK_THROWS_AS(expr, ...) [&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_REQUIRE_THROWS_AS(expr, ...) [&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_WARN_NOTHROW(...) [&] { try { __VA_ARGS__; return true; } catch (...) { return false; } }() +#define DOCTEST_CHECK_NOTHROW(...) [&] { try { __VA_ARGS__; return true; } catch (...) { return false; } }() +#define DOCTEST_REQUIRE_NOTHROW(...) [&] { try { __VA_ARGS__; return true; } catch (...) { return false; } }() + +#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return false; } catch (...) { return true; } }() +#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) [&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) [&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) [&] { try { expr; } catch (__VA_ARGS__) { return true; } catch (...) { } return false; }() +#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return true; } catch (...) { return false; } }() +#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return true; } catch (...) { return false; } }() +#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) [&] { try { __VA_ARGS__; return true; } catch (...) 
{ return false; } }() + +#endif // DOCTEST_CONFIG_NO_EXCEPTIONS + +#else // DOCTEST_CONFIG_EVALUATE_ASSERTS_EVEN_WHEN_DISABLED + +#define DOCTEST_WARN(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_FALSE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_FALSE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_FALSE(...) DOCTEST_FUNC_EMPTY + +#define DOCTEST_WARN_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_FALSE_MESSAGE(cond, ...) DOCTEST_FUNC_EMPTY + +#define DOCTEST_WARN_EQ(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_EQ(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_EQ(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_NE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_NE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_NE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_GT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_GT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_GT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_LT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_LT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_LT(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_GE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_GE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_GE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_LE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_LE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_LE(...) DOCTEST_FUNC_EMPTY + +#define DOCTEST_WARN_UNARY(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_UNARY(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_UNARY(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_UNARY_FALSE(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_UNARY_FALSE(...) 
DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_UNARY_FALSE(...) DOCTEST_FUNC_EMPTY + +#ifndef DOCTEST_CONFIG_NO_EXCEPTIONS + +#define DOCTEST_WARN_THROWS(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_AS(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_AS(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_AS(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_WITH(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_WITH(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_WITH(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_WITH_AS(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_NOTHROW(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_NOTHROW(...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_NOTHROW(...) DOCTEST_FUNC_EMPTY + +#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) 
DOCTEST_FUNC_EMPTY +#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY +#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) DOCTEST_FUNC_EMPTY -#define DOCTEST_TEST_CASE_TEMPLATE_APPLY(id, ...) \ - typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) +#endif // DOCTEST_CONFIG_NO_EXCEPTIONS -// for subcases -#define DOCTEST_SUBCASE(name) +#endif // DOCTEST_CONFIG_EVALUATE_ASSERTS_EVEN_WHEN_DISABLED -// for a testsuite block -#define DOCTEST_TEST_SUITE(name) namespace +#endif // DOCTEST_CONFIG_DISABLE -// for starting a testsuite block -#define DOCTEST_TEST_SUITE_BEGIN(name) typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) +#ifdef DOCTEST_CONFIG_NO_EXCEPTIONS -// for ending a testsuite block -#define DOCTEST_TEST_SUITE_END typedef int DOCTEST_ANONYMOUS(DOCTEST_ANON_FOR_SEMICOLON_) +#ifdef DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS +#define DOCTEST_EXCEPTION_EMPTY_FUNC DOCTEST_FUNC_EMPTY +#else // DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS +#define DOCTEST_EXCEPTION_EMPTY_FUNC [] { static_assert(false, "Exceptions are disabled! 
" \ + "Use DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS if you want to compile with exceptions disabled."); return false; }() -#define DOCTEST_REGISTER_EXCEPTION_TRANSLATOR(signature) \ - template \ - static inline doctest::String DOCTEST_ANONYMOUS(DOCTEST_ANON_TRANSLATOR_)(signature) +#undef DOCTEST_REQUIRE +#undef DOCTEST_REQUIRE_FALSE +#undef DOCTEST_REQUIRE_MESSAGE +#undef DOCTEST_REQUIRE_FALSE_MESSAGE +#undef DOCTEST_REQUIRE_EQ +#undef DOCTEST_REQUIRE_NE +#undef DOCTEST_REQUIRE_GT +#undef DOCTEST_REQUIRE_LT +#undef DOCTEST_REQUIRE_GE +#undef DOCTEST_REQUIRE_LE +#undef DOCTEST_REQUIRE_UNARY +#undef DOCTEST_REQUIRE_UNARY_FALSE -#define DOCTEST_REGISTER_REPORTER(name, priority, reporter) -#define DOCTEST_REGISTER_LISTENER(name, priority, reporter) +#define DOCTEST_REQUIRE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_FALSE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_MESSAGE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_FALSE_MESSAGE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_EQ DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_NE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_GT DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_LT DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_GE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_LE DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_UNARY DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_UNARY_FALSE DOCTEST_EXCEPTION_EMPTY_FUNC -#define DOCTEST_INFO(...) (static_cast(0)) -#define DOCTEST_CAPTURE(x) (static_cast(0)) -#define DOCTEST_ADD_MESSAGE_AT(file, line, ...) (static_cast(0)) -#define DOCTEST_ADD_FAIL_CHECK_AT(file, line, ...) (static_cast(0)) -#define DOCTEST_ADD_FAIL_AT(file, line, ...) (static_cast(0)) -#define DOCTEST_MESSAGE(...) (static_cast(0)) -#define DOCTEST_FAIL_CHECK(...) (static_cast(0)) -#define DOCTEST_FAIL(...) (static_cast(0)) +#endif // DOCTEST_CONFIG_NO_EXCEPTIONS_BUT_WITH_ALL_ASSERTS -#define DOCTEST_WARN(...) 
(static_cast(0)) -#define DOCTEST_CHECK(...) (static_cast(0)) -#define DOCTEST_REQUIRE(...) (static_cast(0)) -#define DOCTEST_WARN_FALSE(...) (static_cast(0)) -#define DOCTEST_CHECK_FALSE(...) (static_cast(0)) -#define DOCTEST_REQUIRE_FALSE(...) (static_cast(0)) - -#define DOCTEST_WARN_MESSAGE(cond, ...) (static_cast(0)) -#define DOCTEST_CHECK_MESSAGE(cond, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_MESSAGE(cond, ...) (static_cast(0)) -#define DOCTEST_WARN_FALSE_MESSAGE(cond, ...) (static_cast(0)) -#define DOCTEST_CHECK_FALSE_MESSAGE(cond, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_FALSE_MESSAGE(cond, ...) (static_cast(0)) - -#define DOCTEST_WARN_THROWS(...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS(...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS(...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_AS(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_AS(expr, with, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ...) (static_cast(0)) -#define DOCTEST_WARN_NOTHROW(...) (static_cast(0)) -#define DOCTEST_CHECK_NOTHROW(...) (static_cast(0)) -#define DOCTEST_REQUIRE_NOTHROW(...) (static_cast(0)) - -#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) 
(static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) (static_cast(0)) -#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) (static_cast(0)) -#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) (static_cast(0)) -#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) (static_cast(0)) - -#define DOCTEST_WARN_EQ(...) (static_cast(0)) -#define DOCTEST_CHECK_EQ(...) (static_cast(0)) -#define DOCTEST_REQUIRE_EQ(...) (static_cast(0)) -#define DOCTEST_WARN_NE(...) (static_cast(0)) -#define DOCTEST_CHECK_NE(...) (static_cast(0)) -#define DOCTEST_REQUIRE_NE(...) (static_cast(0)) -#define DOCTEST_WARN_GT(...) (static_cast(0)) -#define DOCTEST_CHECK_GT(...) (static_cast(0)) -#define DOCTEST_REQUIRE_GT(...) (static_cast(0)) -#define DOCTEST_WARN_LT(...) (static_cast(0)) -#define DOCTEST_CHECK_LT(...) (static_cast(0)) -#define DOCTEST_REQUIRE_LT(...) (static_cast(0)) -#define DOCTEST_WARN_GE(...) (static_cast(0)) -#define DOCTEST_CHECK_GE(...) (static_cast(0)) -#define DOCTEST_REQUIRE_GE(...) (static_cast(0)) -#define DOCTEST_WARN_LE(...) (static_cast(0)) -#define DOCTEST_CHECK_LE(...) (static_cast(0)) -#define DOCTEST_REQUIRE_LE(...) (static_cast(0)) - -#define DOCTEST_WARN_UNARY(...) (static_cast(0)) -#define DOCTEST_CHECK_UNARY(...) (static_cast(0)) -#define DOCTEST_REQUIRE_UNARY(...) (static_cast(0)) -#define DOCTEST_WARN_UNARY_FALSE(...) (static_cast(0)) -#define DOCTEST_CHECK_UNARY_FALSE(...) (static_cast(0)) -#define DOCTEST_REQUIRE_UNARY_FALSE(...) (static_cast(0)) +#define DOCTEST_WARN_THROWS(...) 
DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS(...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS(...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_AS(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_AS(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_AS(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_WITH(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_WITH(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_WITH(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_WITH_AS(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_WITH_AS(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_WITH_AS(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_NOTHROW(...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_NOTHROW(...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_NOTHROW(...) DOCTEST_EXCEPTION_EMPTY_FUNC + +#define DOCTEST_WARN_THROWS_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_AS_MESSAGE(expr, ex, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_WITH_MESSAGE(expr, with, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) 
DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_THROWS_WITH_AS_MESSAGE(expr, with, ex, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_WARN_NOTHROW_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_CHECK_NOTHROW_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC +#define DOCTEST_REQUIRE_NOTHROW_MESSAGE(expr, ...) DOCTEST_EXCEPTION_EMPTY_FUNC -#endif // DOCTEST_CONFIG_DISABLE +#endif // DOCTEST_CONFIG_NO_EXCEPTIONS // clang-format off // KEPT FOR BACKWARDS COMPATIBILITY - FORWARDING TO THE RIGHT MACROS @@ -2701,11 +2980,12 @@ int registerReporter(const char* name, int priority, bool isReporter) { // clang-format on // == SHORT VERSIONS OF THE MACROS -#if !defined(DOCTEST_CONFIG_NO_SHORT_MACRO_NAMES) +#ifndef DOCTEST_CONFIG_NO_SHORT_MACRO_NAMES #define TEST_CASE(name) DOCTEST_TEST_CASE(name) #define TEST_CASE_CLASS(name) DOCTEST_TEST_CASE_CLASS(name) #define TEST_CASE_FIXTURE(x, name) DOCTEST_TEST_CASE_FIXTURE(x, name) +#define TYPE_TO_STRING_AS(str, ...) DOCTEST_TYPE_TO_STRING_AS(str, __VA_ARGS__) #define TYPE_TO_STRING(...) DOCTEST_TYPE_TO_STRING(__VA_ARGS__) #define TEST_CASE_TEMPLATE(name, T, ...) 
DOCTEST_TEST_CASE_TEMPLATE(name, T, __VA_ARGS__) #define TEST_CASE_TEMPLATE_DEFINE(name, T, id) DOCTEST_TEST_CASE_TEMPLATE_DEFINE(name, T, id) @@ -2838,39 +3118,19 @@ int registerReporter(const char* name, int priority, bool isReporter) { #endif // DOCTEST_CONFIG_NO_SHORT_MACRO_NAMES -#if !defined(DOCTEST_CONFIG_DISABLE) +#ifndef DOCTEST_CONFIG_DISABLE // this is here to clear the 'current test suite' for the current translation unit - at the top DOCTEST_TEST_SUITE_END(); -// add stringification for primitive/fundamental types -namespace doctest { namespace detail { - DOCTEST_TYPE_TO_STRING_IMPL(bool) - DOCTEST_TYPE_TO_STRING_IMPL(float) - DOCTEST_TYPE_TO_STRING_IMPL(double) - DOCTEST_TYPE_TO_STRING_IMPL(long double) - DOCTEST_TYPE_TO_STRING_IMPL(char) - DOCTEST_TYPE_TO_STRING_IMPL(signed char) - DOCTEST_TYPE_TO_STRING_IMPL(unsigned char) -#if !DOCTEST_MSVC || defined(_NATIVE_WCHAR_T_DEFINED) - DOCTEST_TYPE_TO_STRING_IMPL(wchar_t) -#endif // not MSVC or wchar_t support enabled - DOCTEST_TYPE_TO_STRING_IMPL(short int) - DOCTEST_TYPE_TO_STRING_IMPL(unsigned short int) - DOCTEST_TYPE_TO_STRING_IMPL(int) - DOCTEST_TYPE_TO_STRING_IMPL(unsigned int) - DOCTEST_TYPE_TO_STRING_IMPL(long int) - DOCTEST_TYPE_TO_STRING_IMPL(unsigned long int) - DOCTEST_TYPE_TO_STRING_IMPL(long long int) - DOCTEST_TYPE_TO_STRING_IMPL(unsigned long long int) -}} // namespace doctest::detail - #endif // DOCTEST_CONFIG_DISABLE DOCTEST_CLANG_SUPPRESS_WARNING_POP DOCTEST_MSVC_SUPPRESS_WARNING_POP DOCTEST_GCC_SUPPRESS_WARNING_POP +DOCTEST_SUPPRESS_COMMON_WARNINGS_POP + #endif // DOCTEST_LIBRARY_INCLUDED #ifndef DOCTEST_SINGLE_HEADER @@ -2890,13 +3150,11 @@ DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wunused-macros") DOCTEST_CLANG_SUPPRESS_WARNING_POP +DOCTEST_SUPPRESS_COMMON_WARNINGS_PUSH + DOCTEST_CLANG_SUPPRESS_WARNING_PUSH -DOCTEST_CLANG_SUPPRESS_WARNING("-Wunknown-pragmas") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wpadded") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wweak-vtables") 
DOCTEST_CLANG_SUPPRESS_WARNING("-Wglobal-constructors") DOCTEST_CLANG_SUPPRESS_WARNING("-Wexit-time-destructors") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-prototypes") DOCTEST_CLANG_SUPPRESS_WARNING("-Wsign-conversion") DOCTEST_CLANG_SUPPRESS_WARNING("-Wshorten-64-to-32") DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-variable-declarations") @@ -2904,66 +3162,35 @@ DOCTEST_CLANG_SUPPRESS_WARNING("-Wswitch") DOCTEST_CLANG_SUPPRESS_WARNING("-Wswitch-enum") DOCTEST_CLANG_SUPPRESS_WARNING("-Wcovered-switch-default") DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-noreturn") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wunused-local-typedef") DOCTEST_CLANG_SUPPRESS_WARNING("-Wdisabled-macro-expansion") DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-braces") DOCTEST_CLANG_SUPPRESS_WARNING("-Wmissing-field-initializers") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat") -DOCTEST_CLANG_SUPPRESS_WARNING("-Wc++98-compat-pedantic") DOCTEST_CLANG_SUPPRESS_WARNING("-Wunused-member-function") DOCTEST_CLANG_SUPPRESS_WARNING("-Wnonportable-system-include-path") DOCTEST_GCC_SUPPRESS_WARNING_PUSH -DOCTEST_GCC_SUPPRESS_WARNING("-Wunknown-pragmas") -DOCTEST_GCC_SUPPRESS_WARNING("-Wpragmas") DOCTEST_GCC_SUPPRESS_WARNING("-Wconversion") -DOCTEST_GCC_SUPPRESS_WARNING("-Weffc++") DOCTEST_GCC_SUPPRESS_WARNING("-Wsign-conversion") -DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-overflow") -DOCTEST_GCC_SUPPRESS_WARNING("-Wstrict-aliasing") DOCTEST_GCC_SUPPRESS_WARNING("-Wmissing-field-initializers") DOCTEST_GCC_SUPPRESS_WARNING("-Wmissing-braces") -DOCTEST_GCC_SUPPRESS_WARNING("-Wmissing-declarations") DOCTEST_GCC_SUPPRESS_WARNING("-Wswitch") DOCTEST_GCC_SUPPRESS_WARNING("-Wswitch-enum") DOCTEST_GCC_SUPPRESS_WARNING("-Wswitch-default") DOCTEST_GCC_SUPPRESS_WARNING("-Wunsafe-loop-optimizations") DOCTEST_GCC_SUPPRESS_WARNING("-Wold-style-cast") -DOCTEST_GCC_SUPPRESS_WARNING("-Wunused-local-typedefs") -DOCTEST_GCC_SUPPRESS_WARNING("-Wuseless-cast") DOCTEST_GCC_SUPPRESS_WARNING("-Wunused-function") 
DOCTEST_GCC_SUPPRESS_WARNING("-Wmultiple-inheritance") -DOCTEST_GCC_SUPPRESS_WARNING("-Wnoexcept") DOCTEST_GCC_SUPPRESS_WARNING("-Wsuggest-attribute") DOCTEST_MSVC_SUPPRESS_WARNING_PUSH -DOCTEST_MSVC_SUPPRESS_WARNING(4616) // invalid compiler warning -DOCTEST_MSVC_SUPPRESS_WARNING(4619) // invalid compiler warning -DOCTEST_MSVC_SUPPRESS_WARNING(4996) // The compiler encountered a deprecated declaration DOCTEST_MSVC_SUPPRESS_WARNING(4267) // 'var' : conversion from 'x' to 'y', possible loss of data -DOCTEST_MSVC_SUPPRESS_WARNING(4706) // assignment within conditional expression -DOCTEST_MSVC_SUPPRESS_WARNING(4512) // 'class' : assignment operator could not be generated -DOCTEST_MSVC_SUPPRESS_WARNING(4127) // conditional expression is constant DOCTEST_MSVC_SUPPRESS_WARNING(4530) // C++ exception handler used, but unwind semantics not enabled DOCTEST_MSVC_SUPPRESS_WARNING(4577) // 'noexcept' used with no exception handling mode specified DOCTEST_MSVC_SUPPRESS_WARNING(4774) // format string expected in argument is not a string literal DOCTEST_MSVC_SUPPRESS_WARNING(4365) // conversion from 'int' to 'unsigned', signed/unsigned mismatch -DOCTEST_MSVC_SUPPRESS_WARNING(4820) // padding in structs -DOCTEST_MSVC_SUPPRESS_WARNING(4640) // construction of local static object is not thread-safe DOCTEST_MSVC_SUPPRESS_WARNING(5039) // pointer to potentially throwing function passed to extern C -DOCTEST_MSVC_SUPPRESS_WARNING(5045) // Spectre mitigation stuff -DOCTEST_MSVC_SUPPRESS_WARNING(4626) // assignment operator was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(5027) // move assignment operator was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(5026) // move constructor was implicitly defined as deleted -DOCTEST_MSVC_SUPPRESS_WARNING(4625) // copy constructor was implicitly defined as deleted DOCTEST_MSVC_SUPPRESS_WARNING(4800) // forcing value to bool 'true' or 'false' (performance warning) DOCTEST_MSVC_SUPPRESS_WARNING(5245) // unreferenced 
function with internal linkage has been removed -// static analysis -DOCTEST_MSVC_SUPPRESS_WARNING(26439) // This kind of function may not throw. Declare it 'noexcept' -DOCTEST_MSVC_SUPPRESS_WARNING(26495) // Always initialize a member variable -DOCTEST_MSVC_SUPPRESS_WARNING(26451) // Arithmetic overflow ... -DOCTEST_MSVC_SUPPRESS_WARNING(26444) // Avoid unnamed objects with custom construction and dtor... -DOCTEST_MSVC_SUPPRESS_WARNING(26812) // Prefer 'enum class' over 'enum' DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN @@ -2971,7 +3198,7 @@ DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN #include #include #include -// borland (Embarcadero) compiler requires math.h and not cmath - https://github.com/onqtam/doctest/pull/37 +// borland (Embarcadero) compiler requires math.h and not cmath - https://github.com/doctest/doctest/pull/37 #ifdef __BORLANDC__ #include #endif // __BORLANDC__ @@ -2983,20 +3210,33 @@ DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN #include #include #include +#ifndef DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM #include +#endif // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM #include #include #include +#ifndef DOCTEST_CONFIG_NO_MULTITHREADING #include #include +#define DOCTEST_DECLARE_MUTEX(name) std::mutex name; +#define DOCTEST_DECLARE_STATIC_MUTEX(name) static DOCTEST_DECLARE_MUTEX(name) +#define DOCTEST_LOCK_MUTEX(name) std::lock_guard DOCTEST_ANONYMOUS(DOCTEST_ANON_LOCK_)(name); +#else // DOCTEST_CONFIG_NO_MULTITHREADING +#define DOCTEST_DECLARE_MUTEX(name) +#define DOCTEST_DECLARE_STATIC_MUTEX(name) +#define DOCTEST_LOCK_MUTEX(name) +#endif // DOCTEST_CONFIG_NO_MULTITHREADING #include #include +#include #include #include #include #include #include #include +#include #ifdef DOCTEST_PLATFORM_MAC #include @@ -3009,9 +3249,11 @@ DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN // defines for a leaner windows.h #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#define DOCTEST_UNDEF_WIN32_LEAN_AND_MEAN #endif 
// WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX +#define DOCTEST_UNDEF_NOMINMAX #endif // NOMINMAX // not sure what AfxWin.h is for - here I do what Catch does @@ -3029,7 +3271,7 @@ DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_BEGIN #endif // DOCTEST_PLATFORM_WINDOWS -// this is a fix for https://github.com/onqtam/doctest/issues/348 +// this is a fix for https://github.com/doctest/doctest/issues/348 // https://mail.gnome.org/archives/xml/2012-January/msg00000.html #if !defined(HAVE_UNISTD_H) && !defined(STDOUT_FILENO) #define STDOUT_FILENO fileno(stdout) @@ -3051,7 +3293,7 @@ DOCTEST_MAKE_STD_HEADERS_CLEAN_FROM_WARNINGS_ON_WALL_END #endif #ifndef DOCTEST_THREAD_LOCAL -#if DOCTEST_MSVC && (DOCTEST_MSVC < DOCTEST_COMPILER(19, 0, 0)) +#if defined(DOCTEST_CONFIG_NO_MULTITHREADING) || DOCTEST_MSVC && (DOCTEST_MSVC < DOCTEST_COMPILER(19, 0, 0)) #define DOCTEST_THREAD_LOCAL #else // DOCTEST_MSVC #define DOCTEST_THREAD_LOCAL thread_local @@ -3086,6 +3328,30 @@ bool is_running_in_test = false; namespace { using namespace detail; + + template + DOCTEST_NORETURN void throw_exception(Ex const& e) { +#ifndef DOCTEST_CONFIG_NO_EXCEPTIONS + throw e; +#else // DOCTEST_CONFIG_NO_EXCEPTIONS +#ifdef DOCTEST_CONFIG_HANDLE_EXCEPTION + DOCTEST_CONFIG_HANDLE_EXCEPTION(e); +#else // DOCTEST_CONFIG_HANDLE_EXCEPTION +#ifndef DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM + std::cerr << "doctest will terminate because it needed to throw an exception.\n" + << "The message was: " << e.what() << '\n'; +#endif // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM +#endif // DOCTEST_CONFIG_HANDLE_EXCEPTION + std::terminate(); +#endif // DOCTEST_CONFIG_NO_EXCEPTIONS + } + +#ifndef DOCTEST_INTERNAL_ERROR +#define DOCTEST_INTERNAL_ERROR(msg) \ + throw_exception(std::logic_error( \ + __FILE__ ":" DOCTEST_TOSTR(__LINE__) ": Internal doctest error: " msg)) +#endif // DOCTEST_INTERNAL_ERROR + // case insensitive strcmp int stricmp(const char* a, const char* b) { for(;; a++, b++) { @@ -3095,20 +3361,6 @@ namespace 
{ } } - template - String fpToString(T value, int precision) { - std::ostringstream oss; - oss << std::setprecision(precision) << std::fixed << value; - std::string d = oss.str(); - size_t i = d.find_last_not_of('0'); - if(i != std::string::npos && i != d.size() - 1) { - if(d[i] == '.') - i++; - d = d.substr(0, i + 1); - } - return d.c_str(); - } - struct Endianness { enum Arch @@ -3129,40 +3381,36 @@ namespace { } // namespace namespace detail { - void my_memcpy(void* dest, const void* src, unsigned num) { memcpy(dest, src, num); } + class os_ostream { + std::vector stack; + std::stringstream ss; - String rawMemoryToString(const void* object, unsigned size) { - // Reverse order for little endian architectures - int i = 0, end = static_cast(size), inc = 1; - if(Endianness::which() == Endianness::Little) { - i = end - 1; - end = inc = -1; + public: + std::ostream* push() { + stack.push_back(ss.tellp()); + return &ss; } - unsigned const char* bytes = static_cast(object); - std::ostringstream oss; - oss << "0x" << std::setfill('0') << std::hex; - for(; i != end; i += inc) - oss << std::setw(2) << static_cast(bytes[i]); - return oss.str().c_str(); - } + String pop() { + if (stack.empty()) + DOCTEST_INTERNAL_ERROR("TLSS was empty when trying to pop!"); + + std::streampos pos = stack.back(); + stack.pop_back(); + unsigned sz = static_cast(ss.tellp() - pos); + ss.rdbuf()->pubseekpos(pos, std::ios::in | std::ios::out); + return String(ss, sz); + } + }; - DOCTEST_THREAD_LOCAL doctest_thread_local_wrapper wrapped_g_oss; // NOLINT(cert-err58-cpp) + DOCTEST_THREAD_LOCAL doctest_thread_local_wrapper wrapped_g_oss; - //reset default value is true. 
getTlsOss(bool reset=true); - std::ostream* getTlsOss(bool reset) { - auto& g_oss = wrapped_g_oss.get(); - if(reset) { - g_oss.clear(); // there shouldn't be anything worth clearing in the flags - g_oss.str(""); // the slow way of resetting a string stream - //g_oss.seekp(0); // optimal reset - as seen here: https://stackoverflow.com/a/624291/3162383 - } - return &g_oss; + std::ostream* tlssPush() { + return wrapped_g_oss.get().push(); } - String getTlsOssResult() { - //g_oss << std::ends; // needed - as shown here: https://stackoverflow.com/a/624291/3162383 - return wrapped_g_oss.get().str().c_str(); + String tlssPop() { + return wrapped_g_oss.get().pop(); } #ifndef DOCTEST_CONFIG_DISABLE @@ -3171,20 +3419,19 @@ namespace timer_large_integer { #if defined(DOCTEST_PLATFORM_WINDOWS) - typedef ULONGLONG type; + using type = ULONGLONG; #else // DOCTEST_PLATFORM_WINDOWS - using namespace std; - typedef uint64_t type; + using type = std::uint64_t; #endif // DOCTEST_PLATFORM_WINDOWS } -typedef timer_large_integer::type ticks_t; +using ticks_t = timer_large_integer::type; #ifdef DOCTEST_CONFIG_GETCURRENTTICKS ticks_t getCurrentTicks() { return DOCTEST_CONFIG_GETCURRENTTICKS(); } #elif defined(DOCTEST_PLATFORM_WINDOWS) ticks_t getCurrentTicks() { - static LARGE_INTEGER hz = {0}, hzo = {0}; + static LARGE_INTEGER hz = { {0} }, hzo = { {0} }; if(!hz.QuadPart) { QueryPerformanceFrequency(&hz); QueryPerformanceCounter(&hzo); @@ -3216,9 +3463,17 @@ typedef timer_large_integer::type ticks_t; ticks_t m_ticks = 0; }; -#ifdef DOCTEST_CONFIG_NO_MULTI_LANE_ATOMICS +#ifdef DOCTEST_CONFIG_NO_MULTITHREADING + template + using Atomic = T; +#else // DOCTEST_CONFIG_NO_MULTITHREADING + template + using Atomic = std::atomic; +#endif // DOCTEST_CONFIG_NO_MULTITHREADING + +#if defined(DOCTEST_CONFIG_NO_MULTI_LANE_ATOMICS) || defined(DOCTEST_CONFIG_NO_MULTITHREADING) template - using AtomicOrMultiLaneAtomic = std::atomic; + using MultiLaneAtomic = Atomic; #else // 
DOCTEST_CONFIG_NO_MULTI_LANE_ATOMICS // Provides a multilane implementation of an atomic variable that supports add, sub, load, // store. Instead of using a single atomic variable, this splits up into multiple ones, @@ -3235,8 +3490,8 @@ typedef timer_large_integer::type ticks_t; { struct CacheLineAlignedAtomic { - std::atomic atomic{}; - char padding[DOCTEST_MULTI_LANE_ATOMICS_CACHE_LINE_SIZE - sizeof(std::atomic)]; + Atomic atomic{}; + char padding[DOCTEST_MULTI_LANE_ATOMICS_CACHE_LINE_SIZE - sizeof(Atomic)]; }; CacheLineAlignedAtomic m_atomics[DOCTEST_MULTI_LANE_ATOMICS_THREAD_LANES]; @@ -3292,24 +3547,21 @@ typedef timer_large_integer::type ticks_t; // assigned in a round-robin fashion. // 3. This tlsLaneIdx is stored in the thread local data, so it is directly available with // little overhead. - std::atomic& myAtomic() DOCTEST_NOEXCEPT { - static std::atomic laneCounter; + Atomic& myAtomic() DOCTEST_NOEXCEPT { + static Atomic laneCounter; DOCTEST_THREAD_LOCAL size_t tlsLaneIdx = laneCounter++ % DOCTEST_MULTI_LANE_ATOMICS_THREAD_LANES; return m_atomics[tlsLaneIdx].atomic; } }; - - template - using AtomicOrMultiLaneAtomic = MultiLaneAtomic; #endif // DOCTEST_CONFIG_NO_MULTI_LANE_ATOMICS // this holds both parameters from the command line and runtime data for tests struct ContextState : ContextOptions, TestRunStats, CurrentTestCaseStats { - AtomicOrMultiLaneAtomic numAssertsCurrentTest_atomic; - AtomicOrMultiLaneAtomic numAssertsFailedCurrentTest_atomic; + MultiLaneAtomic numAssertsCurrentTest_atomic; + MultiLaneAtomic numAssertsFailedCurrentTest_atomic; std::vector> filters = decltype(filters)(9); // 9 different filters @@ -3322,11 +3574,12 @@ typedef timer_large_integer::type ticks_t; std::vector stringifiedContexts; // logging from INFO() due to an exception // stuff for subcases - std::vector subcasesStack; - std::set subcasesPassed; - int subcasesCurrentMaxLevel; - bool should_reenter; - std::atomic shouldLogCurrentException; + bool reachedLeaf; + 
std::vector subcaseStack; + std::vector nextSubcaseStack; + std::unordered_set fullyTraversedSubcases; + size_t currentSubcaseDepth; + Atomic shouldLogCurrentException; void resetRunData() { numTestCases = 0; @@ -3392,23 +3645,37 @@ typedef timer_large_integer::type ticks_t; #endif // DOCTEST_CONFIG_DISABLE } // namespace detail -void String::setOnHeap() { *reinterpret_cast(&buf[last]) = 128; } -void String::setLast(unsigned in) { buf[last] = char(in); } +char* String::allocate(size_type sz) { + if (sz <= last) { + buf[sz] = '\0'; + setLast(last - sz); + return buf; + } else { + setOnHeap(); + data.size = sz; + data.capacity = data.size + 1; + data.ptr = new char[data.capacity]; + data.ptr[sz] = '\0'; + return data.ptr; + } +} + +void String::setOnHeap() noexcept { *reinterpret_cast(&buf[last]) = 128; } +void String::setLast(size_type in) noexcept { buf[last] = char(in); } +void String::setSize(size_type sz) noexcept { + if (isOnStack()) { buf[sz] = '\0'; setLast(last - sz); } + else { data.ptr[sz] = '\0'; data.size = sz; } +} void String::copy(const String& other) { - using namespace std; if(other.isOnStack()) { memcpy(buf, other.buf, len); } else { - setOnHeap(); - data.size = other.data.size; - data.capacity = data.size + 1; - data.ptr = new char[data.capacity]; - memcpy(data.ptr, other.data.ptr, data.size + 1); + memcpy(allocate(other.data.size), other.data.ptr, other.data.size); } } -String::String() { +String::String() noexcept { buf[0] = '\0'; setLast(); } @@ -3416,26 +3683,17 @@ String::String() { String::~String() { if(!isOnStack()) delete[] data.ptr; - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) -} +} // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) String::String(const char* in) : String(in, strlen(in)) {} -String::String(const char* in, unsigned in_size) { - using namespace std; - if(in_size <= last) { - memcpy(buf, in, in_size); - buf[in_size] = '\0'; - setLast(last - in_size); - } else { - setOnHeap(); - data.size = in_size; - 
data.capacity = data.size + 1; - data.ptr = new char[data.capacity]; - memcpy(data.ptr, in, in_size); - data.ptr[in_size] = '\0'; - } +String::String(const char* in, size_type in_size) { + memcpy(allocate(in_size), in, in_size); +} + +String::String(std::istream& in, size_type in_size) { + in.read(allocate(in_size), in_size); } String::String(const String& other) { copy(other); } @@ -3452,10 +3710,9 @@ String& String::operator=(const String& other) { } String& String::operator+=(const String& other) { - const unsigned my_old_size = size(); - const unsigned other_size = other.size(); - const unsigned total_size = my_old_size + other_size; - using namespace std; + const size_type my_old_size = size(); + const size_type other_size = other.size(); + const size_type total_size = my_old_size + other_size; if(isOnStack()) { if(total_size < len) { // append to the current stack space @@ -3502,15 +3759,13 @@ String& String::operator+=(const String& other) { return *this; } -String::String(String&& other) { - using namespace std; +String::String(String&& other) noexcept { memcpy(buf, other.buf, len); other.buf[0] = '\0'; other.setLast(); } -String& String::operator=(String&& other) { - using namespace std; +String& String::operator=(String&& other) noexcept { if(this != &other) { if(!isOnStack()) delete[] data.ptr; @@ -3521,30 +3776,60 @@ String& String::operator=(String&& other) { return *this; } -char String::operator[](unsigned i) const { - return const_cast(this)->operator[](i); // NOLINT +char String::operator[](size_type i) const { + return const_cast(this)->operator[](i); } -char& String::operator[](unsigned i) { +char& String::operator[](size_type i) { if(isOnStack()) return reinterpret_cast(buf)[i]; return data.ptr[i]; } DOCTEST_GCC_SUPPRESS_WARNING_WITH_PUSH("-Wmaybe-uninitialized") -unsigned String::size() const { +String::size_type String::size() const { if(isOnStack()) - return last - (unsigned(buf[last]) & 31); // using "last" would work only if "len" is 32 + 
return last - (size_type(buf[last]) & 31); // using "last" would work only if "len" is 32 return data.size; } DOCTEST_GCC_SUPPRESS_WARNING_POP -unsigned String::capacity() const { +String::size_type String::capacity() const { if(isOnStack()) return len; return data.capacity; } +String String::substr(size_type pos, size_type cnt) && { + cnt = std::min(cnt, size() - 1 - pos); + char* cptr = c_str(); + memmove(cptr, cptr + pos, cnt); + setSize(cnt); + return std::move(*this); +} + +String String::substr(size_type pos, size_type cnt) const & { + cnt = std::min(cnt, size() - 1 - pos); + return String{ c_str() + pos, cnt }; +} + +String::size_type String::find(char ch, size_type pos) const { + const char* begin = c_str(); + const char* end = begin + size(); + const char* it = begin + pos; + for (; it < end && *it != ch; it++); + if (it < end) { return static_cast(it - begin); } + else { return npos; } +} + +String::size_type String::rfind(char ch, size_type pos) const { + const char* begin = c_str(); + const char* it = begin + std::min(pos, size() - 1); + for (; it >= begin && *it != ch; it--); + if (it >= begin) { return static_cast(it - begin); } + else { return npos; } +} + int String::compare(const char* other, bool no_case) const { if(no_case) return doctest::stricmp(c_str(), other); @@ -3555,20 +3840,32 @@ int String::compare(const String& other, bool no_case) const { return compare(other.c_str(), no_case); } -// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) String operator+(const String& lhs, const String& rhs) { return String(lhs) += rhs; } -// clang-format off bool operator==(const String& lhs, const String& rhs) { return lhs.compare(rhs) == 0; } bool operator!=(const String& lhs, const String& rhs) { return lhs.compare(rhs) != 0; } bool operator< (const String& lhs, const String& rhs) { return lhs.compare(rhs) < 0; } bool operator> (const String& lhs, const String& rhs) { return lhs.compare(rhs) > 0; } bool operator<=(const String& lhs, const String& 
rhs) { return (lhs != rhs) ? lhs.compare(rhs) < 0 : true; } bool operator>=(const String& lhs, const String& rhs) { return (lhs != rhs) ? lhs.compare(rhs) > 0 : true; } -// clang-format on std::ostream& operator<<(std::ostream& s, const String& in) { return s << in.c_str(); } +Contains::Contains(const String& str) : string(str) { } + +bool Contains::checkWith(const String& other) const { + return strstr(other.c_str(), string.c_str()) != nullptr; +} + +String toString(const Contains& in) { + return "Contains( " + in.string + " )"; +} + +bool operator==(const String& lhs, const Contains& rhs) { return rhs.checkWith(lhs); } +bool operator==(const Contains& lhs, const String& rhs) { return lhs.checkWith(rhs); } +bool operator!=(const String& lhs, const Contains& rhs) { return !rhs.checkWith(lhs); } +bool operator!=(const Contains& lhs, const String& rhs) { return !lhs.checkWith(rhs); } + namespace { void color_to_stream(std::ostream&, Color::Enum) DOCTEST_BRANCH_ON_DISABLED({}, ;) } // namespace @@ -3582,64 +3879,42 @@ namespace Color { // clang-format off const char* assertString(assertType::Enum at) { - DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4062) // enum 'x' in switch of enum 'y' is not handled - switch(at) { //!OCLINT missing default in switch statements - case assertType::DT_WARN : return "WARN"; - case assertType::DT_CHECK : return "CHECK"; - case assertType::DT_REQUIRE : return "REQUIRE"; - - case assertType::DT_WARN_FALSE : return "WARN_FALSE"; - case assertType::DT_CHECK_FALSE : return "CHECK_FALSE"; - case assertType::DT_REQUIRE_FALSE : return "REQUIRE_FALSE"; - - case assertType::DT_WARN_THROWS : return "WARN_THROWS"; - case assertType::DT_CHECK_THROWS : return "CHECK_THROWS"; - case assertType::DT_REQUIRE_THROWS : return "REQUIRE_THROWS"; - - case assertType::DT_WARN_THROWS_AS : return "WARN_THROWS_AS"; - case assertType::DT_CHECK_THROWS_AS : return "CHECK_THROWS_AS"; - case assertType::DT_REQUIRE_THROWS_AS : return "REQUIRE_THROWS_AS"; - - case 
assertType::DT_WARN_THROWS_WITH : return "WARN_THROWS_WITH"; - case assertType::DT_CHECK_THROWS_WITH : return "CHECK_THROWS_WITH"; - case assertType::DT_REQUIRE_THROWS_WITH : return "REQUIRE_THROWS_WITH"; - - case assertType::DT_WARN_THROWS_WITH_AS : return "WARN_THROWS_WITH_AS"; - case assertType::DT_CHECK_THROWS_WITH_AS : return "CHECK_THROWS_WITH_AS"; - case assertType::DT_REQUIRE_THROWS_WITH_AS : return "REQUIRE_THROWS_WITH_AS"; - - case assertType::DT_WARN_NOTHROW : return "WARN_NOTHROW"; - case assertType::DT_CHECK_NOTHROW : return "CHECK_NOTHROW"; - case assertType::DT_REQUIRE_NOTHROW : return "REQUIRE_NOTHROW"; - - case assertType::DT_WARN_EQ : return "WARN_EQ"; - case assertType::DT_CHECK_EQ : return "CHECK_EQ"; - case assertType::DT_REQUIRE_EQ : return "REQUIRE_EQ"; - case assertType::DT_WARN_NE : return "WARN_NE"; - case assertType::DT_CHECK_NE : return "CHECK_NE"; - case assertType::DT_REQUIRE_NE : return "REQUIRE_NE"; - case assertType::DT_WARN_GT : return "WARN_GT"; - case assertType::DT_CHECK_GT : return "CHECK_GT"; - case assertType::DT_REQUIRE_GT : return "REQUIRE_GT"; - case assertType::DT_WARN_LT : return "WARN_LT"; - case assertType::DT_CHECK_LT : return "CHECK_LT"; - case assertType::DT_REQUIRE_LT : return "REQUIRE_LT"; - case assertType::DT_WARN_GE : return "WARN_GE"; - case assertType::DT_CHECK_GE : return "CHECK_GE"; - case assertType::DT_REQUIRE_GE : return "REQUIRE_GE"; - case assertType::DT_WARN_LE : return "WARN_LE"; - case assertType::DT_CHECK_LE : return "CHECK_LE"; - case assertType::DT_REQUIRE_LE : return "REQUIRE_LE"; - - case assertType::DT_WARN_UNARY : return "WARN_UNARY"; - case assertType::DT_CHECK_UNARY : return "CHECK_UNARY"; - case assertType::DT_REQUIRE_UNARY : return "REQUIRE_UNARY"; - case assertType::DT_WARN_UNARY_FALSE : return "WARN_UNARY_FALSE"; - case assertType::DT_CHECK_UNARY_FALSE : return "CHECK_UNARY_FALSE"; - case assertType::DT_REQUIRE_UNARY_FALSE : return "REQUIRE_UNARY_FALSE"; + 
DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4061) // enum 'x' in switch of enum 'y' is not explicitly handled + #define DOCTEST_GENERATE_ASSERT_TYPE_CASE(assert_type) case assertType::DT_ ## assert_type: return #assert_type + #define DOCTEST_GENERATE_ASSERT_TYPE_CASES(assert_type) \ + DOCTEST_GENERATE_ASSERT_TYPE_CASE(WARN_ ## assert_type); \ + DOCTEST_GENERATE_ASSERT_TYPE_CASE(CHECK_ ## assert_type); \ + DOCTEST_GENERATE_ASSERT_TYPE_CASE(REQUIRE_ ## assert_type) + switch(at) { + DOCTEST_GENERATE_ASSERT_TYPE_CASE(WARN); + DOCTEST_GENERATE_ASSERT_TYPE_CASE(CHECK); + DOCTEST_GENERATE_ASSERT_TYPE_CASE(REQUIRE); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(FALSE); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(THROWS); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(THROWS_AS); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(THROWS_WITH); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(THROWS_WITH_AS); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(NOTHROW); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(EQ); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(NE); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(GT); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(LT); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(GE); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(LE); + + DOCTEST_GENERATE_ASSERT_TYPE_CASES(UNARY); + DOCTEST_GENERATE_ASSERT_TYPE_CASES(UNARY_FALSE); + + default: DOCTEST_INTERNAL_ERROR("Tried stringifying invalid assert type!"); } DOCTEST_MSVC_SUPPRESS_WARNING_POP - return ""; } // clang-format on @@ -3673,6 +3948,12 @@ const char* skipPathFromFilename(const char* file) { DOCTEST_CLANG_SUPPRESS_WARNING_POP DOCTEST_GCC_SUPPRESS_WARNING_POP +bool SubcaseSignature::operator==(const SubcaseSignature& other) const { + return m_line == other.m_line + && std::strcmp(m_file, other.m_file) == 0 + && m_name == other.m_name; +} + bool SubcaseSignature::operator<(const SubcaseSignature& other) const { if(m_line != other.m_line) return m_line < other.m_line; @@ -3681,47 +3962,53 @@ bool SubcaseSignature::operator<(const SubcaseSignature& other) const { return 
m_name.compare(other.m_name) < 0; } -IContextScope::IContextScope() = default; -IContextScope::~IContextScope() = default; +DOCTEST_DEFINE_INTERFACE(IContextScope) + +namespace detail { + void filldata::fill(std::ostream* stream, const void* in) { + if (in) { *stream << in; } + else { *stream << "nullptr"; } + } + + template + String toStreamLit(T t) { + std::ostream* os = tlssPush(); + os->operator<<(t); + return tlssPop(); + } +} #ifdef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING -String toString(char* in) { return toString(static_cast(in)); } -// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) String toString(const char* in) { return String("\"") + (in ? in : "{null string}") + "\""; } #endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING -String toString(bool in) { return in ? "true" : "false"; } -String toString(float in) { return fpToString(in, 5) + "f"; } -String toString(double in) { return fpToString(in, 10); } -String toString(double long in) { return fpToString(in, 15); } - -DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated-declarations") -#define DOCTEST_TO_STRING_OVERLOAD(type, fmt) \ - String toString(type in) { \ - char buf[64]; \ - std::sprintf(buf, fmt, in); \ - return buf; \ - } - -DOCTEST_TO_STRING_OVERLOAD(char, "%d") -DOCTEST_TO_STRING_OVERLOAD(char signed, "%d") -DOCTEST_TO_STRING_OVERLOAD(char unsigned, "%u") -DOCTEST_TO_STRING_OVERLOAD(int short, "%d") -DOCTEST_TO_STRING_OVERLOAD(int short unsigned, "%u") -DOCTEST_TO_STRING_OVERLOAD(int, "%d") -DOCTEST_TO_STRING_OVERLOAD(unsigned, "%u") -DOCTEST_TO_STRING_OVERLOAD(int long, "%ld") -DOCTEST_TO_STRING_OVERLOAD(int long unsigned, "%lu") -DOCTEST_TO_STRING_OVERLOAD(int long long, "%lld") -DOCTEST_TO_STRING_OVERLOAD(int long long unsigned, "%llu") -DOCTEST_CLANG_SUPPRESS_WARNING_POP - -String toString(std::nullptr_t) { return "NULL"; } #if DOCTEST_MSVC >= DOCTEST_COMPILER(19, 20, 0) -// see this issue on why this is needed: https://github.com/onqtam/doctest/issues/183 +// see this issue on 
why this is needed: https://github.com/doctest/doctest/issues/183 String toString(const std::string& in) { return in.c_str(); } #endif // VS 2019 +String toString(String in) { return in; } + +String toString(std::nullptr_t) { return "nullptr"; } + +String toString(bool in) { return in ? "true" : "false"; } + +String toString(float in) { return toStreamLit(in); } +String toString(double in) { return toStreamLit(in); } +String toString(double long in) { return toStreamLit(in); } + +String toString(char in) { return toStreamLit(static_cast(in)); } +String toString(char signed in) { return toStreamLit(static_cast(in)); } +String toString(char unsigned in) { return toStreamLit(static_cast(in)); } +String toString(short in) { return toStreamLit(in); } +String toString(short unsigned in) { return toStreamLit(in); } +String toString(signed in) { return toStreamLit(in); } +String toString(unsigned in) { return toStreamLit(in); } +String toString(long in) { return toStreamLit(in); } +String toString(long unsigned in) { return toStreamLit(in); } +String toString(long long in) { return toStreamLit(in); } +String toString(long long unsigned in) { return toStreamLit(in); } + Approx::Approx(double value) : m_epsilon(static_cast(std::numeric_limits::epsilon()) * 100) , m_scale(1.0) @@ -3761,11 +4048,25 @@ bool operator>(double lhs, const Approx& rhs) { return lhs > rhs.m_value && lhs bool operator>(const Approx& lhs, double rhs) { return lhs.m_value > rhs && lhs != rhs; } String toString(const Approx& in) { - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) return "Approx( " + doctest::toString(in.m_value) + " )"; } const ContextOptions* getContextOptions() { return DOCTEST_BRANCH_ON_DISABLED(nullptr, g_cs); } +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4738) +template +IsNaN::operator bool() const { + return std::isnan(value) ^ flipped; +} +DOCTEST_MSVC_SUPPRESS_WARNING_POP +template struct DOCTEST_INTERFACE_DEF IsNaN; +template struct DOCTEST_INTERFACE_DEF IsNaN; 
+template struct DOCTEST_INTERFACE_DEF IsNaN; +template +String toString(IsNaN in) { return String(in.flipped ? "! " : "") + "IsNaN( " + doctest::toString(in.value) + " )"; } +String toString(IsNaN in) { return toString(in); } +String toString(IsNaN in) { return toString(in); } +String toString(IsNaN in) { return toString(in); } + } // namespace doctest #ifdef DOCTEST_CONFIG_DISABLE @@ -3781,11 +4082,9 @@ void Context::setOption(const char*, const char*) {} bool Context::shouldExit() { return false; } void Context::setAsDefaultForAssertsOutOfTestCases() {} void Context::setAssertHandler(detail::assert_handler) {} -void Context::setCout(std::ostream* out) {} +void Context::setCout(std::ostream*) {} int Context::run() { return 0; } -IReporter::~IReporter() = default; - int IReporter::get_num_active_contexts() { return 0; } const IContextScope* const* IReporter::get_active_contexts() { return nullptr; } int IReporter::get_num_stringified_contexts() { return 0; } @@ -3818,7 +4117,7 @@ namespace doctest { namespace { // the int (priority) is part of the key for automatic sorting - sadly one can register a // reporter with a duplicate name and a different priority but hopefully that won't happen often :| - typedef std::map, reporterCreatorFunc> reporterMap; + using reporterMap = std::map, reporterCreatorFunc>; reporterMap& getReporters() { static reporterMap data; @@ -3850,8 +4149,8 @@ namespace detail { #ifndef DOCTEST_CONFIG_NO_EXCEPTIONS DOCTEST_NORETURN void throwException() { g_cs->shouldLogCurrentException = false; - throw TestFailureException(); - } // NOLINT(cert-err60-cpp) + throw TestFailureException(); // NOLINT(hicpp-exception-baseclass) + } #else // DOCTEST_CONFIG_NO_EXCEPTIONS void throwException() {} #endif // DOCTEST_CONFIG_NO_EXCEPTIONS @@ -3897,59 +4196,94 @@ namespace { return !*wild; } - //// C string hash function (djb2) - taken from http://www.cse.yorku.ca/~oz/hash.html - //unsigned hashStr(unsigned const char* str) { - // unsigned long hash = 5381; 
- // char c; - // while((c = *str++)) - // hash = ((hash << 5) + hash) + c; // hash * 33 + c - // return hash; - //} - // checks if the name matches any of the filters (and can be configured what to do when empty) bool matchesAny(const char* name, const std::vector& filters, bool matchEmpty, - bool caseSensitive) { - if(filters.empty() && matchEmpty) + bool caseSensitive) { + if (filters.empty() && matchEmpty) return true; - for(auto& curr : filters) - if(wildcmp(name, curr.c_str(), caseSensitive)) + for (auto& curr : filters) + if (wildcmp(name, curr.c_str(), caseSensitive)) return true; return false; } -} // namespace -namespace detail { - Subcase::Subcase(const String& name, const char* file, int line) - : m_signature({name, file, line}) { - auto* s = g_cs; + DOCTEST_NO_SANITIZE_INTEGER + unsigned long long hash(unsigned long long a, unsigned long long b) { + return (a << 5) + b; + } - // check subcase filters - if(s->subcasesStack.size() < size_t(s->subcase_filter_levels)) { - if(!matchesAny(m_signature.m_name.c_str(), s->filters[6], true, s->case_sensitive)) - return; - if(matchesAny(m_signature.m_name.c_str(), s->filters[7], false, s->case_sensitive)) - return; - } + // C string hash function (djb2) - taken from http://www.cse.yorku.ca/~oz/hash.html + DOCTEST_NO_SANITIZE_INTEGER + unsigned long long hash(const char* str) { + unsigned long long hash = 5381; + char c; + while ((c = *str++)) + hash = ((hash << 5) + hash) + c; // hash * 33 + c + return hash; + } - // if a Subcase on the same level has already been entered - if(s->subcasesStack.size() < size_t(s->subcasesCurrentMaxLevel)) { - s->should_reenter = true; - return; - } + unsigned long long hash(const SubcaseSignature& sig) { + return hash(hash(hash(sig.m_file), hash(sig.m_name.c_str())), sig.m_line); + } - // push the current signature to the stack so we can check if the - // current stack + the current new subcase have been traversed - s->subcasesStack.push_back(m_signature); - 
if(s->subcasesPassed.count(s->subcasesStack) != 0) { - // pop - revert to previous stack since we've already passed this - s->subcasesStack.pop_back(); - return; + unsigned long long hash(const std::vector& sigs, size_t count) { + unsigned long long running = 0; + auto end = sigs.begin() + count; + for (auto it = sigs.begin(); it != end; it++) { + running = hash(running, hash(*it)); } + return running; + } - s->subcasesCurrentMaxLevel = s->subcasesStack.size(); - m_entered = true; + unsigned long long hash(const std::vector& sigs) { + unsigned long long running = 0; + for (const SubcaseSignature& sig : sigs) { + running = hash(running, hash(sig)); + } + return running; + } +} // namespace +namespace detail { + bool Subcase::checkFilters() { + if (g_cs->subcaseStack.size() < size_t(g_cs->subcase_filter_levels)) { + if (!matchesAny(m_signature.m_name.c_str(), g_cs->filters[6], true, g_cs->case_sensitive)) + return true; + if (matchesAny(m_signature.m_name.c_str(), g_cs->filters[7], false, g_cs->case_sensitive)) + return true; + } + return false; + } - DOCTEST_ITERATE_THROUGH_REPORTERS(subcase_start, m_signature); + Subcase::Subcase(const String& name, const char* file, int line) + : m_signature({name, file, line}) { + if (!g_cs->reachedLeaf) { + if (g_cs->nextSubcaseStack.size() <= g_cs->subcaseStack.size() + || g_cs->nextSubcaseStack[g_cs->subcaseStack.size()] == m_signature) { + // Going down. + if (checkFilters()) { return; } + + g_cs->subcaseStack.push_back(m_signature); + g_cs->currentSubcaseDepth++; + m_entered = true; + DOCTEST_ITERATE_THROUGH_REPORTERS(subcase_start, m_signature); + } + } else { + if (g_cs->subcaseStack[g_cs->currentSubcaseDepth] == m_signature) { + // This subcase is reentered via control flow. 
+ g_cs->currentSubcaseDepth++; + m_entered = true; + DOCTEST_ITERATE_THROUGH_REPORTERS(subcase_start, m_signature); + } else if (g_cs->nextSubcaseStack.size() <= g_cs->currentSubcaseDepth + && g_cs->fullyTraversedSubcases.find(hash(hash(g_cs->subcaseStack, g_cs->currentSubcaseDepth), hash(m_signature))) + == g_cs->fullyTraversedSubcases.end()) { + if (checkFilters()) { return; } + // This subcase is part of the one to be executed next. + g_cs->nextSubcaseStack.clear(); + g_cs->nextSubcaseStack.insert(g_cs->nextSubcaseStack.end(), + g_cs->subcaseStack.begin(), g_cs->subcaseStack.begin() + g_cs->currentSubcaseDepth); + g_cs->nextSubcaseStack.push_back(m_signature); + } + } } DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4996) // std::uncaught_exception is deprecated in C++17 @@ -3957,25 +4291,33 @@ namespace detail { DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated-declarations") Subcase::~Subcase() { - if(m_entered) { - // only mark the subcase stack as passed if no subcases have been skipped - if(g_cs->should_reenter == false) - g_cs->subcasesPassed.insert(g_cs->subcasesStack); - g_cs->subcasesStack.pop_back(); + if (m_entered) { + g_cs->currentSubcaseDepth--; + + if (!g_cs->reachedLeaf) { + // Leaf. + g_cs->fullyTraversedSubcases.insert(hash(g_cs->subcaseStack)); + g_cs->nextSubcaseStack.clear(); + g_cs->reachedLeaf = true; + } else if (g_cs->nextSubcaseStack.empty()) { + // All children are finished. 
+ g_cs->fullyTraversedSubcases.insert(hash(g_cs->subcaseStack)); + } #if defined(__cpp_lib_uncaught_exceptions) && __cpp_lib_uncaught_exceptions >= 201411L && (!defined(__MAC_OS_X_VERSION_MIN_REQUIRED) || __MAC_OS_X_VERSION_MIN_REQUIRED >= 101200) if(std::uncaught_exceptions() > 0 #else if(std::uncaught_exception() #endif - && g_cs->shouldLogCurrentException) { + && g_cs->shouldLogCurrentException) { DOCTEST_ITERATE_THROUGH_REPORTERS( test_case_exception, {"exception thrown in subcase - will translate later " - "when the whole test case has been exited (cannot " - "translate while there is an active exception)", - false}); + "when the whole test case has been exited (cannot " + "translate while there is an active exception)", + false}); g_cs->shouldLogCurrentException = false; } + DOCTEST_ITERATE_THROUGH_REPORTERS(subcase_end, DOCTEST_EMPTY); } } @@ -3999,7 +4341,7 @@ namespace detail { } TestCase::TestCase(funcType test, const char* file, unsigned line, const TestSuite& test_suite, - const char* type, int template_id) { + const String& type, int template_id) { m_file = file; m_line = line; m_name = nullptr; // will be later overridden in operator* @@ -4024,10 +4366,8 @@ namespace detail { } DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(26434) // hides a non-virtual function - DOCTEST_MSVC_SUPPRESS_WARNING(26437) // Do not slice TestCase& TestCase::operator=(const TestCase& other) { - static_cast(*this) = static_cast(other); - + TestCaseData::operator=(other); m_test = other.m_test; m_type = other.m_type; m_template_id = other.m_template_id; @@ -4043,7 +4383,7 @@ namespace detail { m_name = in; // make a new name with an appended type for templated test case if(m_template_id != -1) { - m_full_name = String(m_name) + m_type; + m_full_name = String(m_name) + "<" + m_type + ">"; // redirect the name to point to the newly constructed full name m_name = m_full_name.c_str(); } @@ -4099,35 +4439,8 @@ namespace { return suiteOrderComparator(lhs, rhs); } -#ifdef 
DOCTEST_CONFIG_COLORS_WINDOWS - HANDLE g_stdoutHandle; - WORD g_origFgAttrs; - WORD g_origBgAttrs; - bool g_attrsInited = false; - - int colors_init() { - if(!g_attrsInited) { - g_stdoutHandle = GetStdHandle(STD_OUTPUT_HANDLE); - g_attrsInited = true; - CONSOLE_SCREEN_BUFFER_INFO csbiInfo; - GetConsoleScreenBufferInfo(g_stdoutHandle, &csbiInfo); - g_origFgAttrs = csbiInfo.wAttributes & ~(BACKGROUND_GREEN | BACKGROUND_RED | - BACKGROUND_BLUE | BACKGROUND_INTENSITY); - g_origBgAttrs = csbiInfo.wAttributes & ~(FOREGROUND_GREEN | FOREGROUND_RED | - FOREGROUND_BLUE | FOREGROUND_INTENSITY); - } - return 0; - } - - int dummy_init_console_colors = colors_init(); -#endif // DOCTEST_CONFIG_COLORS_WINDOWS - DOCTEST_CLANG_SUPPRESS_WARNING_WITH_PUSH("-Wdeprecated-declarations") void color_to_stream(std::ostream& s, Color::Enum code) { -// TODO: upstream the change to doctest : suppress unused variable warning -#if defined(DOCTEST_CONFIG_COLORS_WINDOWS) && DOCTEST_ICC - static_cast(dummy_init_console_colors); -#endif static_cast(s); // for DOCTEST_CONFIG_COLORS_NONE or DOCTEST_CONFIG_COLORS_WINDOWS static_cast(code); // for DOCTEST_CONFIG_COLORS_NONE #ifdef DOCTEST_CONFIG_COLORS_ANSI @@ -4162,7 +4475,23 @@ namespace { (_isatty(_fileno(stdout)) == false && getContextOptions()->force_colors == false)) return; -#define DOCTEST_SET_ATTR(x) SetConsoleTextAttribute(g_stdoutHandle, x | g_origBgAttrs) + static struct ConsoleHelper { + HANDLE stdoutHandle; + WORD origFgAttrs; + WORD origBgAttrs; + + ConsoleHelper() { + stdoutHandle = GetStdHandle(STD_OUTPUT_HANDLE); + CONSOLE_SCREEN_BUFFER_INFO csbiInfo; + GetConsoleScreenBufferInfo(stdoutHandle, &csbiInfo); + origFgAttrs = csbiInfo.wAttributes & ~(BACKGROUND_GREEN | BACKGROUND_RED | + BACKGROUND_BLUE | BACKGROUND_INTENSITY); + origBgAttrs = csbiInfo.wAttributes & ~(FOREGROUND_GREEN | FOREGROUND_RED | + FOREGROUND_BLUE | FOREGROUND_INTENSITY); + } + } ch; + +#define DOCTEST_SET_ATTR(x) SetConsoleTextAttribute(ch.stdoutHandle, x | 
ch.origBgAttrs) // clang-format off switch (code) { @@ -4179,7 +4508,7 @@ namespace { case Color::BrightWhite: DOCTEST_SET_ATTR(FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE); break; case Color::None: case Color::Bright: // invalid - default: DOCTEST_SET_ATTR(g_origFgAttrs); + default: DOCTEST_SET_ATTR(ch.origFgAttrs); } // clang-format on #endif // DOCTEST_CONFIG_COLORS_WINDOWS @@ -4296,34 +4625,13 @@ namespace detail { getExceptionTranslators().push_back(et); } -#ifdef DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING - void toStream(std::ostream* s, char* in) { *s << in; } - void toStream(std::ostream* s, const char* in) { *s << in; } -#endif // DOCTEST_CONFIG_TREAT_CHAR_STAR_AS_STRING - void toStream(std::ostream* s, bool in) { *s << std::boolalpha << in << std::noboolalpha; } - void toStream(std::ostream* s, float in) { *s << in; } - void toStream(std::ostream* s, double in) { *s << in; } - void toStream(std::ostream* s, double long in) { *s << in; } - - void toStream(std::ostream* s, char in) { *s << in; } - void toStream(std::ostream* s, char signed in) { *s << in; } - void toStream(std::ostream* s, char unsigned in) { *s << in; } - void toStream(std::ostream* s, int short in) { *s << in; } - void toStream(std::ostream* s, int short unsigned in) { *s << in; } - void toStream(std::ostream* s, int in) { *s << in; } - void toStream(std::ostream* s, int unsigned in) { *s << in; } - void toStream(std::ostream* s, int long in) { *s << in; } - void toStream(std::ostream* s, int long unsigned in) { *s << in; } - void toStream(std::ostream* s, int long long in) { *s << in; } - void toStream(std::ostream* s, int long long unsigned in) { *s << in; } - DOCTEST_THREAD_LOCAL doctest_thread_local_wrapper> wrapped_g_infoContexts; // for logging with INFO() ContextScopeBase::ContextScopeBase() { wrapped_g_infoContexts.get().push_back(this); } - ContextScopeBase::ContextScopeBase(ContextScopeBase&& other) { + 
ContextScopeBase::ContextScopeBase(ContextScopeBase&& other) noexcept { if (other.need_to_destroy) { other.destroy(); } @@ -4393,10 +4701,10 @@ namespace { static LONG CALLBACK handleException(PEXCEPTION_POINTERS ExceptionInfo) { // Multiple threads may enter this filter/handler at once. We want the error message to be printed on the // console just once no matter how many threads have crashed. - static std::mutex mutex; + DOCTEST_DECLARE_STATIC_MUTEX(mutex) static bool execute = true; { - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) if(execute) { bool reported = false; for(size_t i = 0; i < DOCTEST_COUNTOF(signalDefs); ++i) { @@ -4569,7 +4877,7 @@ namespace { sigStack.ss_flags = 0; sigaltstack(&sigStack, &oldSigStack); struct sigaction sa = {}; - sa.sa_handler = handleSignal; // NOLINT + sa.sa_handler = handleSignal; sa.sa_flags = SA_ONSTACK; for(std::size_t i = 0; i < DOCTEST_COUNTOF(signalDefs); ++i) { sigaction(signalDefs[i].id, &sa, &oldSigActions[i]); @@ -4608,7 +4916,7 @@ namespace { #define DOCTEST_OUTPUT_DEBUG_STRING(text) ::OutputDebugStringA(text) #else // TODO: integration with XCode and other IDEs -#define DOCTEST_OUTPUT_DEBUG_STRING(text) // NOLINT(clang-diagnostic-unused-macros) +#define DOCTEST_OUTPUT_DEBUG_STRING(text) #endif // Platform void addAssert(assertType::Enum at) { @@ -4627,8 +4935,8 @@ namespace { DOCTEST_ITERATE_THROUGH_REPORTERS(test_case_exception, {message.c_str(), true}); - while(g_cs->subcasesStack.size()) { - g_cs->subcasesStack.pop_back(); + while (g_cs->subcaseStack.size()) { + g_cs->subcaseStack.pop_back(); DOCTEST_ITERATE_THROUGH_REPORTERS(subcase_end, DOCTEST_EMPTY); } @@ -4640,25 +4948,26 @@ namespace { } #endif // DOCTEST_CONFIG_POSIX_SIGNALS || DOCTEST_CONFIG_WINDOWS_SEH } // namespace -namespace detail { - ResultBuilder::ResultBuilder(assertType::Enum at, const char* file, int line, const char* expr, - const char* exception_type, const char* exception_string) { - m_test_case = g_cs->currentTest; - m_at = at; - 
m_file = file; - m_line = line; - m_expr = expr; - m_failed = true; - m_threw = false; - m_threw_as = false; - m_exception_type = exception_type; - m_exception_string = exception_string; +AssertData::AssertData(assertType::Enum at, const char* file, int line, const char* expr, + const char* exception_type, const StringContains& exception_string) + : m_test_case(g_cs->currentTest), m_at(at), m_file(file), m_line(line), m_expr(expr), + m_failed(true), m_threw(false), m_threw_as(false), m_exception_type(exception_type), + m_exception_string(exception_string) { #if DOCTEST_MSVC - if(m_expr[0] == ' ') // this happens when variadic macros are disabled under MSVC - ++m_expr; + if (m_expr[0] == ' ') // this happens when variadic macros are disabled under MSVC + ++m_expr; #endif // MSVC - } +} + +namespace detail { + ResultBuilder::ResultBuilder(assertType::Enum at, const char* file, int line, const char* expr, + const char* exception_type, const String& exception_string) + : AssertData(at, file, line, expr, exception_type, exception_string) { } + + ResultBuilder::ResultBuilder(assertType::Enum at, const char* file, int line, const char* expr, + const char* exception_type, const Contains& exception_string) + : AssertData(at, file, line, expr, exception_type, exception_string) { } void ResultBuilder::setResult(const Result& res) { m_decomp = res.m_decomp; @@ -4674,11 +4983,11 @@ namespace detail { if(m_at & assertType::is_throws) { //!OCLINT bitwise operator in conditional m_failed = !m_threw; } else if((m_at & assertType::is_throws_as) && (m_at & assertType::is_throws_with)) { //!OCLINT - m_failed = !m_threw_as || (m_exception != m_exception_string); + m_failed = !m_threw_as || !m_exception_string.check(m_exception); } else if(m_at & assertType::is_throws_as) { //!OCLINT bitwise operator in conditional m_failed = !m_threw_as; } else if(m_at & assertType::is_throws_with) { //!OCLINT bitwise operator in conditional - m_failed = m_exception != m_exception_string; + m_failed = 
!m_exception_string.check(m_exception); } else if(m_at & assertType::is_nothrow) { //!OCLINT bitwise operator in conditional m_failed = m_threw; } @@ -4712,8 +5021,8 @@ namespace detail { std::abort(); } - void decomp_assert(assertType::Enum at, const char* file, int line, const char* expr, - Result result) { + bool decomp_assert(assertType::Enum at, const char* file, int line, const char* expr, + const Result& result) { bool failed = !result.m_passed; // ################################################################################### @@ -4722,21 +5031,29 @@ namespace detail { // ################################################################################### DOCTEST_ASSERT_OUT_OF_TESTS(result.m_decomp); DOCTEST_ASSERT_IN_TESTS(result.m_decomp); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) + return !failed; } MessageBuilder::MessageBuilder(const char* file, int line, assertType::Enum severity) { - m_stream = getTlsOss(); + m_stream = tlssPush(); m_file = file; m_line = line; m_severity = severity; } - IExceptionTranslator::IExceptionTranslator() = default; - IExceptionTranslator::~IExceptionTranslator() = default; + MessageBuilder::~MessageBuilder() { + if (!logged) + tlssPop(); + } + + DOCTEST_DEFINE_INTERFACE(IExceptionTranslator) bool MessageBuilder::log() { - m_string = getTlsOssResult(); + if (!logged) { + m_string = tlssPop(); + logged = true; + } + DOCTEST_ITERATE_THROUGH_REPORTERS(log_message, *this); const bool isWarn = m_severity & assertType::is_warn; @@ -4755,29 +5072,10 @@ namespace detail { if(m_severity & assertType::is_require) //!OCLINT bitwise operator in conditional throwException(); } - - MessageBuilder::~MessageBuilder() = default; } // namespace detail namespace { using namespace detail; - template - DOCTEST_NORETURN void throw_exception(Ex const& e) { -#ifndef DOCTEST_CONFIG_NO_EXCEPTIONS - throw e; -#else // DOCTEST_CONFIG_NO_EXCEPTIONS - std::cerr << "doctest will terminate because it needed to throw an exception.\n" - 
<< "The message was: " << e.what() << '\n'; - std::terminate(); -#endif // DOCTEST_CONFIG_NO_EXCEPTIONS - } - -#ifndef DOCTEST_INTERNAL_ERROR -#define DOCTEST_INTERNAL_ERROR(msg) \ - throw_exception(std::logic_error( \ - __FILE__ ":" DOCTEST_TOSTR(__LINE__) ": Internal doctest error: " msg)) -#endif // DOCTEST_INTERNAL_ERROR - // clang-format off // ================================================================================================= @@ -4824,7 +5122,11 @@ namespace { mutable XmlWriter* m_writer = nullptr; }; +#ifndef DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM XmlWriter( std::ostream& os = std::cout ); +#else // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM + XmlWriter( std::ostream& os ); +#endif // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM ~XmlWriter(); XmlWriter( XmlWriter const& ) = delete; @@ -4859,10 +5161,10 @@ namespace { void ensureTagClosed(); - private: - void writeDeclaration(); + private: + void newlineIfNecessary(); bool m_tagIsOpen = false; @@ -5051,7 +5353,7 @@ namespace { XmlWriter::XmlWriter( std::ostream& os ) : m_os( os ) { - writeDeclaration(); + // writeDeclaration(); // called explicitly by the reporters that use the writer class - see issue #627 } XmlWriter::~XmlWriter() { @@ -5162,8 +5464,8 @@ namespace { struct XmlReporter : public IReporter { - XmlWriter xml; - std::mutex mutex; + XmlWriter xml; + DOCTEST_DECLARE_MUTEX(mutex) // caching pointers/references to objects of these types - safe to do const ContextOptions& opt; @@ -5257,6 +5559,8 @@ namespace { } void test_run_start() override { + xml.writeDeclaration(); + // remove .exe extension - mainly to have the same output on UNIX and Windows std::string binary_name = skipPathFromFilename(opt.binary_name.c_str()); #ifdef DOCTEST_PLATFORM_WINDOWS @@ -5323,7 +5627,7 @@ namespace { } void test_case_exception(const TestCaseException& e) override { - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) xml.scopedElement("Exception") .writeAttribute("crash", e.is_crash) @@ -5344,7 +5648,7 @@ namespace { 
if(!rb.m_failed && !opt.success) return; - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) xml.startElement("Expression") .writeAttribute("success", !rb.m_failed) @@ -5360,7 +5664,7 @@ namespace { if(rb.m_at & assertType::is_throws_as) xml.scopedElement("ExpectedException").writeText(rb.m_exception_type); if(rb.m_at & assertType::is_throws_with) - xml.scopedElement("ExpectedExceptionString").writeText(rb.m_exception_string); + xml.scopedElement("ExpectedExceptionString").writeText(rb.m_exception_string.c_str()); if((rb.m_at & assertType::is_normal) && !rb.m_threw) xml.scopedElement("Expanded").writeText(rb.m_decomp.c_str()); @@ -5370,7 +5674,7 @@ namespace { } void log_message(const MessageData& mb) override { - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) xml.startElement("Message") .writeAttribute("type", failureString(mb.m_severity)) @@ -5406,7 +5710,8 @@ namespace { } else if((rb.m_at & assertType::is_throws_as) && (rb.m_at & assertType::is_throws_with)) { //!OCLINT s << Color::Cyan << assertString(rb.m_at) << "( " << rb.m_expr << ", \"" - << rb.m_exception_string << "\", " << rb.m_exception_type << " ) " << Color::None; + << rb.m_exception_string.c_str() + << "\", " << rb.m_exception_type << " ) " << Color::None; if(rb.m_threw) { if(!rb.m_failed) { s << "threw as expected!\n"; @@ -5427,7 +5732,8 @@ namespace { } else if(rb.m_at & assertType::is_throws_with) { //!OCLINT bitwise operator in conditional s << Color::Cyan << assertString(rb.m_at) << "( " << rb.m_expr << ", \"" - << rb.m_exception_string << "\" ) " << Color::None + << rb.m_exception_string.c_str() + << "\" ) " << Color::None << (rb.m_threw ? (!rb.m_failed ? "threw as expected!" 
: "threw a DIFFERENT exception: ") : "did NOT throw at all!") @@ -5452,8 +5758,8 @@ namespace { // - more attributes in tags struct JUnitReporter : public IReporter { - XmlWriter xml; - std::mutex mutex; + XmlWriter xml; + DOCTEST_DECLARE_MUTEX(mutex) Timer timer; std::vector deepestSubcaseStackNames; @@ -5549,9 +5855,13 @@ namespace { // WHAT FOLLOWS ARE OVERRIDES OF THE VIRTUAL METHODS OF THE REPORTER INTERFACE // ========================================================================================= - void report_query(const QueryData&) override {} + void report_query(const QueryData&) override { + xml.writeDeclaration(); + } - void test_run_start() override {} + void test_run_start() override { + xml.writeDeclaration(); + } void test_run_end(const TestRunStats& p) override { // remove .exe extension - mainly to have the same output on UNIX and Windows @@ -5621,7 +5931,7 @@ namespace { } void test_case_exception(const TestCaseException& e) override { - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) testCaseData.addError("exception", e.error_string.c_str()); } @@ -5635,7 +5945,7 @@ namespace { if(!rb.m_failed) // report only failures & ignore the `success` option return; - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) std::ostringstream os; os << skipPathFromFilename(rb.m_file) << (opt.gnu_file_line ? ":" : "(") @@ -5646,7 +5956,22 @@ namespace { testCaseData.addFailure(rb.m_decomp.c_str(), assertString(rb.m_at), os.str()); } - void log_message(const MessageData&) override {} + void log_message(const MessageData& mb) override { + if(mb.m_severity & assertType::is_warn) // report only failures + return; + + DOCTEST_LOCK_MUTEX(mutex) + + std::ostringstream os; + os << skipPathFromFilename(mb.m_file) << (opt.gnu_file_line ? ":" : "(") + << line(mb.m_line) << (opt.gnu_file_line ? 
":" : "):") << std::endl; + + os << mb.m_string.c_str() << "\n"; + log_contexts(os); + + testCaseData.addFailure(mb.m_string.c_str(), + mb.m_severity & assertType::is_check ? "FAIL_CHECK" : "FAIL", os.str()); + } void test_case_skipped(const TestCaseData&) override {} @@ -5686,7 +6011,7 @@ namespace { bool hasLoggedCurrentTestStart; std::vector subcasesStack; size_t currentSubcaseLevel; - std::mutex mutex; + DOCTEST_DECLARE_MUTEX(mutex) // caching pointers/references to objects of these types - safe to do const ContextOptions& opt; @@ -5986,9 +6311,9 @@ namespace { separator_to_stream(); s << std::dec; - auto totwidth = int(std::ceil(log10((std::max(p.numTestCasesPassingFilters, static_cast(p.numAsserts))) + 1))); - auto passwidth = int(std::ceil(log10((std::max(p.numTestCasesPassingFilters - p.numTestCasesFailed, static_cast(p.numAsserts - p.numAssertsFailed))) + 1))); - auto failwidth = int(std::ceil(log10((std::max(p.numTestCasesFailed, static_cast(p.numAssertsFailed))) + 1))); + auto totwidth = int(std::ceil(log10(static_cast(std::max(p.numTestCasesPassingFilters, static_cast(p.numAsserts))) + 1))); + auto passwidth = int(std::ceil(log10(static_cast(std::max(p.numTestCasesPassingFilters - p.numTestCasesFailed, static_cast(p.numAsserts - p.numAssertsFailed))) + 1))); + auto failwidth = int(std::ceil(log10(static_cast(std::max(p.numTestCasesFailed, static_cast(p.numAssertsFailed))) + 1))); const bool anythingFailed = p.numTestCasesFailed > 0 || p.numAssertsFailed > 0; s << Color::Cyan << "[doctest] " << Color::None << "test cases: " << std::setw(totwidth) << p.numTestCasesPassingFilters << " | " @@ -6032,7 +6357,7 @@ namespace { // log the preamble of the test case only if there is something // else to print - something other than that an assert has failed if(opt.duration || - (st.failure_flags && st.failure_flags != TestCaseFailureReason::AssertFailure)) + (st.failure_flags && st.failure_flags != static_cast(TestCaseFailureReason::AssertFailure))) 
logTestStart(); if(opt.duration) @@ -6063,7 +6388,7 @@ namespace { } void test_case_exception(const TestCaseException& e) override { - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) if(tc->m_no_output) return; @@ -6102,7 +6427,7 @@ namespace { if((!rb.m_failed && !opt.success) || tc->m_no_output) return; - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) logTestStart(); @@ -6118,7 +6443,7 @@ namespace { if(tc->m_no_output) return; - std::lock_guard lock(mutex); + DOCTEST_LOCK_MUTEX(mutex) logTestStart(); @@ -6246,8 +6571,8 @@ namespace { char character = *current++; if(seenBackslash) { seenBackslash = false; - if(character == ',') { - s.put(','); + if(character == ',' || character == '\\') { + s.put(character); continue; } s.put('\\'); @@ -6283,30 +6608,30 @@ namespace { if(!parseOption(argc, argv, pattern, &parsedValue)) return false; - if(type == 0) { + if(type) { + // integer + // TODO: change this to use std::stoi or something else! currently it uses undefined behavior - assumes '0' on failed parse... 
+ int theInt = std::atoi(parsedValue.c_str()); + if (theInt != 0) { + res = theInt; //!OCLINT parameter reassignment + return true; + } + } else { // boolean - const char positive[][5] = {"1", "true", "on", "yes"}; // 5 - strlen("true") + 1 - const char negative[][6] = {"0", "false", "off", "no"}; // 6 - strlen("false") + 1 + const char positive[][5] = { "1", "true", "on", "yes" }; // 5 - strlen("true") + 1 + const char negative[][6] = { "0", "false", "off", "no" }; // 6 - strlen("false") + 1 // if the value matches any of the positive/negative possibilities - for(unsigned i = 0; i < 4; i++) { - if(parsedValue.compare(positive[i], true) == 0) { + for (unsigned i = 0; i < 4; i++) { + if (parsedValue.compare(positive[i], true) == 0) { res = 1; //!OCLINT parameter reassignment return true; } - if(parsedValue.compare(negative[i], true) == 0) { + if (parsedValue.compare(negative[i], true) == 0) { res = 0; //!OCLINT parameter reassignment return true; } } - } else { - // integer - // TODO: change this to use std::stoi or something else! currently it uses undefined behavior - assumes '0' on failed parse... 
- int theInt = std::atoi(parsedValue.c_str()); // NOLINT - if(theInt != 0) { - res = theInt; //!OCLINT parameter reassignment - return true; - } } return false; } @@ -6474,7 +6799,6 @@ void Context::setOption(const char* option, bool value) { // allows the user to override procedurally the int options from the command line void Context::setOption(const char* option, int value) { setOption(option, toString(value).c_str()); - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) } // allows the user to override procedurally the string options from the command line @@ -6538,8 +6862,12 @@ int Context::run() { fstr.open(p->out.c_str(), std::fstream::out); p->cout = &fstr; } else { +#ifndef DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM // stdout by default p->cout = &std::cout; +#else // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM + return EXIT_FAILURE; +#endif // DOCTEST_CONFIG_NO_INCLUDE_IOSTREAM } } @@ -6612,7 +6940,7 @@ int Context::run() { // random_shuffle implementation const auto first = &testArray[0]; for(size_t i = testArray.size() - 1; i > 0; --i) { - int idxToSwap = std::rand() % (i + 1); // NOLINT + int idxToSwap = std::rand() % (i + 1); const auto temp = first[i]; @@ -6699,7 +7027,7 @@ int Context::run() { p->numAssertsFailedCurrentTest_atomic = 0; p->numAssertsCurrentTest_atomic = 0; - p->subcasesPassed.clear(); + p->fullyTraversedSubcases.clear(); DOCTEST_ITERATE_THROUGH_REPORTERS(test_case_start, tc); @@ -6709,9 +7037,10 @@ int Context::run() { do { // reset some of the fields for subcases (except for the set of fully passed ones) - p->should_reenter = false; - p->subcasesCurrentMaxLevel = 0; - p->subcasesStack.clear(); + p->reachedLeaf = false; + // May not be empty if previous subcase exited via exception. 
+ p->subcaseStack.clear(); + p->currentSubcaseDepth = 0; p->shouldLogCurrentException = true; @@ -6745,9 +7074,9 @@ DOCTEST_MSVC_SUPPRESS_WARNING_POP p->failure_flags |= TestCaseFailureReason::TooManyFailedAsserts; } - if(p->should_reenter && run_test) + if(!p->nextSubcaseStack.empty() && run_test) DOCTEST_ITERATE_THROUGH_REPORTERS(test_case_reenter, tc); - if(!p->should_reenter) + if(p->nextSubcaseStack.empty()) run_test = false; } while(run_test); @@ -6773,17 +7102,10 @@ DOCTEST_MSVC_SUPPRESS_WARNING_POP DOCTEST_ITERATE_THROUGH_REPORTERS(report_query, qdata); } - // see these issues on the reasoning for this: - // - https://github.com/onqtam/doctest/issues/143#issuecomment-414418903 - // - https://github.com/onqtam/doctest/issues/126 - auto DOCTEST_FIX_FOR_MACOS_LIBCPP_IOSFWD_STRING_LINK_ERRORS = []() DOCTEST_NOINLINE - { std::cout << std::string(); }; - DOCTEST_FIX_FOR_MACOS_LIBCPP_IOSFWD_STRING_LINK_ERRORS(); - return cleanup_and_return(); } -IReporter::~IReporter() = default; +DOCTEST_DEFINE_INTERFACE(IReporter) int IReporter::get_num_active_contexts() { return detail::wrapped_g_infoContexts.get().size(); } const IContextScope* const* IReporter::get_active_contexts() { @@ -6818,5 +7140,17 @@ DOCTEST_CLANG_SUPPRESS_WARNING_POP DOCTEST_MSVC_SUPPRESS_WARNING_POP DOCTEST_GCC_SUPPRESS_WARNING_POP +DOCTEST_SUPPRESS_COMMON_WARNINGS_POP + #endif // DOCTEST_LIBRARY_IMPLEMENTATION #endif // DOCTEST_CONFIG_IMPLEMENT + +#ifdef DOCTEST_UNDEF_WIN32_LEAN_AND_MEAN +#undef WIN32_LEAN_AND_MEAN +#undef DOCTEST_UNDEF_WIN32_LEAN_AND_MEAN +#endif // DOCTEST_UNDEF_WIN32_LEAN_AND_MEAN + +#ifdef DOCTEST_UNDEF_NOMINMAX +#undef NOMINMAX +#undef DOCTEST_UNDEF_NOMINMAX +#endif // DOCTEST_UNDEF_NOMINMAX diff --git a/third-party/tbb/test/common/test_invoke.h b/third-party/tbb/test/common/test_invoke.h new file mode 100644 index 00000000..42b4b2b7 --- /dev/null +++ b/third-party/tbb/test/common/test_invoke.h @@ -0,0 +1,145 @@ +/* + Copyright (c) 2023 Intel Corporation + + Licensed under the 
Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_test_common_test_invoke_H +#define __TBB_test_common_test_invoke_H + +#include "test.h" +#include "oneapi/tbb/flow_graph.h" +#include "oneapi/tbb/blocked_range.h" + +#if __TBB_CPP17_INVOKE_PRESENT +namespace test_invoke { + +// Can be customized +template +std::size_t get_real_index(const T& obj) { + return obj; +} + +template +class SmartRange : public oneapi::tbb::blocked_range { + using base_range = oneapi::tbb::blocked_range; +public: + SmartRange(const Value& first, const Value& last) : base_range(first, last), change_vector(nullptr) {} + SmartRange(const Value& first, const Value& last, std::vector& cv) + : base_range(first, last), change_vector(&cv) {} + + SmartRange(const SmartRange&) = default; + SmartRange(SmartRange& other, oneapi::tbb::split) + : base_range(other, oneapi::tbb::split{}), change_vector(other.change_vector) {} + + void increase() const { + CHECK_MESSAGE(change_vector, "Attempt to operate with no associated vector"); + for (std::size_t index = get_real_index(this->begin()); index != get_real_index(this->end()); ++index) { + ++(*change_vector)[index]; + } + } + + Value reduction(const Value& idx) const { + Value result = idx; + for (std::size_t index = get_real_index(this->begin()); index != get_real_index(this->end()); ++index) { + result = result + Value(index); + } + return Value(result); + } + + Value scan(const Value& idx, bool is_final_scan) const { + CHECK_MESSAGE(change_vector, "Attempt to operate with 
no associated vector"); + Value result = idx; + for (std::size_t index = get_real_index(this->begin()); index != get_real_index(this->end()); ++index) { + result = result + Value(index); + if (is_final_scan) (*change_vector)[index] = get_real_index(result); + } + return result; + } +private: + std::vector* change_vector; +}; + +template +class SmartID { +public: + SmartID() : id(999), operate_signal_point(nullptr) {} + SmartID(std::size_t* sp) : id(999), operate_signal_point(sp) {} + + SmartID(const IDType& n) : id(n), operate_signal_point(nullptr) {} + SmartID(const IDType& n, std::size_t* sp) : id(n), operate_signal_point(sp) {} + + IDType get_id() const { return id; } + const IDType& get_id_ref() const { return id; } + +private: + template + void send_id_impl(TupleOfPorts& ports, std::index_sequence) const { + (std::get(ports).try_put(id) , ...); + } +public: + template + void send_id(TupleOfPorts& ports) const { + send_id_impl(ports, std::make_index_sequence::value>()); + } + + template + void send_id_to_gateway(GatewayType& gateway) const { + gateway.reserve_wait(); + gateway.try_put(id); + gateway.release_wait(); + } + + void operate() const { + CHECK_MESSAGE(operate_signal_point, "incorrect test setup"); + ++(*operate_signal_point); + } + + IDType id; +private: + std::size_t* operate_signal_point; +}; + +class SmartValue { +public: + SmartValue(std::size_t rv) : real_value(rv) {} + SmartValue(const SmartValue&) = default; + SmartValue& operator=(const SmartValue&) = default; + + SmartValue operator+(const SmartValue& other) const { + return SmartValue{real_value + other.real_value}; + } + std::size_t operator-(const SmartValue& other) const { + return real_value - other.real_value; + } + + std::size_t get() const { return real_value; } + + bool operator<(const SmartValue& other) const { + return real_value < other.real_value; + } + + SmartValue& operator++() { ++real_value; return *this; } +private: + std::size_t real_value; +}; + +std::size_t 
get_real_index(const SmartValue& value) { + return value.get(); +} + + +} // namespace test_invoke + +#endif // __TBB_CPP17_INVOKE_PRESENT +#endif // __TBB_test_common_test_invoke_H diff --git a/third-party/tbb/test/common/utils_assert.h b/third-party/tbb/test/common/utils_assert.h index 1df8ae72..0123ab88 100644 --- a/third-party/tbb/test/common/utils_assert.h +++ b/third-party/tbb/test/common/utils_assert.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include "config.h" #include "utils_report.h" +#include + #define REPORT_FATAL_ERROR REPORT namespace utils { diff --git a/third-party/tbb/test/conformance/conformance_async_node.cpp b/third-party/tbb/test/conformance/conformance_async_node.cpp index ce8594f6..486e61d5 100644 --- a/third-party/tbb/test/conformance/conformance_async_node.cpp +++ b/third-party/tbb/test/conformance/conformance_async_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #endif #include "conformance_flowgraph.h" +#include "common/test_invoke.h" //! \file conformance_async_node.cpp //! \brief Test for [flow_graph.async_node] specification @@ -82,7 +83,7 @@ TEST_CASE("async_node broadcast"){ conformance::test_forwarding, input_msg, int>(1, oneapi::tbb::flow::unlimited, fun); } -//! Test async_node has a user-settable concurrency limit. It can be set to one of predefined values. +//! Test async_node has a user-settable concurrency limit. It can be set to one of predefined values. //! The user can also provide a value of type std::size_t to limit concurrency. //! Test that not more than limited threads works in parallel. //! 
\brief \ref requirement @@ -136,3 +137,32 @@ TEST_CASE("async_node with rejecting policy"){ CHECK_MESSAGE((flag.load()), "The body of assync_node must submits the messages to an external activity for processing outside of the graph"); thr.join(); } + +#if __TBB_CPP17_INVOKE_PRESENT +//! Test that async_node uses std::invoke to run the body +//! \brief \ref requirement +TEST_CASE("async_node and std::invoke") { + using namespace oneapi::tbb::flow; + + using start_node_type = function_node>; + using async_node_type = async_node, std::size_t>; + + auto async_body = &test_invoke::SmartID::template send_id_to_gateway; + + graph g; + start_node_type starter(g, serial, [](std::size_t i) -> test_invoke::SmartID { return {i}; }); + async_node_type activity_submitter(g, serial, async_body); + buffer_node buf(g); + + make_edge(starter, activity_submitter); + make_edge(activity_submitter, buf); + + starter.try_put(1); + + g.wait_for_all(); + std::size_t result = 0; + CHECK(buf.try_get(result)); + CHECK(result == 1); + CHECK(!buf.try_get(result)); +} +#endif diff --git a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp index ef790f39..10db09fb 100644 --- a/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp +++ b/third-party/tbb/test/conformance/conformance_concurrent_queue.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -1677,3 +1677,209 @@ TEST_CASE("Test with minimalistic object type") { test_with_minimalistic_objects(); test_with_minimalistic_objects(); } + +//TODO: Once support for std::allocator_traits::propagate_on_container_* is implemented, +// most of the 4 test cases below can be replaced with move_support_tests::test_*. 
+ +template +void test_queue_helper() { + int size = 5; + typename CQ::value_type vec_1(size, 0), vec_2(size, 0), vec_3(size, 0), vec_4(size, 0); + srand(static_cast(time(0))); + generate(vec_1.begin(), vec_1.end(), rand); + generate(vec_2.begin(), vec_2.end(), rand); + generate(vec_3.begin(), vec_3.end(), rand); + generate(vec_4.begin(), vec_4.end(), rand); + + CQ q1, q2, q3; + q3 = {vec_4, vec_2, vec_3}; + CQ q4({vec_1, vec_2, vec_3}); + + q1 = q3; + q2 = std::move(q3); + CHECK(q3.empty()); + + CHECK(q1 != q4); + q1.swap(q4); + CHECK(q2 == q4); + + swap(q2, q3); + CHECK(q2.empty()); + CHECK(q3 == q4); +} + +//! Test assignment (copy/move/initializer_list) and swapping +//! \brief \ref interface \ref requirement +TEST_CASE("testing assignment and swapping") { + test_queue_helper>>(); + test_queue_helper>>(); +} + +template +void TestMoveQueue() { + using allocator_type = typename QueueType::allocator_type; + + QueueType q1, q2; + move_support_tests::Foo obj; + size_t n1(15), n2(7); + + allocator_type::init_counters(); + for(size_t i =0; i < n1; i++) + q1.push(obj); + size_t q1_items_constructed = allocator_type::items_constructed; + size_t q1_items_allocated = allocator_type::items_allocated; + + allocator_type::init_counters(); + for(size_t i =0; i < n2; i++) + q2.push(obj); + size_t q2_items_allocated = allocator_type::items_allocated; + + allocator_type::init_counters(); + q1 = std::move(q2); + + CHECK(q1_items_allocated == allocator_type::items_freed); + CHECK(q1_items_constructed == allocator_type::items_destroyed); + CHECK(q2_items_allocated >= allocator_type::items_allocated); +} + +//! move assignment test for equal counting allocator +//! 
\brief \ref interface \ref requirement +TEST_CASE("testing move assignment with equal counting allocators") { + using allocator_type = StaticSharedCountingAllocator>; + TestMoveQueue>(); + TestMoveQueue>(); +} + +template +struct stateful_allocator { + typedef T value_type; + stateful_allocator() = default; + int state = 0; + template + constexpr stateful_allocator(const stateful_allocator& src) noexcept : state(src.state) {} + + T* allocate(std::size_t n) { + return static_cast(::operator new(n * sizeof(T))); + } + + void deallocate(T* p, std::size_t) noexcept { + ::operator delete(p); + } +}; + +template +bool operator==(const stateful_allocator& lhs, const stateful_allocator& rhs) { return lhs.state == rhs.state; } + +template +bool operator!=(const stateful_allocator& lhs, const stateful_allocator& rhs) { return lhs.state != rhs.state; } + +template +void TestMoveQueueUnequal() { + using allocator_type = typename QueueType::allocator_type; + allocator_type alloc1, alloc2; + alloc1.state = 0; + alloc2.state = 1; + + QueueType q1(alloc1), q2(alloc2); + move_support_tests::Foo obj; + size_t n1(15), n2(7); + + allocator_type::init_counters(); + for(size_t i =0; i < n1; i++) + q1.push(obj); + + allocator_type::init_counters(); + for(size_t i =0; i < n2; i++) + q2.push(obj); + size_t q2_items_allocated = allocator_type::items_allocated; + + allocator_type::init_counters(); + q1 = std::move(q2); + + REQUIRE_MESSAGE(allocator_type::items_allocated == q2_items_allocated, "More than expected memory allocated?"); + REQUIRE_MESSAGE(std::all_of(q1.unsafe_begin(), q1.unsafe_end(), is_state_predicate()), + "Container did not move construct some elements"); + REQUIRE_MESSAGE(std::all_of(q2.unsafe_begin(), q2.unsafe_end(), is_state_predicate()), + "Container did not move all the elements"); +} + +//! move assignment test for unequal counting allocator +//! 
\brief \ref interface \ref requirement +TEST_CASE("testing move assignment with unequal counting allocators") { + using allocator_type = StaticSharedCountingAllocator>; + TestMoveQueueUnequal>(); + TestMoveQueueUnequal>(); +} + +template +void test_check_move_allocator(Container& src, Container& dst, Container& cpy) { + REQUIRE_MESSAGE(src.empty(), "Source didn't clear"); + REQUIRE_MESSAGE(std::equal(dst.unsafe_begin(), dst.unsafe_end(), cpy.unsafe_begin()), "Elements are not equal"); +} + +void test_move_assignment_test_equal() { + int n = 5; + std::vector vect1(n, 10), vect2(n,20), vect3(n, 30); + + tbb::concurrent_queue> src({vect1, vect2, vect3}); + tbb::concurrent_queue> dst(src.get_allocator()); + tbb::concurrent_queue> cpy(src.get_allocator()); + REQUIRE_MESSAGE(src.get_allocator() == dst.get_allocator(), "Incorrect test setup: allocators should be equal"); + cpy = src; + dst = std::move(src); + + tbb::concurrent_bounded_queue> src_bnd({vect1, vect2, vect3}); + tbb::concurrent_bounded_queue> dst_bnd(src_bnd.get_allocator()); + tbb::concurrent_bounded_queue> cpy_bnd(src_bnd.get_allocator()); + REQUIRE_MESSAGE(src_bnd.get_allocator() == dst_bnd.get_allocator(), "Incorrect test setup: allocators should be equal"); + cpy_bnd = src_bnd; + dst_bnd = std::move(src_bnd); + + test_check_move_allocator>>(src, dst, cpy); + REQUIRE_MESSAGE(cpy.unsafe_size() == dst.unsafe_size(), "Queues are not equal"); + + test_check_move_allocator>>(src_bnd, dst_bnd, cpy_bnd); + REQUIRE_MESSAGE(cpy_bnd.size() == dst_bnd.size(), "Queues are not equal"); +} + +void test_move_assignment_test_unequal() { + stateful_allocator src_alloc; + src_alloc.state = 0; + std::vector> v(8, 0, src_alloc); + tbb::concurrent_queue>, stateful_allocator> src(src_alloc); + + v.push_back(42); + v.push_back(82); + src.push(v); + src.push(v); + + stateful_allocator dst_alloc; + dst_alloc.state = 1; + tbb::concurrent_queue>, stateful_allocator> dst(dst_alloc); + tbb::concurrent_queue>, stateful_allocator> 
cpy(src_alloc); + REQUIRE_MESSAGE(src.get_allocator() != dst.get_allocator(), "Incorrect test setup: allocators should be unequal"); + cpy = src; + dst = std::move(src); + + tbb::concurrent_bounded_queue>, stateful_allocator> src_bnd(src_alloc); + tbb::concurrent_bounded_queue>, stateful_allocator> dst_bnd(dst_alloc); + tbb::concurrent_bounded_queue>, stateful_allocator> cpy_bnd(src_alloc); + REQUIRE_MESSAGE(src_bnd.get_allocator() != dst_bnd.get_allocator(), "Incorrect test setup: allocators should be unequal"); + src_bnd.push(v); + src_bnd.push(v); + cpy_bnd = src_bnd; + dst_bnd = std::move(src_bnd); + + test_check_move_allocator>, stateful_allocator>>(src, dst, cpy); + REQUIRE_MESSAGE(dst.unsafe_size() == cpy.unsafe_size(), "Queues are not equal"); + + test_check_move_allocator>, stateful_allocator>>(src_bnd, dst_bnd, cpy_bnd); + REQUIRE_MESSAGE(dst_bnd.size() == cpy_bnd.size(), "Queues are not equal"); +} + +//! move assignment test for equal and unequal allocator +//! \brief \ref interface \ref requirement +TEST_CASE("testing move assignment with equal and unequal allocators") { + test_move_assignment_test_equal(); + test_move_assignment_test_unequal(); +} diff --git a/third-party/tbb/test/conformance/conformance_function_node.cpp b/third-party/tbb/test/conformance/conformance_function_node.cpp index ab7cb13a..332558cf 100644 --- a/third-party/tbb/test/conformance/conformance_function_node.cpp +++ b/third-party/tbb/test/conformance/conformance_function_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #endif #include "conformance_flowgraph.h" +#include "common/test_invoke.h" using input_msg = conformance::message; using output_msg = conformance::message; @@ -76,6 +77,42 @@ void test_deduction_guides() { #endif +#if __TBB_CPP17_INVOKE_PRESENT + +template +void test_fn_invoke_basic(const Body1& body1, const Body2& body2) { + using namespace oneapi::tbb::flow; + + graph g; + + function_node f1(g, unlimited, body1); + function_node f2(g, unlimited, body2); + buffer_node buf(g); + + make_edge(f1, f2); + make_edge(f2, buf); + + f1.try_put(InputType{OutputType1{1}}); + + g.wait_for_all(); + + std::size_t result = 0; + CHECK(buf.try_get(result)); + CHECK(result == 1); + CHECK(!buf.try_get(result)); +} + +void test_fn_invoke() { + using output_type = test_invoke::SmartID; + using input_type = test_invoke::SmartID; + // Testing pointer to member function + test_fn_invoke_basic(&input_type::get_id, &output_type::get_id); + // Testing pointer to member object + test_fn_invoke_basic(&input_type::id, &output_type::id); +} +#endif // __TBB_CPP17_INVOKE_PRESENT + //! Test calling function body //! \brief \ref interface \ref requirement TEST_CASE("Test function_node body") { @@ -168,3 +205,11 @@ TEST_CASE("Test function_node Output and Input class") { using Body = conformance::copy_counting_object; conformance::test_output_input_class, Body>(); } + +#if __TBB_CPP17_INVOKE_PRESENT +//! Test that function_node uses std::invoke to execute the body +//! 
\brief \ref interface \ref requirement +TEST_CASE("Test function_node and std::invoke") { + test_fn_invoke(); +} +#endif diff --git a/third-party/tbb/test/conformance/conformance_join_node.cpp b/third-party/tbb/test/conformance/conformance_join_node.cpp index 532c9565..52623471 100644 --- a/third-party/tbb/test/conformance/conformance_join_node.cpp +++ b/third-party/tbb/test/conformance/conformance_join_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2022 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #endif #include "conformance_flowgraph.h" +#include "common/test_invoke.h" //! \file conformance_join_node.cpp //! \brief Test for [flow_graph.join_node] specification @@ -264,3 +265,53 @@ TEST_CASE("join_node output_ports") { CHECK_MESSAGE((std::is_same>::input_ports_type&, decltype(node.input_ports())>::value), "join_node input_ports should returns a tuple of input ports"); } + +#if __TBB_CPP17_INVOKE_PRESENT + +template +void test_invoke_basic(Body1 body1, Body2 body2) { + static_assert(std::is_same_v, std::size_t>, "incorrect test setup"); + using namespace oneapi::tbb::flow; + auto generator = [](std::size_t n) { return test_invoke::SmartID(n); }; + graph g; + + function_node> f1(g, unlimited, generator); + function_node> f2(g, unlimited, generator); + + using tuple_type = std::tuple, test_invoke::SmartID>; + using join_type = join_node>; + + + join_type j(g, body1, body2); + + buffer_node buf(g); + + make_edge(f1, input_port<0>(j)); + make_edge(f2, input_port<1>(j)); + make_edge(j, buf); + + std::size_t objects_count = 100; + for (std::size_t i = 0; i < objects_count; ++i) { + f1.try_put(i); + f2.try_put(objects_count - i - 1); + } + + g.wait_for_all(); + + std::size_t buf_size = 0; + tuple_type tpl; + + while(buf.try_get(tpl)) { + ++buf_size; + CHECK(std::get<0>(tpl).id == std::get<1>(tpl).id); + } + 
CHECK(buf_size == objects_count); +} + +//! Test that key_matching join_node uses std::invoke to run the body +//! \brief \ref requirement +TEST_CASE("key_matching join_node invoke semantics") { + test_invoke_basic(&test_invoke::SmartID::get_id, &test_invoke::SmartID::id); + test_invoke_basic(&test_invoke::SmartID::get_id_ref, &test_invoke::SmartID::get_id_ref); +} +#endif // __TBB_CPP17_INVOKE_PRESENT diff --git a/third-party/tbb/test/conformance/conformance_multifunction_node.cpp b/third-party/tbb/test/conformance/conformance_multifunction_node.cpp index 18f49d78..1ad7b530 100644 --- a/third-party/tbb/test/conformance/conformance_multifunction_node.cpp +++ b/third-party/tbb/test/conformance/conformance_multifunction_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #define CONFORMANCE_MULTIFUNCTION_NODE #include "conformance_flowgraph.h" +#include "common/test_invoke.h" //! \file conformance_multifunction_node.cpp //! \brief Test for [flow_graph.function_node] specification @@ -51,7 +52,7 @@ TEST_CASE("multifunction_node priority"){ conformance::test_priority>, input_msg>(oneapi::tbb::flow::unlimited); } -//! Test function_node has a user-settable concurrency limit. It can be set to one of predefined values. +//! Test function_node has a user-settable concurrency limit. It can be set to one of predefined values. //! The user can also provide a value of type std::size_t to limit concurrency. //! Test that not more than limited threads works in parallel. //! \brief \ref interface @@ -135,3 +136,53 @@ TEST_CASE("Test function_node Output and Input class") { using Body = conformance::copy_counting_object; conformance::test_output_input_class>, Body>(); } + +#if __TBB_CPP17_INVOKE_PRESENT +//! 
Test that multifunction_node uses std::invoke to execute the body +//! \brief \ref interface \ref requirement +TEST_CASE("Test multifunction_node and std::invoke") { + using namespace oneapi::tbb::flow; + + using output_type1 = test_invoke::SmartID; + using input_type = test_invoke::SmartID; + + using output_tuple1 = std::tuple; + using output_tuple2 = std::tuple; + + using first_mf_node_type = multifunction_node; + using second_mf_node_type = multifunction_node; + + using first_ports_type = typename first_mf_node_type::output_ports_type; + using second_ports_type = typename second_mf_node_type::output_ports_type; + + graph g; + + auto first_body = &input_type::template send_id; + auto second_body = &output_type1::template send_id; + + first_mf_node_type mf1(g, unlimited, first_body); + second_mf_node_type mf21(g, unlimited, second_body); + second_mf_node_type mf22(g, unlimited, second_body); + + buffer_node buf(g); + + make_edge(output_port<0>(mf1), mf21); + make_edge(output_port<1>(mf1), mf22); + + make_edge(output_port<0>(mf21), buf); + make_edge(output_port<0>(mf22), buf); + + mf1.try_put(input_type{output_type1{1}}); + + g.wait_for_all(); + + std::size_t buf_size = 0; + std::size_t tmp = 0; + while(buf.try_get(tmp)) { + ++buf_size; + CHECK(tmp == 1); + } + + CHECK(buf_size == 2); +} +#endif diff --git a/third-party/tbb/test/conformance/conformance_parallel_for.cpp b/third-party/tbb/test/conformance/conformance_parallel_for.cpp index ad92ba6b..44903f06 100644 --- a/third-party/tbb/test/conformance/conformance_parallel_for.cpp +++ b/third-party/tbb/test/conformance/conformance_parallel_for.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include "common/test.h" #include "common/utils.h" #include "common/utils_report.h" +#include "common/test_invoke.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/tick_count.h" @@ -244,6 +245,122 @@ void TestParallelForWithStepSupport() { oneapi::tbb::parallel_for(static_cast(2), static_cast(1), static_cast(1), TestFunctor()); } +#if __TBB_CPP17_INVOKE_PRESENT +class SmartIndex { +public: + SmartIndex(std::size_t ri) : real_index(ri), change_vector(nullptr) {} + SmartIndex(std::size_t ri, std::vector& cv) + : real_index(ri), change_vector(&cv) {} + SmartIndex(const SmartIndex& other) : real_index(other.real_index), + change_vector(other.change_vector) {} + ~SmartIndex() = default; + + SmartIndex& operator=(const SmartIndex& other) { + real_index = other.real_index; + change_vector = other.change_vector; + return *this; + } + + bool operator<(const SmartIndex& other) const { + return real_index < other.real_index; + } + + bool operator<=(const SmartIndex& other) const { + return real_index <= other.real_index; + } + + SmartIndex operator/(const SmartIndex& other) const { + return {real_index / other.real_index, *change_vector}; + } + + SmartIndex operator*(const SmartIndex& other) const { + return {real_index * other.real_index, *change_vector}; + } + + SmartIndex operator+(const SmartIndex& other) const { + return {real_index + other.real_index, *change_vector}; + } + + SmartIndex& operator+=(const SmartIndex& other) { + real_index += other.real_index; + return *this; + } + + SmartIndex& operator++() { ++real_index; return *this; } + + std::size_t operator-(const SmartIndex& other) const { + return real_index - other.real_index; + } + + SmartIndex operator+(std::size_t k) { + return {real_index + k, *change_vector}; + } + + void increase() const { + CHECK(change_vector); + ++(*change_vector)[real_index]; + } +private: + std::size_t real_index; + std::vector* change_vector; +}; + +void test_pfor_body_invoke() { + const std::size_t 
number_of_overloads = 5; + const std::size_t iterations = 100000; + + using range_type = test_invoke::SmartRange; + std::vector change_vector(iterations, 0); + range_type range{0, iterations, change_vector}; + + oneapi::tbb::parallel_for(range, &range_type::increase); + oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::simple_partitioner()); + oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::auto_partitioner()); + oneapi::tbb::parallel_for(range, &range_type::increase, oneapi::tbb::static_partitioner()); + oneapi::tbb::affinity_partitioner aff; + oneapi::tbb::parallel_for(range, &range_type::increase, aff); + + for (std::size_t item : change_vector) { + CHECK(item == number_of_overloads); + } +} + + +void test_pfor_func_invoke() { + const std::size_t number_of_overloads = 5; + const std::size_t iterations = 100000; + + std::vector change_vector(iterations, 0); + SmartIndex first{0, change_vector}; + SmartIndex last{iterations, change_vector}; + SmartIndex stride{2}; + + oneapi::tbb::parallel_for(first, last, &SmartIndex::increase); + oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::simple_partitioner()); + oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::auto_partitioner()); + oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, oneapi::tbb::static_partitioner()); + oneapi::tbb::affinity_partitioner aff; + oneapi::tbb::parallel_for(first, last, &SmartIndex::increase, aff); + + for (std::size_t& item : change_vector) { + CHECK(item == number_of_overloads); + item = 0; + } + + oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase); + oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::simple_partitioner()); + oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::auto_partitioner()); + oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, oneapi::tbb::static_partitioner()); + 
oneapi::tbb::parallel_for(first, last, stride, &SmartIndex::increase, aff); + + CHECK(change_vector[0] == number_of_overloads); + for (std::size_t i = 1; i < iterations; ++i) { + std::size_t expected = change_vector[i - 1] == 0 ? number_of_overloads : 0; + CHECK(change_vector[i] == expected); + } +} +#endif // __TBB_CPP17_INVOKE_PRESENT + //! Test simple parallel_for with different partitioners //! \brief \ref interface \ref requirement TEST_CASE("Basic parallel_for") { @@ -313,3 +430,12 @@ TEST_CASE("Testing parallel_for with partitioners") { parallel_for(Range1(true, false), b, oneapi::tbb::static_partitioner()); parallel_for(Range6(false, true), b, oneapi::tbb::static_partitioner()); } + +#if __TBB_CPP17_INVOKE_PRESENT +//! Test that parallel_for uses std::invoke to run body and function +//! \brief \ref interface \ref requirement +TEST_CASE("parallel_for and std::invoke") { + test_pfor_body_invoke(); + test_pfor_func_invoke(); +} +#endif diff --git a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp index 579cf26f..ad8ee672 100644 --- a/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp +++ b/third-party/tbb/test/conformance/conformance_parallel_for_each.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -91,6 +91,64 @@ void WorkProducingTest(Context&... 
context) { } } +#if __TBB_CPP17_INVOKE_PRESENT + +class ForEachInvokeItem { +public: + ForEachInvokeItem(std::size_t rv, std::vector& cv) : real_value(rv), change_vector(cv) {} + + void do_action() const { ++change_vector[real_value]; } + + void do_action_and_feed(oneapi::tbb::feeder& feeder) const { + CHECK_MESSAGE(change_vector.size() % 2 == 0, "incorrect test setup"); + std::size_t shift = change_vector.size() / 2; + std::cout << "Process " << real_value << std::endl; + ++change_vector[real_value]; + if (real_value < shift) { + std::cout << "Add " << real_value + shift << std::endl; + feeder.add(ForEachInvokeItem(real_value + shift, change_vector)); + } + } +private: + std::size_t real_value; + std::vector& change_vector; +}; + +template