diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b1e88d0e57..b0eb9f18bb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -79,8 +79,16 @@ jobs: cat /home/runner/work/orc/orc/build/java/rat.txt windows: - name: "Build on Windows" + name: "C++ ${{ matrix.simd }} Test on Windows" runs-on: windows-2019 + strategy: + fail-fast: false + matrix: + simd: + - General + - AVX512 + env: + ORC_USER_SIMD_LEVEL: AVX512 steps: - name: Checkout uses: actions/checkout@v2 @@ -89,13 +97,41 @@ jobs: with: msbuild-architecture: x64 - name: "Test" + shell: bash run: | mkdir build cd build - cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF + if [ "${{ matrix.simd }}" = "General" ]; then + cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF + else + cmake .. -G "Visual Studio 16 2019" -DCMAKE_BUILD_TYPE=Debug -DBUILD_LIBHDFSPP=OFF -DBUILD_TOOLS=OFF -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON + fi cmake --build . --config Debug ctest -C Debug --output-on-failure + simdUbuntu: + name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-22.04 + cxx: + - clang++ + env: + ORC_USER_SIMD_LEVEL: AVX512 + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: "Test" + run: | + mkdir -p ~/.m2 + mkdir build + cd build + cmake -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON .. 
+ make package test-out + doc: name: "Javadoc generation" runs-on: ubuntu-20.04 diff --git a/CMakeLists.txt b/CMakeLists.txt index 98a96b3fb1..8e1308fd68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,10 @@ option(BUILD_CPP_ENABLE_METRICS "Enable the metrics collection at compile phase" OFF) +option(BUILD_ENABLE_AVX512 + "Enable build with AVX512 at compile time" + OFF) + # Make sure that a build type is selected if (NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to ReleaseWithDebugInfo") @@ -121,7 +125,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set (WARN_FLAGS "${WARN_FLAGS} -Wno-covered-switch-default") set (WARN_FLAGS "${WARN_FLAGS} -Wno-missing-noreturn -Wno-unknown-pragmas") set (WARN_FLAGS "${WARN_FLAGS} -Wno-gnu-zero-variadic-macro-arguments") - set (WARN_FLAGS "${WARN_FLAGS} -Wconversion") + set (WARN_FLAGS "${WARN_FLAGS} -Wno-conversion") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") set (WARN_FLAGS "${WARN_FLAGS} -Wno-reserved-identifier") endif() @@ -140,7 +144,7 @@ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") else () set (CXX17_FLAGS "-std=c++17") endif () - set (WARN_FLAGS "-Wall -Wno-unknown-pragmas -Wconversion") + set (WARN_FLAGS "-Wall -Wno-unknown-pragmas -Wno-conversion") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "12.0") set (WARN_FLAGS "${WARN_FLAGS} -Wno-array-bounds -Wno-stringop-overread") # To compile protobuf in Fedora37 endif () @@ -174,6 +178,15 @@ enable_testing() INCLUDE(CheckSourceCompiles) INCLUDE(ThirdpartyToolchain) +message(STATUS "BUILD_ENABLE_AVX512: ${BUILD_ENABLE_AVX512}") +# +# macOS doesn't fully support AVX512, it has a different way dealing with AVX512 than Windows and Linux. 
+# +# Here can find the description: +# https://github.com/apple/darwin-xnu/blob/2ff845c2e033bd0ff64b5b6aa6063a1f8f65aa32/osfmk/i386/fpu.c#L174 +if (BUILD_ENABLE_AVX512 AND NOT APPLE) + INCLUDE(ConfigSimdLevel) +endif () set (EXAMPLE_DIRECTORY ${CMAKE_SOURCE_DIR}/examples) diff --git a/README.md b/README.md index f5216af83c..a32062828d 100644 --- a/README.md +++ b/README.md @@ -93,3 +93,18 @@ To build only the C++ library: % make test-out ``` + +To build the C++ library with AVX512 enabled: +```shell +export ORC_USER_SIMD_LEVEL=AVX512 +% mkdir build +% cd build +% cmake .. -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON +% make package +% make test-out +``` +Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries. + +Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization. + +Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time. diff --git a/c++/src/BitUnpackerAvx512.hh b/c++/src/BitUnpackerAvx512.hh new file mode 100644 index 0000000000..5b04866718 --- /dev/null +++ b/c++/src/BitUnpackerAvx512.hh @@ -0,0 +1,484 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BIT_UNPACKER_AVX512_HH +#define ORC_BIT_UNPACKER_AVX512_HH + +// Mingw-w64 defines strcasecmp in string.h +#if defined(_WIN32) && !defined(strcasecmp) +#include +#define strcasecmp stricmp +#else +#include +#endif + +#include +#include +#include + +namespace orc { +#define ORC_VECTOR_BITS_2_BYTE(x) \ + (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */ +#define ORC_VECTOR_ONE_64U (1ULL) +#define ORC_VECTOR_MAX_16U 0xFFFF /**< Max value for uint16_t */ +#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */ +#define ORC_VECTOR_BYTE_WIDTH 8u /**< Byte width in bits */ +#define ORC_VECTOR_WORD_WIDTH 16u /**< Word width in bits */ +#define ORC_VECTOR_DWORD_WIDTH 32u /**< Dword width in bits */ +#define ORC_VECTOR_QWORD_WIDTH 64u /**< Qword width in bits */ +#define ORC_VECTOR_BIT_MASK(x) \ + ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */ + +#define ORC_VECTOR_BITS_2_WORD(x) \ + (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */ +#define ORC_VECTOR_BITS_2_DWORD(x) \ + (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double words */ + + // ------------------------------------ 3u ----------------------------------------- + static const uint8_t shuffleIdxTable3u_0[64] = { + 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, + 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, + 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u}; + static 
const uint8_t shuffleIdxTable3u_1[64] = { + 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, + 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, + 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u}; + static const uint16_t shiftTable3u_0[32] = {13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, + 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, + 9u, 11u, 13u, 7u, 9u, 11u, 13u, 7u, 9u, 11u}; + static const uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable3u[32] = { + 0u, 1u, 2u, 0x0, 0x0, 0x0, 0x0, 0x0, 3u, 4u, 5u, 0x0, 0x0, 0x0, 0x0, 0x0, + 6u, 7u, 8u, 0x0, 0x0, 0x0, 0x0, 0x0, 9u, 10u, 11u, 0x0, 0x0, 0x0, 0x0, 0x0}; + + // ------------------------------------ 5u ----------------------------------------- + static const uint8_t shuffleIdxTable5u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, + 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; + static const uint8_t shuffleIdxTable5u_1[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, + 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, + 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u, 2u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 10u, 9u}; + static const uint16_t shiftTable5u_0[32] = {11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, + 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, + 7u, 5u, 11u, 9u, 7u, 5u, 11u, 9u, 7u, 5u}; + static const uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, + 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, + 6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u}; + static const uint16_t 
permutexIdxTable5u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 6u ----------------------------------------- + static const uint8_t shuffleIdxTable6u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; + static const uint8_t shuffleIdxTable6u_1[64] = { + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u, + 1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u}; + static const uint16_t shiftTable6u_0[32] = {10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, + 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, + 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u, 10u, 6u}; + static const uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static const uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 7u ----------------------------------------- + static const uint8_t shuffleIdxTable7u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u, + 1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u}; + static const uint8_t shuffleIdxTable7u_1[64] = { + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 
6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u, + 1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u}; + static const uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, + 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, + 5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u}; + static const uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable7u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 9u ----------------------------------------- + static const uint16_t permutexIdxTable9u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u, + 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 13u, 14u, 14u, 15u, 15u, 16u, 16u, 17u}; + static const uint16_t permutexIdxTable9u_1[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u, + 9u, 10u, 10u, 11u, 11u, 12u, 12u, 13u, 14u, 15u, 15u, 16u, 16u, 17u, 17u, 18u}; + static const uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + static const uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u, + 7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u}; + + static const uint8_t shuffleIdxTable9u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, + 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u}; + static const uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, + 4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, + 1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + static const uint64_t 
gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + + // ------------------------------------ 10u ----------------------------------------- + static const uint8_t shuffleIdxTable10u_0[64] = { + 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, + 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, + 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u}; + static const uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, + 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, + 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint16_t permutexIdxTable10u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 11u ----------------------------------------- + static const uint16_t permutexIdxTable11u_0[32] = { + 0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u, + 11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 20u, 21u}; + static const uint16_t permutexIdxTable11u_1[32] = { + 0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u, + 11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 21u, 22u}; + static const uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u, + 0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u}; + static const uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u, + 5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u}; + + static const uint8_t shuffleIdxTable11u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static const uint8_t shuffleIdxTable11u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 
0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u}; + static const uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u, + 21u, 15u, 17u, 19u, 21u, 15u, 17u, 19u}; + static const uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u, + 6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u}; + static const uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + + // ------------------------------------ 12u ----------------------------------------- + static const uint8_t shuffleIdxTable12u_0[64] = { + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u, + 1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u}; + static const uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, + 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static const uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 13u ----------------------------------------- + static const uint16_t permutexIdxTable13u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u, + 13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 24u, 25u}; + static const uint16_t permutexIdxTable13u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u, + 13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 25u, 26u}; + static const uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u, + 0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u}; + static const uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 
13u, + 3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u}; + + static const uint8_t shuffleIdxTable13u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static const uint8_t shuffleIdxTable13u_1[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u}; + static const uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u, + 19u, 17u, 15u, 13u, 19u, 17u, 15u, 13u}; + static const uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u, + 10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u}; + static const uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + + // ------------------------------------ 14u ----------------------------------------- + static const uint8_t shuffleIdxTable14u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; + static const uint8_t shuffleIdxTable14u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u}; + static const uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u, + 18u, 14u, 18u, 14u, 18u, 14u, 18u, 14u}; + static const uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 
8u, 12u, 8u, 12u, 8u, + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static const uint16_t permutexIdxTable14u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 15u ----------------------------------------- + static const uint16_t permutexIdxTable15u_0[32] = { + 0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, + 15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u}; + static const uint16_t permutexIdxTable15u_1[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, + 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 29u, 30u}; + static const uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u, + 0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u}; + static const uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; + + static const uint8_t shuffleIdxTable15u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u}; + static const uint8_t shuffleIdxTable15u_1[64] = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u, + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u}; + static const uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u, + 17u, 11u, 13u, 15u, 17u, 11u, 13u, 15u}; + static const uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; + static const uint64_t gatherIdxTable15u[8] = {0u, 
8u, 15u, 23u, 30u, 38u, 45u, 53u}; + + // ------------------------------------ 17u ----------------------------------------- + static const uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static const uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static const uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u}; + static const uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + + static const uint8_t shuffleIdxTable17u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; + static const uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u}; + static const uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u}; + + // ------------------------------------ 18u ----------------------------------------- + static const uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u}; + static const uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + static const uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u}; + static const uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u}; + + static const uint8_t shuffleIdxTable18u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u}; + static const uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u, + 14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u}; 
+ static const uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u}; + + // ------------------------------------ 19u ----------------------------------------- + static const uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u, + 4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u}; + static const uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u}; + static const uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u}; + static const uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u}; + + static const uint8_t shuffleIdxTable19u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; + static const uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u, + 13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u}; + static const uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u}; + + // ------------------------------------ 20u ----------------------------------------- + static const uint8_t shuffleIdxTable20u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, + 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u}; + static const uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u, + 12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u}; + static const uint16_t permutexIdxTable20u[32] = { + 0u, 1u, 2u, 3u, 4u, 0x0, 0x0, 0x0, 5u, 6u, 7u, 8u, 9u, 0x0, 0x0, 0x0, + 10u, 11u, 12u, 13u, 14u, 0x0, 0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 0x0, 0x0, 0x0}; + + // ------------------------------------ 21u ----------------------------------------- + static const uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 
1u, 2u, 2u, 3u, 3u, 4u, + 5u, 6u, 6u, 7u, 7u, 8u, 9u, 10u}; + static const uint32_t permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + 5u, 6u, 7u, 8u, 8u, 9u, 9u, 10u}; + static const uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u}; + static const uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u}; + + static const uint8_t shuffleIdxTable21u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, + 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, + 10u, 9u, 8u, 7u, 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static const uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u, + 11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u}; + static const uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 39u}; + + // ------------------------------------ 22u ----------------------------------------- + static const uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 6u, 7u, 8u, 9u, 9u, 10u}; + static const uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u, 5u, + 6u, 7u, 7u, 8u, 8u, 9u, 10u, 11u}; + static const uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u}; + static const uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u}; + + static const uint8_t shuffleIdxTable22u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u}; + static const uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u, + 10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u}; + static const uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 41u}; + + // ------------------------------------ 23u 
----------------------------------------- + static const uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u, + 5u, 6u, 7u, 8u, 8u, 9u, 10u, 11u}; + static const uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 5u, 6u, + 6u, 7u, 7u, 8u, 9u, 10u, 10u, 11u}; + static const uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u}; + static const uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u}; + + static const uint8_t shuffleIdxTable23u_0[64] = { + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static const uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, + 9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u}; + static const uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 42u}; + + // ------------------------------------ 24u ----------------------------------------- + static const uint8_t shuffleIdxTable24u_0[64] = { + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF, + 2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF}; + static const uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u, 5u, 0x0, + 6u, 7u, 8u, 0x0, 9u, 10u, 11u, 0x0}; + + // ------------------------------------ 26u ----------------------------------------- + static const uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u, + 6u, 7u, 8u, 9u, 9u, 10u, 11u, 12u}; + static const uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 5u, 6u, + 7u, 8u, 8u, 9u, 10u, 11u, 12u, 13u}; + static const uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u}; + static const 
uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u}; + + static const uint8_t shuffleIdxTable26u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u}; + static const uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, + 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u}; + static const uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 47u}; + + // ------------------------------------ 28u ----------------------------------------- + static const uint8_t shuffleIdxTable28u_0[64] = { + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u, + 3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u}; + static const uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, + 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u}; + static const uint16_t permutexIdxTable28u[32] = { + 0u, 1u, 2u, 3u, 4u, 5u, 6u, 0x0, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 0x0, + 14u, 15u, 16u, 17u, 18u, 19u, 20u, 0x0, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 0x0}; + + // ------------------------------------ 30u ----------------------------------------- + static const uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u, 5u, 6u, + 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u}; + static const uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, + 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u}; + static const uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u}; + static const uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u}; + + static const uint8_t shuffleIdxTable30u_0[64] = { + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 
3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u, + 0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u}; + static const uint8_t shuffleIdxTable30u_1[64] = { + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u, + 7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u}; + static const uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 30u}; + static const uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 24u}; + static const uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 53u}; + + static const uint64_t nibbleReverseTable[8] = { + 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901, + 0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 0x0F070B030D050901}; + + static const uint64_t reverseMaskTable1u[8] = { + 0x0001020304050607, 0x08090A0B0C0D0E0F, 0x1011121314151617, 0x18191A1B1C1D1E1F, + 0x2021222324252627, 0x28292A2B2C2D2E2F, 0x3031323334353637, 0x38393A3B3C3D3E3F}; + + static const uint64_t reverseMaskTable16u[8] = { + 0x0607040502030001, 0x0E0F0C0D0A0B0809, 0x1617141512131011, 0x1E1F1C1D1A1B1819, + 0x2627242522232021, 0x2E2F2C2D2A2B2829, 0x3637343532333031, 0x3E3F3C3D3A3B3839}; + + static const uint64_t reverseMaskTable32u[8] = { + 0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 0x1C1D1E1F18191A1B, + 0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 0x3C3D3E3F38393A3B}; + + inline uint32_t getAlign(uint32_t startBit, uint32_t base, uint32_t bitSize) { + uint32_t remnant = bitSize - startBit; + uint32_t retValue = 0xFFFFFFFF; + for (uint32_t i = 0u; i < bitSize; ++i) { + uint32_t testValue = (i * base) % bitSize; + if (testValue == remnant) { + retValue = i; + break; + } + } + return retValue; + } + + inline 
uint64_t moveByteLen(uint64_t numBits) { + uint64_t result = numBits / ORC_VECTOR_BYTE_WIDTH; + if (numBits % ORC_VECTOR_BYTE_WIDTH != 0) ++result; + return result; + } +} // namespace orc + +#endif diff --git a/c++/src/Bpacking.hh b/c++/src/Bpacking.hh new file mode 100644 index 0000000000..f55e986d8d --- /dev/null +++ b/c++/src/Bpacking.hh @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BPACKING_HH +#define ORC_BPACKING_HH + +#include + +namespace orc { + class RleDecoderV2; + + class BitUnpack { + public: + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + }; +} // namespace orc + +#endif diff --git a/c++/src/BpackingAvx512.cc b/c++/src/BpackingAvx512.cc new file mode 100644 index 0000000000..22f6972fb6 --- /dev/null +++ b/c++/src/BpackingAvx512.cc @@ -0,0 +1,2588 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BpackingAvx512.hh" +#include "BitUnpackerAvx512.hh" +#include "CpuInfoUtil.hh" +#include "RLEv2.hh" + +namespace orc { + UnpackAvx512::UnpackAvx512(RleDecoderV2* dec) : decoder(dec), unpackDefault(UnpackDefault(dec)) { + // PASS + } + + UnpackAvx512::~UnpackAvx512() { + // PASS + } + + template + inline void UnpackAvx512::alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, + uint64_t& remainingNumElements, + uint64_t& tailBitLen, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr) { + uint64_t numBits = remainingNumElements * bitWidth; + if (hasBitOffset && startBit != 0) { + numBits += startBit - ORC_VECTOR_BYTE_WIDTH; + } + bufMoveByteLen += moveByteLen(numBits); + + if (bufMoveByteLen <= bufRestByteLen) { + numElements = remainingNumElements; + resetBuf = false; + remainingNumElements = 0; + } else { + uint64_t leadingBits = 0; + if (hasBitOffset && startBit != 0) leadingBits = ORC_VECTOR_BYTE_WIDTH - startBit; + uint64_t bufRestBitLen = bufRestByteLen * ORC_VECTOR_BYTE_WIDTH + leadingBits; + numElements = bufRestBitLen / bitWidth; + remainingNumElements -= numElements; + tailBitLen = fmod(bufRestBitLen, bitWidth); + resetBuf = true; + } + + if (tailBitLen != 0) { + backupByteLen = tailBitLen / ORC_VECTOR_BYTE_WIDTH; 
+ tailBitLen = 0; + } + + if (hasBitOffset && startBit > 0) { + uint32_t align = getAlign(startBit, bitWidth, bitMaxSize); + if (align > numElements) { + align = numElements; + } + if (align != 0) { + bufMoveByteLen -= moveByteLen(align * bitWidth + startBit - ORC_VECTOR_BYTE_WIDTH); + plainUnpackLongs(dstPtr, 0, align, bitWidth, startBit); + srcPtr = reinterpret_cast(decoder->getBufStart()); + bufRestByteLen = decoder->bufLength(); + dstPtr += align; + numElements -= align; + } + } + } + + template + inline void UnpackAvx512::alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, + uint64_t& remainingNumElements, + uint32_t& backupByteLen, uint64_t& numElements, + bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr) { + if (numElements > 0) { + uint64_t numBits = numElements * bitWidth; + if (hasBitOffset && startBit != 0) { + numBits += startBit - ORC_VECTOR_BYTE_WIDTH; + } + bufMoveByteLen -= moveByteLen(numBits); + if (hasBitOffset) { + plainUnpackLongs(dstPtr, 0, numElements, bitWidth, startBit); + } else { + switch (specialBit) { + case 16: + unpackDefault.unrolledUnpack16(dstPtr, 0, numElements); + break; + case 24: + unpackDefault.unrolledUnpack24(dstPtr, 0, numElements); + break; + case 32: + unpackDefault.unrolledUnpack32(dstPtr, 0, numElements); + break; + default: + break; + } + } + srcPtr = reinterpret_cast(decoder->getBufStart()); + dstPtr += numElements; + bufRestByteLen = decoder->bufLength(); + } + + if (bufMoveByteLen <= bufRestByteLen) { + decoder->resetBufferStart(bufMoveByteLen, resetBuf, backupByteLen); + return; + } + + decoder->resetBufferStart(bufRestByteLen, resetBuf, backupByteLen); + if (backupByteLen != 0) { + if (hasBitOffset) { + plainUnpackLongs(dstPtr, 0, 1, bitWidth, startBit); + } else { + switch (specialBit) { + case 16: + unpackDefault.unrolledUnpack16(dstPtr, 0, 1); + break; + case 24: + unpackDefault.unrolledUnpack24(dstPtr, 0, 
1); + break; + case 32: + unpackDefault.unrolledUnpack32(dstPtr, 0, 1); + break; + default: + break; + } + } + dstPtr++; + backupByteLen = 0; + remainingNumElements--; + } + + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen = 0; + srcPtr = reinterpret_cast(decoder->getBufStart()); + } + + void UnpackAvx512::vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 1; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __m512i reverseMask1u = _mm512_loadu_si512(reverseMaskTable1u); + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint64_t src_64 = *reinterpret_cast(const_cast(srcPtr)); + // convert mask to 512-bit register. 
0 --> 0x00, 1 --> 0xFF + __m512i srcmm = _mm512_movm_epi8(src_64); + // make 0x00 --> 0x00, 0xFF --> 0x01 + srcmm = _mm512_abs_epi8(srcmm); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask1u); + _mm512_storeu_si512(simdPtr, srcmm); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 2; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_MAX_16U; // first 16 bytes (64 elements) + __m512i parse_mask = _mm512_set1_epi16(0x0303); // 2 times 1 then (8 - 2) times 0 + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm3 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + __m512i srcmm0, srcmm1, srcmm2, tmpmm; + + srcmm2 = _mm512_srli_epi16(srcmm3, 2); + srcmm1 = _mm512_srli_epi16(srcmm3, 4); + srcmm0 = _mm512_srli_epi16(srcmm3, 6); + + // turn 2 bitWidth into 8 by zeroing 3 of each 4 elements. 
+ // move them into their places + // srcmm0: a e i m 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm1: b f j n 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 00 00 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // ij mn 00 00 00 00 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x00); // ab ef ab ef ij mn ij mn + + // srcmm2: c g k o 0 0 0 0 0 0 0 0 0 0 0 0 + // srcmm3: d h l p 0 0 0 0 0 0 0 0 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm2, srcmm3); // cd gh 00 00 00 00 00 00 + srcmm1 = _mm512_unpackhi_epi8(srcmm2, srcmm3); // kl op 00 00 00 00 00 00 + srcmm1 = _mm512_shuffle_i64x2(tmpmm, srcmm1, 0x00); // cd gh cd gh kl op kl op + + tmpmm = _mm512_unpacklo_epi16(srcmm0, srcmm1); // abcd abcd ijkl ijkl + srcmm0 = _mm512_unpackhi_epi16(srcmm0, srcmm1); // efgh efgh mnop mnop + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x88); // abcd ijkl efgh mnop + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // abcd efgh ijkl mnop + + srcmm0 = _mm512_and_si512(srcmm0, parse_mask); + + _mm512_storeu_si512(simdPtr, srcmm0); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 3; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + 
alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable3u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable3u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable3u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable3u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable3u_1); + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, 
numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 4; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_MAX_32U; // first 32 bytes (64 elements) + __m512i parseMask = _mm512_set1_epi16(0x0F0F); // 4 times 1 then (8 - 4) times 0 + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm0, srcmm1, tmpmm; + + srcmm1 = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm0 = _mm512_srli_epi16(srcmm1, 4); + + // move elements into their places + // srcmm0: a c e g 0 0 0 0 + // srcmm1: b d f h 0 0 0 0 + tmpmm = _mm512_unpacklo_epi8(srcmm0, srcmm1); // ab ef 00 00 + srcmm0 = _mm512_unpackhi_epi8(srcmm0, srcmm1); // cd gh 00 00 + srcmm0 = _mm512_shuffle_i64x2(tmpmm, srcmm0, 0x44); // ab ef cd gh + srcmm0 = _mm512_shuffle_i64x2(srcmm0, srcmm0, 0xD8); // ab cd ef gh + + // turn 4 bitWidth into 8 by zeroing 4 of each 8 bits. 
+ srcmm0 = _mm512_and_si512(srcmm0, parseMask); + + _mm512_storeu_si512(simdPtr, srcmm0); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 5; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable5u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable5u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable5u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable5u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable5u_1); + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // 
shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 6; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable6u); + 
+ __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable6u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable6u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable6u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable6u_1); + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 7; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + 
alignHeaderBoundary(bitWidth, UNPACK_8Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + uint8_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask64 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_BYTE(bitWidth * 64)); + __m512i parseMask = _mm512_set1_epi8(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable7u); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable7u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable7u_1); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable7u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable7u_1); + + while (numElements >= VECTOR_UNPACK_8BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi8(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi16(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi8(zmm[0], 0xAAAAAAAAAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 8 * bitWidth; + decoder->resetBufferStart(8 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 8 * bitWidth; + numElements -= VECTOR_UNPACK_8BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_8BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_8BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, 
numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 9; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable9u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable9u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable9u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable9u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable9u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable9u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable9u); + + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi16(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + 
_mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 7); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(simdPtr, 
zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 10; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable10u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable10u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable10u); + + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(simdPtr, zmm); + + srcPtr 
+= 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 11; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable11u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable11u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable11u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable11u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable11u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable11u_1); + 
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable11u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable11u_3); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable11u); + + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + 
// shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 5); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 12; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= 
VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable12u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable12u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable12u); + + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi16(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(simdPtr, zmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 13; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= 
VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable13u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable13u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable13u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable13u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable13u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable13u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable13u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable13u_3); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable13u); + + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + 
std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 3); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverse_mask_16u); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + 
VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 14; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable14u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable14u_1); + + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable14u); + + __m512i shiftMaskPtr[2]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable14u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable14u_1); + + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + srcmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], 
shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 15; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf); + __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32)); + __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable15u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable15u_1); + + __m512i permutexIdxPtr[2]; + 
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable15u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable15u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable15u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable15u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable15u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable15u_3); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable15u); + + while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm 
= _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi16(zmm[0], 1); + + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u); + + _mm512_storeu_si512(simdPtr, zmm[0]); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 16; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = len; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + int64_t* dstPtr = data + offset; + bool resetBuf = false; + uint64_t tailBitLen = 0; + 
uint32_t backupByteLen = 0; + uint64_t startBit = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf); + __m512i reverse_mask_16u = _mm512_loadu_si512(reverseMaskTable16u); + while (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverse_mask_16u); + _mm512_storeu_si512(simdPtr, srcmm); + + srcPtr += 4 * bitWidth; + decoder->resetBufferStart(4 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 4 * bitWidth; + numElements -= VECTOR_UNPACK_16BIT_MAX_NUM; + std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 16, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 17; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = 
_mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable17u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable17u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable17u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable17u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable17u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable17u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable17u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = 
_mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 15); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 18; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + 
bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable18u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable18u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable18u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable18u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable18u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable18u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable18u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = 
_mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 14); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 19; + const uint8_t* srcPtr = 
reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable19u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable19u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable19u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable19u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable19u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable19u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable19u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + 
VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 13); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, 
dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 20; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable20u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable20u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable20u); + + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + 
alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 21; + const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable21u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable21u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable21u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable21u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable21u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable21u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable21u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); 
+ + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 11); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + 
_mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 22; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable22u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable22u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable22u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable22u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable22u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable22u_2); + + __m512i gatherIdxmm = 
_mm512_loadu_si512(gatherIdxTable22u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 10); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = 
_mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 23; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask32 readMask = ORC_VECTOR_BIT_MASK(bitWidth); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable23u_0); + + __m512i permutexIdxPtr[2]; + 
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable23u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable23u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable23u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable23u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable23u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable23u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = 
_mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 9); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 24; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + uint64_t startBit = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + + __m512i shuffleIdx = _mm512_loadu_si512(shuffleIdxTable24u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable24u); + + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi32(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdx); + + _mm512_storeu_si512(vectorBuf, zmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 24, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 26; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr = 
_mm512_loadu_si512(shuffleIdxTable26u_0); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable26u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable26u_1); + + __m512i shiftMaskPtr[3]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable26u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable26u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable26u_2); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable26u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1); + + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements 
so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 6); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 28; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) 
{ + __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + + __m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable28u_0); + __m512i permutexIdx = _mm512_loadu_si512(permutexIdxTable28u); + __m512i shiftMask = _mm512_loadu_si512(shiftTable28u); + + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + zmm = _mm512_permutexvar_epi16(permutexIdx, srcmm); + zmm = _mm512_shuffle_epi8(zmm, shuffleIdxPtr); + + // shifting elements so they start from the start of the word + zmm = _mm512_srlv_epi32(zmm, shiftMask); + zmm = _mm512_and_si512(zmm, parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 30; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t startBit = 0; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __mmask16 readMask = 
ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16)); + __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth)); + __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable); + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + __m512i maskmm = _mm512_set1_epi8(0x0F); + + __m512i shuffleIdxPtr[2]; + shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable30u_0); + shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable30u_1); + + __m512i permutexIdxPtr[2]; + permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable30u_0); + permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable30u_1); + + __m512i shiftMaskPtr[4]; + shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable30u_0); + shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable30u_1); + shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable30u_2); + shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable30u_3); + + __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable30u); + + while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u); + + // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]); + zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; 
+ } + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm, zmm[2]; + + srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr); + + __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm); + __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + srcmm = _mm512_or_si512(lowNibblemm, highNibblemm); + + // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones + zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm); + zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm); + + // shifting elements so they start from the start of the word + zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]); + zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]); + + // gathering even and odd elements together + zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]); + zmm[0] = _mm512_and_si512(zmm[0], parseMask0); + + zmm[0] = _mm512_slli_epi32(zmm[0], 2u); + lowNibblemm = _mm512_and_si512(zmm[0], maskmm); + highNibblemm = _mm512_srli_epi16(zmm[0], 4u); + highNibblemm = _mm512_and_si512(highNibblemm, maskmm); + + lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm); + highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm); + lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u); + + zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm); + zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u); + + _mm512_storeu_si512(vectorBuf, zmm[0]); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + 
alignTailerBoundary(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint32_t bitWidth = 32; + const uint8_t* srcPtr = reinterpret_cast(decoder->getBufStart()); + uint64_t numElements = 0; + int64_t* dstPtr = data + offset; + uint64_t bufMoveByteLen = 0; + uint64_t bufRestByteLen = decoder->bufLength(); + bool resetBuf = false; + uint64_t tailBitLen = 0; + uint32_t backupByteLen = 0; + uint64_t startBit = 0; + + while (len > 0) { + alignHeaderBoundary(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen, + bufRestByteLen, len, tailBitLen, backupByteLen, numElements, + resetBuf, srcPtr, dstPtr); + + if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u); + while (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) { + __m512i srcmm = _mm512_loadu_si512(srcPtr); + srcmm = _mm512_shuffle_epi8(srcmm, reverseMask32u); + _mm512_storeu_si512(vectorBuf, srcmm); + + srcPtr += 2 * bitWidth; + decoder->resetBufferStart(2 * bitWidth, false, 0); + bufRestByteLen = decoder->bufLength(); + bufMoveByteLen -= 2 * bitWidth; + numElements -= VECTOR_UNPACK_32BIT_MAX_NUM; + std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr); + dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM; + } + } + + alignTailerBoundary(bitWidth, 32, startBit, bufMoveByteLen, bufRestByteLen, len, + backupByteLen, numElements, resetBuf, srcPtr, dstPtr); + } + } + + void UnpackAvx512::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > decoder->getBitsLeft()) { + result <<= decoder->getBitsLeft(); + result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); + bitsLeftToRead -= 
decoder->getBitsLeft(); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + startBit = decoder->getBitsLeft() == 0 ? 0 : (8 - decoder->getBitsLeft()); + } + } + + void BitUnpackAVX512::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { + UnpackAvx512 unpackAvx512(decoder); + UnpackDefault unpackDefault(decoder); + uint64_t startBit = 0; + static const auto cpu_info = CpuInfo::getInstance(); + if (cpu_info->isSupported(CpuInfo::AVX512)) { + switch (fbs) { + case 1: + unpackAvx512.vectorUnpack1(data, offset, len); + break; + case 2: + unpackAvx512.vectorUnpack2(data, offset, len); + break; + case 3: + unpackAvx512.vectorUnpack3(data, offset, len); + break; + case 4: + unpackAvx512.vectorUnpack4(data, offset, len); + break; + case 5: + unpackAvx512.vectorUnpack5(data, offset, len); + break; + case 6: + unpackAvx512.vectorUnpack6(data, offset, len); + break; + case 7: + unpackAvx512.vectorUnpack7(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 9: + unpackAvx512.vectorUnpack9(data, offset, len); + break; + case 10: + unpackAvx512.vectorUnpack10(data, offset, len); + break; + case 11: + unpackAvx512.vectorUnpack11(data, offset, len); + break; + case 12: + unpackAvx512.vectorUnpack12(data, offset, len); + break; + case 13: + unpackAvx512.vectorUnpack13(data, offset, len); + break; + case 14: + unpackAvx512.vectorUnpack14(data, offset, len); + break; + case 15: + unpackAvx512.vectorUnpack15(data, offset, len); + break; + case 16: + unpackAvx512.vectorUnpack16(data, offset, len); + break; + case 17: + unpackAvx512.vectorUnpack17(data, offset, len); + 
break; + case 18: + unpackAvx512.vectorUnpack18(data, offset, len); + break; + case 19: + unpackAvx512.vectorUnpack19(data, offset, len); + break; + case 20: + unpackAvx512.vectorUnpack20(data, offset, len); + break; + case 21: + unpackAvx512.vectorUnpack21(data, offset, len); + break; + case 22: + unpackAvx512.vectorUnpack22(data, offset, len); + break; + case 23: + unpackAvx512.vectorUnpack23(data, offset, len); + break; + case 24: + unpackAvx512.vectorUnpack24(data, offset, len); + break; + case 26: + unpackAvx512.vectorUnpack26(data, offset, len); + break; + case 28: + unpackAvx512.vectorUnpack28(data, offset, len); + break; + case 30: + unpackAvx512.vectorUnpack30(data, offset, len); + break; + case 32: + unpackAvx512.vectorUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackAvx512.plainUnpackLongs(data, offset, len, fbs, startBit); + break; + } + } else { + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. 
+ unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + } + } +} // namespace orc diff --git a/c++/src/BpackingAvx512.hh b/c++/src/BpackingAvx512.hh new file mode 100644 index 0000000000..7197b67d4d --- /dev/null +++ b/c++/src/BpackingAvx512.hh @@ -0,0 +1,155 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_BPACKINGAVX512_HH +#define ORC_BPACKINGAVX512_HH + +#include +#include + +#include "BpackingDefault.hh" + +namespace orc { + +#define VECTOR_UNPACK_8BIT_MAX_NUM 64 +#define VECTOR_UNPACK_16BIT_MAX_NUM 32 +#define VECTOR_UNPACK_32BIT_MAX_NUM 16 +#define UNPACK_8Bit_MAX_SIZE 8 +#define UNPACK_16Bit_MAX_SIZE 16 +#define UNPACK_32Bit_MAX_SIZE 32 + + class RleDecoderV2; + + class UnpackAvx512 { + public: + UnpackAvx512(RleDecoderV2* dec); + ~UnpackAvx512(); + + void vectorUnpack1(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack2(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack3(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack5(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack6(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack7(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack9(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack10(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack11(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack12(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack13(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack14(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack17(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack18(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack19(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack20(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack21(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack23(int64_t* data, uint64_t offset, uint64_t len); + void 
vectorUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack26(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack28(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len); + void vectorUnpack32(int64_t* data, uint64_t offset, uint64_t len); + + void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs, + uint64_t& startBit); + + /** + * In the processing of AVX512 unpacking, AVX512 instructions can only process memory-aligned + * data. It means that if the data input is not memory-aligned (@param startBit != 0), we need to + * process the unaligned data first. After that, AVX512 instructions can be used to process the + * memory-aligned data. + * + * @tparam hasBitOffset Whether the currently processed data has offset bits in one Byte; 8X-bit + * width data will not have a bit offset in one Byte, so it will be false. For other bit widths + * it will be true. + * @param bitWidth The unpacking data bit width + * @param bitMaxSize The max bit size (8X) needed by the unpacking data + * @param startBit The start bit position in one Byte + * @param bufMoveByteLen In the current buffer, the number of Bytes that will be processed/moved + * in the unpacking + * @param bufRestByteLen In the current buffer, the number of Bytes remaining after + * unpacking + * @param remainingNumElements After unpacking, the remaining number of elements that need to be + * processed + * @param tailBitLen After unpacking, the tail bits length + * @param backupByteLen The backup Byte length after unpacking + * @param numElements Currently, the number of elements that need to be processed + * @param resetBuf When the current buffer has already been processed, the buffer needs to be + * reset + * @param srcPtr the pointer of source data + * @param dstPtr the pointer of destination data + */ + template <bool hasBitOffset> + inline void alignHeaderBoundary(const uint32_t bitWidth, const uint32_t bitMaxSize, + uint64_t& startBit, 
uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& remainingNumElements, + uint64_t& tailBitLen, uint32_t& backupByteLen, + uint64_t& numElements, bool& resetBuf, const uint8_t*& srcPtr, + int64_t*& dstPtr); + + /** + * After the AVX512 unpacking has been processed, there could be some scattered data left + * unprocessed; it needs to be processed in the default way. + * + * @tparam hasBitOffset Whether the currently processed data has offset bits in one Byte; 8X-bit + * width data will not have a bit offset in one Byte, so it will be false. For other bit widths + * it will be true. + * @param bitWidth The unpacking data bit width + * @param specialBit 8X bit width data is the specialBit; it has a different unpackDefault + * function from the others + * @param startBit The start bit position in one Byte + * @param bufMoveByteLen In the current buffer, the number of Bytes that will be processed/moved + * in the unpacking + * @param bufRestByteLen In the current buffer, the number of Bytes remaining after + * unpacking + * @param remainingNumElements After unpacking, the remaining number of elements that need to be + * processed + * @param backupByteLen The backup Byte length after unpacking + * @param numElements Currently, the number of elements that need to be processed + * @param resetBuf When the current buffer has already been processed, the buffer needs to be + * reset + * @param srcPtr the pointer of source data + * @param dstPtr the pointer of destination data + */ + template <bool hasBitOffset> + inline void alignTailerBoundary(const uint32_t bitWidth, const uint32_t specialBit, + uint64_t& startBit, uint64_t& bufMoveByteLen, + uint64_t& bufRestByteLen, uint64_t& remainingNumElements, + uint32_t& backupByteLen, uint64_t& numElements, bool& resetBuf, + const uint8_t*& srcPtr, int64_t*& dstPtr); + + private: + RleDecoderV2* decoder; + UnpackDefault unpackDefault; + + // Used by vectorized bit-unpacking data + uint32_t vectorBuf[VECTOR_UNPACK_32BIT_MAX_NUM + 1]; + }; + + class BitUnpackAVX512 : public 
BitUnpack { + public: + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + }; + +} // namespace orc + +#endif diff --git a/c++/src/BpackingDefault.cc b/c++/src/BpackingDefault.cc new file mode 100644 index 0000000000..5a80bc6fb1 --- /dev/null +++ b/c++/src/BpackingDefault.cc @@ -0,0 +1,368 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BpackingDefault.hh" +#include "RLEv2.hh" +#include "Utils.hh" + +namespace orc { + + UnpackDefault::UnpackDefault(RleDecoderV2* dec) : decoder(dec) { + // PASS + } + + UnpackDefault::~UnpackDefault() { + // PASS + } + + void UnpackDefault::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. 
+ while (decoder->getBitsLeft() > 0 && curIdx < offset + len) { + decoder->setBitsLeft(decoder->getBitsLeft() - 4); + data[curIdx++] = (decoder->getCurByte() >> decoder->getBitsLeft()) & 15; + } + if (curIdx == offset + len) return; + + // Exhaust the buffer + uint64_t numGroups = (offset + len - curIdx) / 2; + numGroups = std::min(numGroups, static_cast(decoder->bufLength())); + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast(decoder->getBufStart()); + uint32_t localByte; + for (uint64_t i = 0; i < numGroups; ++i) { + localByte = *buffer++; + data[curIdx] = (localByte >> 4) & 15; + data[curIdx + 1] = localByte & 15; + curIdx += 2; + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd' + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); + } + } + + void UnpackDefault::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength(); + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + data[curIdx++] = *buffer++; + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // readByte() will update 'bufferStart' and 'bufferEnd'. + data[curIdx++] = decoder->readByte(); + } + } + + void UnpackDefault::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 2; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint16_t b0, b1; + // Avoid updating 'bufferStart' inside the loop. 
+ auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + buffer += 2; + data[curIdx++] = (b0 << 8) | b1; + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + data[curIdx++] = (b0 << 8) | b1; + } + } + + void UnpackDefault::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 3; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + buffer += 3; + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + //////decoder->bufferStart += bufferNum * 3; + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); + } + } + + void UnpackDefault::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 4; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint32_t b0, b1, b2, b3; + // Avoid updating 'bufferStart' inside the loop. 
+ auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + buffer += 4; + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); + } + } + + void UnpackDefault::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 5; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + buffer += 5; + data[curIdx++] = + static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
+ b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + } + } + + void UnpackDefault::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 6; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5; + // Avoid updating 'bufferStart' inside the loop. + auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + buffer += 6; + data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | + (b4 << 8) | b5); + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + data[curIdx++] = + static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); + } + } + + void UnpackDefault::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 7; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6; + // Avoid updating 'bufferStart' inside the loop. 
+ auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + buffer += 7; + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); + data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | + (b4 << 16) | (b5 << 8) | b6); + } + } + + void UnpackDefault::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { + uint64_t curIdx = offset; + while (curIdx < offset + len) { + // Exhaust the buffer + int64_t bufferNum = decoder->bufLength() / 8; + bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); + uint64_t b0, b1, b2, b3, b4, b5, b6, b7; + // Avoid updating 'bufferStart' inside the loop. 
+ auto* buffer = reinterpret_cast(decoder->getBufStart()); + for (int i = 0; i < bufferNum; ++i) { + b0 = static_cast(*buffer); + b1 = static_cast(*(buffer + 1)); + b2 = static_cast(*(buffer + 2)); + b3 = static_cast(*(buffer + 3)); + b4 = static_cast(*(buffer + 4)); + b5 = static_cast(*(buffer + 5)); + b6 = static_cast(*(buffer + 6)); + b7 = static_cast(*(buffer + 7)); + buffer += 8; + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + decoder->setBufStart(reinterpret_cast(buffer)); + if (curIdx == offset + len) return; + + // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. + b0 = decoder->readByte(); + b1 = decoder->readByte(); + b2 = decoder->readByte(); + b3 = decoder->readByte(); + b4 = decoder->readByte(); + b5 = decoder->readByte(); + b6 = decoder->readByte(); + b7 = decoder->readByte(); + data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | + (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); + } + } + + void UnpackDefault::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + for (uint64_t i = offset; i < (offset + len); i++) { + uint64_t result = 0; + uint64_t bitsLeftToRead = fbs; + while (bitsLeftToRead > decoder->getBitsLeft()) { + result <<= decoder->getBitsLeft(); + result |= decoder->getCurByte() & ((1 << decoder->getBitsLeft()) - 1); + bitsLeftToRead -= decoder->getBitsLeft(); + decoder->setCurByte(decoder->readByte()); + decoder->setBitsLeft(8); + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + decoder->setBitsLeft(decoder->getBitsLeft() - static_cast(bitsLeftToRead)); + result |= (decoder->getCurByte() >> decoder->getBitsLeft()) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast(result); + } + } + + void BitUnpackDefault::readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, + uint64_t len, uint64_t fbs) { + UnpackDefault 
unpackDefault(decoder); + switch (fbs) { + case 4: + unpackDefault.unrolledUnpack4(data, offset, len); + break; + case 8: + unpackDefault.unrolledUnpack8(data, offset, len); + break; + case 16: + unpackDefault.unrolledUnpack16(data, offset, len); + break; + case 24: + unpackDefault.unrolledUnpack24(data, offset, len); + break; + case 32: + unpackDefault.unrolledUnpack32(data, offset, len); + break; + case 40: + unpackDefault.unrolledUnpack40(data, offset, len); + break; + case 48: + unpackDefault.unrolledUnpack48(data, offset, len); + break; + case 56: + unpackDefault.unrolledUnpack56(data, offset, len); + break; + case 64: + unpackDefault.unrolledUnpack64(data, offset, len); + break; + default: + // Fallback to the default implementation for deprecated bit size. + unpackDefault.plainUnpackLongs(data, offset, len, fbs); + break; + } + } + +} // namespace orc diff --git a/c++/src/BpackingDefault.hh b/c++/src/BpackingDefault.hh new file mode 100644 index 0000000000..0a58234495 --- /dev/null +++ b/c++/src/BpackingDefault.hh @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_BPACKINGDEFAULT_HH +#define ORC_BPACKINGDEFAULT_HH + +#include +#include + +#include "Bpacking.hh" + +namespace orc { + class RleDecoderV2; + + class UnpackDefault { + public: + UnpackDefault(RleDecoderV2* dec); + ~UnpackDefault(); + + void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); + void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); + + void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); + + private: + RleDecoderV2* decoder; + }; + + class BitUnpackDefault : public BitUnpack { + public: + static void readLongs(RleDecoderV2* decoder, int64_t* data, uint64_t offset, uint64_t len, + uint64_t fbs); + }; + +} // namespace orc + +#endif diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index 16b5549b9f..972f2fc035 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -166,6 +166,7 @@ set(SOURCE_FILES Adaptor.cc BlockBuffer.cc BloomFilter.cc + BpackingDefault.cc ByteRLE.cc ColumnPrinter.cc ColumnReader.cc @@ -173,6 +174,7 @@ set(SOURCE_FILES Common.cc Compression.cc ConvertColumnReader.cc + CpuInfoUtil.cc Exceptions.cc Int128.cc LzoDecompressor.cc @@ -198,6 +200,12 @@ if(BUILD_LIBHDFSPP) add_definitions(-DBUILD_LIBHDFSPP) endif(BUILD_LIBHDFSPP) +if(BUILD_ENABLE_AVX512) + set(SOURCE_FILES + ${SOURCE_FILES} + BpackingAvx512.cc) +endif(BUILD_ENABLE_AVX512) + add_library (orc STATIC ${SOURCE_FILES}) target_link_libraries (orc diff --git a/c++/src/CpuInfoUtil.cc 
b/c++/src/CpuInfoUtil.cc new file mode 100644 index 0000000000..bf32617c43 --- /dev/null +++ b/c++/src/CpuInfoUtil.cc @@ -0,0 +1,545 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CpuInfoUtil.cc is from Apache Arrow as of 2023-03-21 + */ + +#include "CpuInfoUtil.hh" + +#ifdef __APPLE__ +#include +#endif + +#ifndef _MSC_VER +#include +#endif + +#ifdef _WIN32 +#define NOMINMAX +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "orc/Exceptions.hh" + +#undef CPUINFO_ARCH_X86 + +#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +#define CPUINFO_ARCH_X86 +#endif + +#ifndef ORC_HAVE_RUNTIME_AVX512 +#define UNUSED(x) (void)(x) +#endif + +namespace orc { + + namespace { + + constexpr int kCacheLevels = static_cast(CpuInfo::CacheLevel::Last) + 1; + + //============================== OS Dependent ==============================// + +#if defined(_WIN32) + //------------------------------ WINDOWS ------------------------------// + void OsRetrieveCacheSize(std::array* cache_sizes) { + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = nullptr; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION 
buffer_position = nullptr; + DWORD buffer_size = 0; + size_t offset = 0; + typedef BOOL(WINAPI * GetLogicalProcessorInformationFuncPointer)(void*, void*); + GetLogicalProcessorInformationFuncPointer func_pointer = + (GetLogicalProcessorInformationFuncPointer)GetProcAddress( + GetModuleHandle("kernel32"), "GetLogicalProcessorInformation"); + + if (!func_pointer) { + throw ParseError("Failed to find procedure GetLogicalProcessorInformation"); + } + + // Get buffer size + if (func_pointer(buffer, &buffer_size) && GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + throw ParseError("Failed to get size of processor information buffer"); + } + + buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(buffer_size); + if (!buffer) { + return; + } + + if (!func_pointer(buffer, &buffer_size)) { + free(buffer); + throw ParseError("Failed to get processor information"); + } + + buffer_position = buffer; + while (offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= buffer_size) { + if (RelationCache == buffer_position->Relationship) { + PCACHE_DESCRIPTOR cache = &buffer_position->Cache; + if (cache->Level >= 1 && cache->Level <= kCacheLevels) { + const int64_t current = (*cache_sizes)[cache->Level - 1]; + (*cache_sizes)[cache->Level - 1] = std::max(current, cache->Size); + } + } + offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buffer_position++; + } + + free(buffer); + } + +#if defined(CPUINFO_ARCH_X86) + // On x86, get CPU features by cpuid, https://en.wikipedia.org/wiki/CPUID + +#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5 + void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) { + __asm__ __volatile__("cpuid" + : "=a"(CPUInfo[0]), "=b"(CPUInfo[1]), "=c"(CPUInfo[2]), "=d"(CPUInfo[3]) + : "a"(function_id), "c"(subfunction_id)); + } + + int64_t _xgetbv(int xcr) { + int out = 0; + __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx"); + return out; + } +#endif // MINGW + + void OsRetrieveCpuInfo(int64_t* hardware_flags, 
CpuInfo::Vendor* vendor, + std::string* model_name) { + int register_EAX_id = 1; + int highest_valid_id = 0; + int highest_extended_valid_id = 0; + std::bitset<32> features_ECX; + std::array cpu_info; + + // Get highest valid id + __cpuid(cpu_info.data(), 0); + highest_valid_id = cpu_info[0]; + // HEX of "GenuineIntel": 47656E75 696E6549 6E74656C + // HEX of "AuthenticAMD": 41757468 656E7469 63414D44 + if (cpu_info[1] == 0x756e6547 && cpu_info[3] == 0x49656e69 && cpu_info[2] == 0x6c65746e) { + *vendor = CpuInfo::Vendor::Intel; + } else if (cpu_info[1] == 0x68747541 && cpu_info[3] == 0x69746e65 && + cpu_info[2] == 0x444d4163) { + *vendor = CpuInfo::Vendor::AMD; + } + + if (highest_valid_id <= register_EAX_id) { + return; + } + + // EAX=1: Processor Info and Feature Bits + __cpuidex(cpu_info.data(), register_EAX_id, 0); + features_ECX = cpu_info[2]; + + // Get highest extended id + __cpuid(cpu_info.data(), 0x80000000); + highest_extended_valid_id = cpu_info[0]; + + // Retrieve CPU model name + if (highest_extended_valid_id >= static_cast(0x80000004)) { + model_name->clear(); + for (int i = 0x80000002; i <= static_cast(0x80000004); ++i) { + __cpuidex(cpu_info.data(), i, 0); + *model_name += std::string(reinterpret_cast(cpu_info.data()), sizeof(cpu_info)); + } + } + + bool zmm_enabled = false; + if (features_ECX[27]) { // OSXSAVE + // Query if the OS supports saving ZMM registers when switching contexts + int64_t xcr0 = _xgetbv(0); + zmm_enabled = (xcr0 & 0xE0) == 0xE0; + } + + if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3; + if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1; + if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2; + if (features_ECX[23]) *hardware_flags |= CpuInfo::POPCNT; + if (features_ECX[28]) *hardware_flags |= CpuInfo::AVX; + + // cpuid with EAX=7, ECX=0: Extended Features + register_EAX_id = 7; + if (highest_valid_id > register_EAX_id) { + __cpuidex(cpu_info.data(), register_EAX_id, 0); + std::bitset<32> features_EBX = 
cpu_info[1]; + + if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1; + if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2; + if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2; + if (zmm_enabled) { + if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F; + if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ; + if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD; + if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW; + if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL; + } + } + } +#endif + +#elif defined(__APPLE__) + //------------------------------ MACOS ------------------------------// + std::optional IntegerSysCtlByName(const char* name) { + size_t len = sizeof(int64_t); + int64_t data = 0; + if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) { + return data; + } + // ENOENT is the official errno value for non-existing sysctl's, + // but EINVAL and ENOTSUP have been seen in the wild. + if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) { + std::ostringstream ss; + ss << "sysctlbyname failed for '" << name << "'"; + throw ParseError(ss.str()); + } + return std::nullopt; + } + + void OsRetrieveCacheSize(std::array* cache_sizes) { + static_assert(kCacheLevels >= 3, ""); + auto c = IntegerSysCtlByName("hw.l1dcachesize"); + if (c.has_value()) { + (*cache_sizes)[0] = *c; + } + c = IntegerSysCtlByName("hw.l2cachesize"); + if (c.has_value()) { + (*cache_sizes)[1] = *c; + } + c = IntegerSysCtlByName("hw.l3cachesize"); + if (c.has_value()) { + (*cache_sizes)[2] = *c; + } + } + + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + // hardware_flags + struct SysCtlCpuFeature { + const char* name; + int64_t flag; + }; + std::vector features = { +#if defined(CPUINFO_ARCH_X86) + {"hw.optional.sse4_2", + CpuInfo::SSSE3 | CpuInfo::SSE4_1 | CpuInfo::SSE4_2 | CpuInfo::POPCNT}, + {"hw.optional.avx1_0", CpuInfo::AVX}, + {"hw.optional.avx2_0", CpuInfo::AVX2}, + 
{"hw.optional.bmi1", CpuInfo::BMI1}, + {"hw.optional.bmi2", CpuInfo::BMI2}, + {"hw.optional.avx512f", CpuInfo::AVX512F}, + {"hw.optional.avx512cd", CpuInfo::AVX512CD}, + {"hw.optional.avx512dq", CpuInfo::AVX512DQ}, + {"hw.optional.avx512bw", CpuInfo::AVX512BW}, + {"hw.optional.avx512vl", CpuInfo::AVX512VL}, +#endif + }; + for (const auto& feature : features) { + auto v = IntegerSysCtlByName(feature.name); + if (v.value_or(0)) { + *hardware_flags |= feature.flag; + } + } + + // TODO: vendor, model_name + *vendor = CpuInfo::Vendor::Unknown; + *model_name = "Unknown"; + } + +#else + //------------------------------ LINUX ------------------------------// + // Get cache size, return 0 on error + int64_t LinuxGetCacheSize(int level) { + // get cache size by sysconf() +#ifdef _SC_LEVEL1_DCACHE_SIZE + const int kCacheSizeConf[] = { + _SC_LEVEL1_DCACHE_SIZE, + _SC_LEVEL2_CACHE_SIZE, + _SC_LEVEL3_CACHE_SIZE, + }; + static_assert(sizeof(kCacheSizeConf) / sizeof(kCacheSizeConf[0]) == kCacheLevels, ""); + + errno = 0; + const int64_t cache_size = sysconf(kCacheSizeConf[level]); + if (errno == 0 && cache_size > 0) { + return cache_size; + } +#endif + + // get cache size from sysfs if sysconf() fails or not supported + const char* kCacheSizeSysfs[] = { + "/sys/devices/system/cpu/cpu0/cache/index0/size", // l1d (index1 is l1i) + "/sys/devices/system/cpu/cpu0/cache/index2/size", // l2 + "/sys/devices/system/cpu/cpu0/cache/index3/size", // l3 + }; + static_assert(sizeof(kCacheSizeSysfs) / sizeof(kCacheSizeSysfs[0]) == kCacheLevels, ""); + + std::ifstream cacheinfo(kCacheSizeSysfs[level], std::ios::in); + if (!cacheinfo) { + return 0; + } + // cacheinfo is one line like: 65536, 64K, 1M, etc. 
+ uint64_t size = 0; + char unit = '\0'; + cacheinfo >> size >> unit; + if (unit == 'K') { + size <<= 10; + } else if (unit == 'M') { + size <<= 20; + } else if (unit == 'G') { + size <<= 30; + } else if (unit != '\0') { + return 0; + } + return static_cast(size); + } + + // Helper function to parse for hardware flags from /proc/cpuinfo + // values contains a list of space-separated flags. check to see if the flags we + // care about are present. + // Returns a bitmap of flags. + int64_t LinuxParseCpuFlags(const std::string& values) { + const struct { + std::string name; + int64_t flag; + } flag_mappings[] = { +#if defined(CPUINFO_ARCH_X86) + {"ssse3", CpuInfo::SSSE3}, + {"sse4_1", CpuInfo::SSE4_1}, + {"sse4_2", CpuInfo::SSE4_2}, + {"popcnt", CpuInfo::POPCNT}, + {"avx", CpuInfo::AVX}, + {"avx2", CpuInfo::AVX2}, + {"avx512f", CpuInfo::AVX512F}, + {"avx512cd", CpuInfo::AVX512CD}, + {"avx512vl", CpuInfo::AVX512VL}, + {"avx512dq", CpuInfo::AVX512DQ}, + {"avx512bw", CpuInfo::AVX512BW}, + {"bmi1", CpuInfo::BMI1}, + {"bmi2", CpuInfo::BMI2}, +#endif + }; + const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + + int64_t flags = 0; + for (int i = 0; i < num_flags; ++i) { + if (values.find(flag_mappings[i].name) != std::string::npos) { + flags |= flag_mappings[i].flag; + } + } + return flags; + } + + void OsRetrieveCacheSize(std::array* cache_sizes) { + for (int i = 0; i < kCacheLevels; ++i) { + const int64_t cache_size = LinuxGetCacheSize(i); + if (cache_size > 0) { + (*cache_sizes)[i] = cache_size; + } + } + } + + static constexpr bool IsWhitespace(char c) { + return c == ' ' || c == '\t'; + } + + std::string TrimString(std::string value) { + size_t ltrim_chars = 0; + while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) { + ++ltrim_chars; + } + value.erase(0, ltrim_chars); + size_t rtrim_chars = 0; + while (rtrim_chars < value.size() && IsWhitespace(value[value.size() - 1 - rtrim_chars])) { + ++rtrim_chars; + } + 
value.erase(value.size() - rtrim_chars, rtrim_chars); + return value; + } + + // Read from /proc/cpuinfo + void OsRetrieveCpuInfo(int64_t* hardware_flags, CpuInfo::Vendor* vendor, + std::string* model_name) { + std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); + while (cpuinfo) { + std::string line; + std::getline(cpuinfo, line); + const size_t colon = line.find(':'); + if (colon != std::string::npos) { + const std::string name = TrimString(line.substr(0, colon - 1)); + const std::string value = TrimString(line.substr(colon + 1, std::string::npos)); + if (name.compare("flags") == 0 || name.compare("Features") == 0) { + *hardware_flags |= LinuxParseCpuFlags(value); + } else if (name.compare("model name") == 0) { + *model_name = value; + } else if (name.compare("vendor_id") == 0) { + if (value.compare("GenuineIntel") == 0) { + *vendor = CpuInfo::Vendor::Intel; + } else if (value.compare("AuthenticAMD") == 0) { + *vendor = CpuInfo::Vendor::AMD; + } + } + } + } + } +#endif // WINDOWS, MACOS, LINUX + + //============================== Arch Dependent ==============================// + +#if defined(CPUINFO_ARCH_X86) + //------------------------------ X86_64 ------------------------------// + bool ArchParseUserSimdLevel(const std::string& simd_level, int64_t* hardware_flags) { + enum { + USER_SIMD_NONE, + USER_SIMD_AVX512, + USER_SIMD_MAX, + }; + + int level = USER_SIMD_MAX; + // Parse the level + if (simd_level == "AVX512") { + level = USER_SIMD_AVX512; + } else if (simd_level == "NONE") { + level = USER_SIMD_NONE; + } else { + return false; + } + + // Disable feature as the level + if (level < USER_SIMD_AVX512) { + *hardware_flags &= ~CpuInfo::AVX512; + } + return true; + } + + void ArchVerifyCpuRequirements(const CpuInfo* ci) { +#if defined(ORC_HAVE_RUNTIME_AVX512) + if (!ci->isDetected(CpuInfo::AVX512)) { + throw ParseError("CPU does not support the Supplemental AVX512 instruction set"); + } +#else + UNUSED(ci); +#endif + } + +#endif // X86 + + } // namespace + + 
struct CpuInfo::Impl { + int64_t hardware_flags = 0; + int numCores = 0; + int64_t original_hardware_flags = 0; + Vendor vendor = Vendor::Unknown; + std::string model_name = "Unknown"; + std::array cache_sizes{}; + + Impl() { + OsRetrieveCacheSize(&cache_sizes); + OsRetrieveCpuInfo(&hardware_flags, &vendor, &model_name); + original_hardware_flags = hardware_flags; + numCores = std::max(static_cast(std::thread::hardware_concurrency()), 1); + + // parse user simd level + const auto maybe_env_var = std::getenv("ORC_USER_SIMD_LEVEL"); + std::string userSimdLevel = maybe_env_var == nullptr ? "NONE" : std::string(maybe_env_var); + std::transform(userSimdLevel.begin(), userSimdLevel.end(), userSimdLevel.begin(), + [](unsigned char c) { return std::toupper(c); }); + if (!ArchParseUserSimdLevel(userSimdLevel, &hardware_flags)) { + throw ParseError("Invalid value for ORC_USER_SIMD_LEVEL: " + userSimdLevel); + } + } + }; + + CpuInfo::~CpuInfo() = default; + + CpuInfo::CpuInfo() : impl_(new Impl) {} + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wexit-time-destructors" +#endif + + const CpuInfo* CpuInfo::getInstance() { + static CpuInfo cpu_info; + return &cpu_info; + } + +#ifdef __clang__ +#pragma clang diagnostic pop +#endif + + int64_t CpuInfo::hardwareFlags() const { + return impl_->hardware_flags; + } + + int CpuInfo::numCores() const { + return impl_->numCores <= 0 ? 
1 : impl_->numCores; + } + + CpuInfo::Vendor CpuInfo::vendor() const { + return impl_->vendor; + } + + const std::string& CpuInfo::modelName() const { + return impl_->model_name; + } + + int64_t CpuInfo::cacheSize(CacheLevel level) const { + constexpr int64_t kDefaultCacheSizes[] = { + 32 * 1024, // Level 1: 32K + 256 * 1024, // Level 2: 256K + 3072 * 1024, // Level 3: 3M + }; + static_assert(sizeof(kDefaultCacheSizes) / sizeof(kDefaultCacheSizes[0]) == kCacheLevels, ""); + + static_assert(static_cast(CacheLevel::L1) == 0, ""); + const int i = static_cast(level); + if (impl_->cache_sizes[i] > 0) return impl_->cache_sizes[i]; + if (i == 0) return kDefaultCacheSizes[0]; + // l3 may be not available, return maximum of l2 or default size + return std::max(kDefaultCacheSizes[i], impl_->cache_sizes[i - 1]); + } + + bool CpuInfo::isSupported(int64_t flags) const { + return (impl_->hardware_flags & flags) == flags; + } + + bool CpuInfo::isDetected(int64_t flags) const { + return (impl_->original_hardware_flags & flags) == flags; + } + + void CpuInfo::verifyCpuRequirements() const { + return ArchVerifyCpuRequirements(this); + } + +} // namespace orc + +#undef CPUINFO_ARCH_X86 diff --git a/c++/src/CpuInfoUtil.hh b/c++/src/CpuInfoUtil.hh new file mode 100644 index 0000000000..ad7df6a82e --- /dev/null +++ b/c++/src/CpuInfoUtil.hh @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CpuInfoUtil.hh is from Apache Arrow as of 2023-03-21 + */ + +#ifndef ORC_CPUINFOUTIL_HH +#define ORC_CPUINFOUTIL_HH + +#include +#include +#include + +namespace orc { + + /** + * CpuInfo is an interface to query for cpu information at runtime. The caller can + * ask for the sizes of the caches and what hardware features are supported. + * On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and + * /sys/devices) + */ + class CpuInfo { + public: + ~CpuInfo(); + + // x86 features + static constexpr int64_t SSSE3 = (1LL << 0); + static constexpr int64_t SSE4_1 = (1LL << 1); + static constexpr int64_t SSE4_2 = (1LL << 2); + static constexpr int64_t POPCNT = (1LL << 3); + static constexpr int64_t AVX = (1LL << 4); + static constexpr int64_t AVX2 = (1LL << 5); + static constexpr int64_t AVX512F = (1LL << 6); + static constexpr int64_t AVX512CD = (1LL << 7); + static constexpr int64_t AVX512VL = (1LL << 8); + static constexpr int64_t AVX512DQ = (1LL << 9); + static constexpr int64_t AVX512BW = (1LL << 10); + static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW; + static constexpr int64_t BMI1 = (1LL << 11); + static constexpr int64_t BMI2 = (1LL << 12); + + // Cache enums for L1 (data), L2 and L3 + enum class CacheLevel { L1 = 0, L2, L3, Last = L3 }; + + // CPU vendors + enum class Vendor { Unknown, Intel, AMD }; + + static const CpuInfo* getInstance(); + + // Returns all the flags for this cpu + int64_t hardwareFlags() const; + + // Returns the number of cores (including 
hyper-threaded) on this machine. + int numCores() const; + + // Returns the vendor of the cpu. + Vendor vendor() const; + + // Returns the model name of the cpu (e.g. Intel i7-2600) + const std::string& modelName() const; + + // Returns the size of the cache in KB at this cache level + int64_t cacheSize(CacheLevel level) const; + + /** + * Returns whether or not the given feature is enabled. + * isSupported() is true if isDetected() is also true and the feature + * wasn't disabled by the user (for example by setting the ORC_USER_SIMD_LEVEL + * environment variable). + */ + bool isSupported(int64_t flags) const; + + // Returns whether or not the given feature is available on the CPU. + bool isDetected(int64_t flags) const; + + // Determine if the CPU meets the minimum CPU requirements and if not, issue an error + // and terminate. + void verifyCpuRequirements() const; + + bool hasEfficientBmi2() const { + // BMI2 (pext, pdep) is only efficient on Intel X86 processors. + return vendor() == Vendor::Intel && isSupported(BMI2); + } + + private: + CpuInfo(); + + struct Impl; + std::unique_ptr impl_; + }; + +} // namespace orc + +#endif diff --git a/c++/src/Dispatch.hh b/c++/src/Dispatch.hh new file mode 100644 index 0000000000..489317b28a --- /dev/null +++ b/c++/src/Dispatch.hh @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_DISPATCH_HH +#define ORC_DISPATCH_HH + +#include +#include + +#include "CpuInfoUtil.hh" + +namespace orc { + enum class DispatchLevel : int { + // These dispatch levels, corresponding to instruction set features, + // are sorted in increasing order of preference. + NONE = 0, + AVX512, + MAX + }; + + /** + * A facility for dynamic dispatch according to available DispatchLevel. + * + * Typical use: + * + * static void my_function_default(...); + * static void my_function_avx512(...); + * + * struct MyDynamicFunction { + * using FunctionType = decltype(&my_function_default); + * + * static std::vector> implementations() { + * return { + * { DispatchLevel::NONE, my_function_default } + * #if defined(ORC_HAVE_RUNTIME_AVX512) + * , { DispatchLevel::AVX512, my_function_avx512 } + * #endif + * }; + * } + * }; + * + * void my_function(...) 
{ + * static DynamicDispatch dispatch; + * return dispatch.func(...); + * } + */ + template + class DynamicDispatch { + protected: + using FunctionType = typename DynamicFunction::FunctionType; + using Implementation = std::pair; + + public: + DynamicDispatch() { + Resolve(DynamicFunction::implementations()); + } + + FunctionType func = {}; + + protected: + // Use the Implementation with the highest DispatchLevel + void Resolve(const std::vector& implementations) { + Implementation cur{DispatchLevel::NONE, {}}; + + for (const auto& impl : implementations) { + if (impl.first >= cur.first && levelSupported(impl.first)) { + // Higher (or same) level than current + cur = impl; + } + } + + if (!cur.second) { + throw InvalidArgument("No appropriate implementation found"); + } + func = cur.second; + } + + private: + bool levelSupported(DispatchLevel level) const { + static const auto cpu_info = CpuInfo::getInstance(); + + switch (level) { + case DispatchLevel::NONE: + return true; + case DispatchLevel::AVX512: + case DispatchLevel::MAX: + return cpu_info->isSupported(CpuInfo::AVX512); + default: + return false; + } + } + }; +} // namespace orc + +#endif diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh index f48ce8391b..1cee59d0a6 100644 --- a/c++/src/RLEv2.hh +++ b/c++/src/RLEv2.hh @@ -166,6 +166,50 @@ namespace orc { void next(int16_t* data, uint64_t numValues, const char* notNull) override; + unsigned char readByte(); + + void setBufStart(const char* start) { + bufferStart = const_cast(start); + } + + char* getBufStart() { + return bufferStart; + } + + void setBufEnd(const char* end) { + bufferEnd = const_cast(end); + } + + char* getBufEnd() { + return bufferEnd; + } + + uint64_t bufLength() { + return bufferEnd - bufferStart; + } + + void setBitsLeft(const uint32_t bits) { + bitsLeft = bits; + } + + void setCurByte(const uint32_t byte) { + curByte = byte; + } + + uint32_t getBitsLeft() { + return bitsLeft; + } + + uint32_t getCurByte() { + return curByte; + } + + /** + 
* Most hotspot of this function locates in saving stack, so inline this function to have + * performance gain. + */ + inline void resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupLen); + private: /** * Decode the next gap and patch from 'unpackedPatch' and update the index on it. @@ -189,23 +233,10 @@ namespace orc { resetReadLongs(); } - unsigned char readByte(); - int64_t readLongBE(uint64_t bsz); int64_t readVslong(); uint64_t readVulong(); void readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - void plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs); - - void unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len); - void unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len); template uint64_t nextShortRepeats(T* data, uint64_t offset, uint64_t numValues, const char* notNull); @@ -220,17 +251,39 @@ namespace orc { const std::unique_ptr inputStream; const bool isSigned; - unsigned char firstByte; - uint64_t runLength; // Length of the current run - uint64_t runRead; // Number of returned values of the current run - const char* bufferStart; - const char* bufferEnd; + char* bufferStart; + char* bufferEnd; + uint64_t runLength; // Length of the current run + uint64_t runRead; // Number of returned values of the current run uint32_t bitsLeft; // Used by readLongs when bitSize < 8 uint32_t curByte; // Used by anything that uses readLongs DataBuffer unpackedPatch; // Used by PATCHED_BASE DataBuffer literals; 
// Values of the current run }; + + inline void RleDecoderV2::resetBufferStart(uint64_t len, bool resetBuf, uint32_t backupByteLen) { + uint64_t remainingLen = bufLength(); + int bufferLength = 0; + const void* bufferPointer = nullptr; + + if (backupByteLen != 0) { + inputStream->BackUp(backupByteLen); + } + + if (len >= remainingLen && resetBuf) { + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::resetBufferStart"); + } + } + + if (bufferPointer == nullptr) { + bufferStart += len; + } else { + bufferStart = const_cast(static_cast(bufferPointer)); + bufferEnd = bufferStart + bufferLength; + } + } } // namespace orc #endif // ORC_RLEV2_HH diff --git a/c++/src/RleDecoderV2.cc b/c++/src/RleDecoderV2.cc index 2742aef6f6..c03294ecf1 100644 --- a/c++/src/RleDecoderV2.cc +++ b/c++/src/RleDecoderV2.cc @@ -17,7 +17,12 @@ */ #include "Adaptor.hh" +#include "BpackingDefault.hh" +#if defined(ORC_HAVE_RUNTIME_AVX512) +#include "BpackingAvx512.hh" +#endif #include "Compression.hh" +#include "Dispatch.hh" #include "RLEV2Util.hh" #include "RLEv2.hh" #include "Utils.hh" @@ -32,7 +37,7 @@ namespace orc { if (!inputStream->Next(&bufferPointer, &bufferLength)) { throw ParseError("bad read in RleDecoderV2::readByte"); } - bufferStart = static_cast(bufferPointer); + bufferStart = const_cast(static_cast(bufferPointer)); bufferEnd = bufferStart + bufferLength; } @@ -66,336 +71,22 @@ namespace orc { return ret; } - void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { - switch (fbs) { - case 4: - unrolledUnpack4(data, offset, len); - return; - case 8: - unrolledUnpack8(data, offset, len); - return; - case 16: - unrolledUnpack16(data, offset, len); - return; - case 24: - unrolledUnpack24(data, offset, len); - return; - case 32: - unrolledUnpack32(data, offset, len); - return; - case 40: - unrolledUnpack40(data, offset, len); - return; - case 48: - unrolledUnpack48(data, offset, len); - return; - case 
56: - unrolledUnpack56(data, offset, len); - return; - case 64: - unrolledUnpack64(data, offset, len); - return; - default: - // Fallback to the default implementation for deprecated bit size. - plainUnpackLongs(data, offset, len, fbs); - return; - } - } - - void RleDecoderV2::unrolledUnpack4(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Make sure bitsLeft is 0 before the loop. bitsLeft can only be 0, 4, or 8. - while (bitsLeft > 0 && curIdx < offset + len) { - bitsLeft -= 4; - data[curIdx++] = (curByte >> bitsLeft) & 15; - } - if (curIdx == offset + len) return; - - // Exhaust the buffer - uint64_t numGroups = (offset + len - curIdx) / 2; - numGroups = std::min(numGroups, static_cast(bufferEnd - bufferStart)); - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - uint32_t localByte; - for (uint64_t i = 0; i < numGroups; ++i) { - localByte = *buffer++; - data[curIdx] = (localByte >> 4) & 15; - data[curIdx + 1] = localByte & 15; - curIdx += 2; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd' - curByte = readByte(); - bitsLeft = 8; - } - } - - void RleDecoderV2::unrolledUnpack8(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = bufferEnd - bufferStart; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - data[curIdx++] = *buffer++; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // readByte() will update 'bufferStart' and 'bufferEnd'. 
- data[curIdx++] = readByte(); - } - } - - void RleDecoderV2::unrolledUnpack16(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 2; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint16_t b0, b1; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - buffer += 2; - data[curIdx++] = (b0 << 8) | b1; - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - data[curIdx++] = (b0 << 8) | b1; - } - } - - void RleDecoderV2::unrolledUnpack24(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 3; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - buffer += 3; - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - bufferStart += bufferNum * 3; - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - data[curIdx++] = static_cast((b0 << 16) | (b1 << 8) | b2); - } - } - - void RleDecoderV2::unrolledUnpack32(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 4; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint32_t b0, b1, b2, b3; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - buffer += 4; - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - data[curIdx++] = static_cast((b0 << 24) | (b1 << 16) | (b2 << 8) | b3); - } - } + struct UnpackDynamicFunction { + using FunctionType = decltype(&BitUnpack::readLongs); - void RleDecoderV2::unrolledUnpack40(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 5; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - buffer += 5; - data[curIdx++] = - static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - data[curIdx++] = static_cast((b0 << 32) | (b1 << 24) | (b2 << 16) | (b3 << 8) | b4); + static std::vector> implementations() { +#if defined(ORC_HAVE_RUNTIME_AVX512) + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}, + {DispatchLevel::AVX512, BitUnpackAVX512::readLongs}}; +#else + return {{DispatchLevel::NONE, BitUnpackDefault::readLongs}}; +#endif } - } + }; - void RleDecoderV2::unrolledUnpack48(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 6; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - buffer += 6; - data[curIdx++] = static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | - (b4 << 8) | b5); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. 
- b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - data[curIdx++] = - static_cast((b0 << 40) | (b1 << 32) | (b2 << 24) | (b3 << 16) | (b4 << 8) | b5); - } - } - - void RleDecoderV2::unrolledUnpack56(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 7; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6; - // Avoid updating 'bufferStart' inside the loop. - const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - buffer += 7; - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - data[curIdx++] = static_cast((b0 << 48) | (b1 << 40) | (b2 << 32) | (b3 << 24) | - (b4 << 16) | (b5 << 8) | b6); - } - } - - void RleDecoderV2::unrolledUnpack64(int64_t* data, uint64_t offset, uint64_t len) { - uint64_t curIdx = offset; - while (curIdx < offset + len) { - // Exhaust the buffer - int64_t bufferNum = (bufferEnd - bufferStart) / 8; - bufferNum = std::min(bufferNum, static_cast(offset + len - curIdx)); - uint64_t b0, b1, b2, b3, b4, b5, b6, b7; - // Avoid updating 'bufferStart' inside the loop. 
- const auto* buffer = reinterpret_cast(bufferStart); - for (int i = 0; i < bufferNum; ++i) { - b0 = static_cast(*buffer); - b1 = static_cast(*(buffer + 1)); - b2 = static_cast(*(buffer + 2)); - b3 = static_cast(*(buffer + 3)); - b4 = static_cast(*(buffer + 4)); - b5 = static_cast(*(buffer + 5)); - b6 = static_cast(*(buffer + 6)); - b7 = static_cast(*(buffer + 7)); - buffer += 8; - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - bufferStart = reinterpret_cast(buffer); - if (curIdx == offset + len) return; - - // One of the following readByte() will update 'bufferStart' and 'bufferEnd'. - b0 = readByte(); - b1 = readByte(); - b2 = readByte(); - b3 = readByte(); - b4 = readByte(); - b5 = readByte(); - b6 = readByte(); - b7 = readByte(); - data[curIdx++] = static_cast((b0 << 56) | (b1 << 48) | (b2 << 40) | (b3 << 32) | - (b4 << 24) | (b5 << 16) | (b6 << 8) | b7); - } - } - - void RleDecoderV2::plainUnpackLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { - for (uint64_t i = offset; i < (offset + len); i++) { - uint64_t result = 0; - uint64_t bitsLeftToRead = fbs; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast(result); - } + void RleDecoderV2::readLongs(int64_t* data, uint64_t offset, uint64_t len, uint64_t fbs) { + static DynamicDispatch dispatch; + return dispatch.func(this, data, offset, len, fbs); } RleDecoderV2::RleDecoderV2(std::unique_ptr input, bool _isSigned, @@ -404,10 +95,10 @@ namespace orc { inputStream(std::move(input)), isSigned(_isSigned), firstByte(0), - runLength(0), - runRead(0), 
bufferStart(nullptr), bufferEnd(bufferStart), + runLength(0), + runRead(0), bitsLeft(0), curByte(0), unpackedPatch(pool, 0), diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index ead2f5e4a0..b04055366c 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -23,6 +23,10 @@ include_directories( set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}") +if(BUILD_ENABLE_AVX512) + set(SIMD_TEST_SRCS TestRleVectorDecoder.cc) +endif(BUILD_ENABLE_AVX512) + add_executable (orc-test MemoryInputStream.cc MemoryOutputStream.cc @@ -58,6 +62,7 @@ add_executable (orc-test TestTimezone.cc TestType.cc TestWriter.cc + ${SIMD_TEST_SRCS} ) target_link_libraries (orc-test diff --git a/c++/test/TestRleVectorDecoder.cc b/c++/test/TestRleVectorDecoder.cc new file mode 100644 index 0000000000..352b883485 --- /dev/null +++ b/c++/test/TestRleVectorDecoder.cc @@ -0,0 +1,561 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "MemoryOutputStream.hh" +#include "RLEv2.hh" +#include "wrap/gtest-wrapper.h" +#include "wrap/orc-proto-wrapper.hh" + +#ifdef __clang__ +DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations") +#endif + +namespace orc { + using ::testing::TestWithParam; + using ::testing::Values; + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + const char finish = '#'; + std::string flags = "-\\|/"; + + class RleV2BitUnpackAvx512Test : public TestWithParam { + virtual void SetUp(); + + protected: + bool alignBitpacking; + std::unique_ptr getEncoder(RleVersion version, MemoryOutputStream& memStream, + bool isSigned); + + void runExampleTest(int64_t* inputData, uint64_t inputLength, unsigned char* expectedOutput, + uint64_t outputLength); + + void runTest(RleVersion version, uint64_t numValues, int64_t start, int64_t delta, bool random, + bool isSigned, uint8_t bitWidth, uint64_t blockSize = 0, uint64_t numNulls = 0); + }; + + void vectorDecodeAndVerify(RleVersion version, const MemoryOutputStream& memStream, int64_t* data, + uint64_t numValues, const char* notNull, uint64_t blockSize, + bool isSinged) { + std::unique_ptr decoder = + createRleDecoder(std::unique_ptr(new SeekableArrayInputStream( + memStream.getData(), memStream.getLength(), blockSize)), + isSinged, version, *getDefaultPool(), getDefaultReaderMetrics()); + + int64_t* decodedData = new int64_t[numValues]; + decoder->next(decodedData, numValues, notNull); + + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + EXPECT_EQ(data[i], decodedData[i]); + } + } + + delete[] decodedData; + } + + void RleV2BitUnpackAvx512Test::SetUp() { + alignBitpacking = GetParam(); + } + + void generateDataForBits(uint64_t numValues, int64_t start, int64_t delta, bool random, + int64_t* data, uint8_t bitWidth, uint64_t numNulls = 0, + char* notNull = nullptr) { + int64_t max = pow(2, bitWidth); + if (numNulls != 0 && notNull != nullptr) { + memset(notNull, 1, numValues); + while 
(numNulls > 0) { + uint64_t pos = static_cast(std::rand()) % numValues; + if (notNull[pos]) { + notNull[pos] = static_cast(0); + --numNulls; + } + } + } + + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + if (!random) { + data[i] = start + delta * static_cast(i); + } else { + data[i] = std::rand() % max; + } + } + } + } + + void printBar(const char* testName, int64_t offset, int64_t total) { + int64_t n = offset * 100 / total; + std::string progress(100, '.'); + for (int i = 0; i < n; i++) { + progress[i] = finish; + } + + std::string f, p; + if (n == 100) { + f = "OK"; + p = "100%"; + } else { + f = flags[n % 4]; + p = std::to_string(n) + '%'; + } + std::cout << std::unitbuf << testName << ":" << '[' << f << ']' << '[' << progress << ']' << '[' + << p << "]" << '\r'; + if (n >= 100) { + std::cout << std::endl; + } + } + + std::unique_ptr RleV2BitUnpackAvx512Test::getEncoder(RleVersion version, + MemoryOutputStream& memStream, + bool isSigned) { + MemoryPool* pool = getDefaultPool(); + + return createRleEncoder(std::unique_ptr(new BufferedOutputStream( + *pool, &memStream, 500 * 1024, 1024, nullptr)), + isSigned, version, *pool, alignBitpacking); + } + + void RleV2BitUnpackAvx512Test::runTest(RleVersion version, uint64_t numValues, int64_t start, + int64_t delta, bool random, bool isSigned, + uint8_t bitWidth, uint64_t blockSize, uint64_t numNulls) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + + std::unique_ptr encoder = getEncoder(version, memStream, isSigned); + + char* notNull = numNulls == 0 ? 
nullptr : new char[numValues]; + int64_t* data = new int64_t[numValues]; + generateDataForBits(numValues, start, delta, random, data, bitWidth, numNulls, notNull); + encoder->add(data, numValues, notNull); + encoder->flush(); + + vectorDecodeAndVerify(version, memStream, data, numValues, notNull, blockSize, isSigned); + delete[] data; + delete[] notNull; + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_1bit) { + uint8_t bitWidth = 1; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("1bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("1bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_2bit) { + uint8_t bitWidth = 2; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("2bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("2bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_3bit) { + uint8_t bitWidth = 3; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("3bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, 
false, bitWidth, blockSize); + } + printBar("3bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_4bit) { + uint8_t bitWidth = 4; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("4bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("4bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_5bit) { + uint8_t bitWidth = 5; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("5bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("5bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_6bit) { + uint8_t bitWidth = 6; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("6bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("6bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_7bit) { + uint8_t bitWidth = 7; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + 
runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("7bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("7bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_9bit) { + uint8_t bitWidth = 9; + + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("9bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("9bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_10bit) { + uint8_t bitWidth = 10; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("10bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("10bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_11bit) { + uint8_t bitWidth = 11; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("11bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; 
dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("11bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_12bit) { + uint8_t bitWidth = 12; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("12bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("12bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_13bit) { + uint8_t bitWidth = 13; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("13bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("13bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_14bit) { + uint8_t bitWidth = 14; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("14bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("14bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_15bit) { + uint8_t bitWidth = 15; 
+ for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("15bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("15bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_16bit) { + uint8_t bitWidth = 16; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("16bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("16bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_17bit) { + uint8_t bitWidth = 17; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("17bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("17bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_18bit) { + uint8_t bitWidth = 18; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("18bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; 
blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("18bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_19bit) { + uint8_t bitWidth = 19; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("19bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("19bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_20bit) { + uint8_t bitWidth = 20; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("20bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("20bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_21bit) { + uint8_t bitWidth = 21; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("21bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("21bit Test 2nd Part", blockSize, 10000); + } + } + + 
TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_22bit) { + uint8_t bitWidth = 22; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("22bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("22bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_23bit) { + uint8_t bitWidth = 23; + runTest(RleVersion_2, 3277, 0, 0, true, false, bitWidth, 108); + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("23bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("23bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_24bit) { + uint8_t bitWidth = 24; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("24bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("24bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_26bit) { + uint8_t bitWidth = 26; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 
0, 0, true, false, bitWidth, blockSize); + printBar("26bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("26bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_28bit) { + uint8_t bitWidth = 28; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("28bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("28bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_30bit) { + uint8_t bitWidth = 30; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("30bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("30bit Test 2nd Part", blockSize, 10000); + } + } + + TEST_P(RleV2BitUnpackAvx512Test, RleV2_basic_vector_decode_32bit) { + uint8_t bitWidth = 32; + for (uint64_t blockSize = 1; blockSize <= 10000; blockSize++) { + runTest(RleVersion_2, 10240, 0, 0, true, false, bitWidth, blockSize); + printBar("32bit Test 1st Part", blockSize, 10000); + } + + for (uint64_t blockSize = 1000; blockSize <= 10000; blockSize += 1000) { + for (uint64_t dataSize = 1000; dataSize <= 70000; dataSize += 1000) { + 
runTest(RleVersion_2, dataSize, 0, 0, true, false, bitWidth, blockSize); + } + printBar("32bit Test 2nd Part", blockSize, 10000); + } + } + + INSTANTIATE_TEST_SUITE_P(OrcTest, RleV2BitUnpackAvx512Test, Values(true, false)); +} // namespace orc diff --git a/cmake_modules/ConfigSimdLevel.cmake b/cmake_modules/ConfigSimdLevel.cmake new file mode 100644 index 0000000000..86608e63b5 --- /dev/null +++ b/cmake_modules/ConfigSimdLevel.cmake @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
# Decide whether the RLE bit-unpacking code can be built with AVX512, and set
# ORC_SIMD_LEVEL, CMAKE_CXX_FLAGS and ORC_HAVE_RUNTIME_AVX512 accordingly.
# CheckCXXSourceCompiles provides check_cxx_source_compiles (used below);
# the original included CheckCXXSourceRuns, which does not define it.
include(CheckCXXSourceCompiles)
include(CheckCXXCompilerFlag)

message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}")

# User-overridable compile-time SIMD level; resolved to AVX512 or NONE below.
if(NOT DEFINED ORC_SIMD_LEVEL)
  set(ORC_SIMD_LEVEL
      "DEFAULT"
      CACHE STRING "Compile time SIMD optimization level")
endif()

# Only x86/amd64 processors are candidates for the AVX512 code path.
if(NOT DEFINED ORC_CPU_FLAG)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|X86|x86|i[3456]86|x64")
    set(ORC_CPU_FLAG "x86")
  else()
    message(STATUS "Unsupported system processor for SIMD optimization")
  endif()
endif()

# Check architecture-specific compiler flags (msvc/gcc/clang on x86/amd64).
if("${ORC_CPU_FLAG}" STREQUAL "x86")
  if(MSVC)
    set(ORC_AVX512_FLAG "/arch:AVX512")
    check_cxx_compiler_flag("${ORC_AVX512_FLAG}" COMPILER_SUPPORT_AVX512)
  else()
    # "-march=native" selects all instruction subsets supported by the
    # *compiling* machine; "-mtune=native" tunes for it within that set.
    # NOTE(review): on a build host without AVX512 this flag will not emit
    # AVX512 code even though the explicit -mavx512* probe below succeeds --
    # confirm that building on AVX512-capable hosts is the intended model.
    set(ORC_AVX512_FLAG "-march=native -mtune=native")
    check_cxx_compiler_flag("-mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw"
                            COMPILER_SUPPORT_AVX512)
  endif()

  if(MINGW)
    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
    message(STATUS "Disable AVX512 support on MINGW for now")
  else()
    # Check that an AVX512 intrinsic actually compiles under ORC_AVX512_FLAG.
    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${ORC_AVX512_FLAG}")
    check_cxx_source_compiles("
      #ifdef _MSC_VER
      #include <intrin.h>
      #else
      #include <immintrin.h>
      #endif

      int main() {
        __m512i mask = _mm512_set1_epi32(0x1);
        char out[64]; /* _mm512_storeu_si512 stores 512 bits = 64 bytes */
        _mm512_storeu_si512(out, mask);
        return 0;
      }"
      CXX_SUPPORTS_AVX512)
    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
  endif()

  if(CXX_SUPPORTS_AVX512)
    # /proc/cpuinfo exists only on Linux; elsewhere these commands would fail
    # and leave CPU_HAS_AVX512 empty, so AVX512 stays disabled either way --
    # the guard just avoids the failed-process noise.
    # NOTE(review): as a consequence AVX512 is never enabled on Windows/macOS
    # build hosts even when the CPU supports it -- confirm this is intended.
    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
      execute_process(COMMAND grep flags /proc/cpuinfo
                      COMMAND head -1
                      OUTPUT_VARIABLE flags_ver)
      message(STATUS "CPU ${flags_ver}")
      execute_process(COMMAND grep avx512f /proc/cpuinfo
                      COMMAND head -1
                      OUTPUT_VARIABLE CPU_HAS_AVX512)
    endif()
  endif()

  # Enable AVX512 only when the CPU advertises avx512f AND the compiler passed
  # both the flag check and the intrinsic compile probe.
  if(CPU_HAS_AVX512 AND CXX_SUPPORTS_AVX512 AND COMPILER_SUPPORT_AVX512)
    message(STATUS "Enabled the AVX512 for RLE bit-unpacking")
    set(ORC_SIMD_LEVEL "AVX512")
    add_definitions(-DORC_HAVE_RUNTIME_AVX512)
  else()
    message(STATUS "WARNING: AVX512 required but compiler doesn't support it, failed to enable AVX512.")
    set(BUILD_ENABLE_AVX512 OFF)
  endif()
  if("${ORC_SIMD_LEVEL}" STREQUAL "DEFAULT")
    set(ORC_SIMD_LEVEL "NONE")
  endif()

  if("${ORC_SIMD_LEVEL}" STREQUAL "AVX512")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ORC_AVX512_FLAG}")
    message(STATUS "ORC_HAVE_RUNTIME_AVX512 defined, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}")
  else()
    message(STATUS "ORC_HAVE_RUNTIME_AVX512 not defined, ORC_SIMD_LEVEL: ${ORC_SIMD_LEVEL}")
  endif()
endif()