diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 6f379f9e01d21..39765f437082d 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -231,6 +231,26 @@ if (APPLE)
   set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -stdlib=libc++")
 endif()
 
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+  CHECK_CXX_SOURCE_COMPILES("
+    #define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
+    asm(\".arch_extension crc\");
+    unsigned int foo(unsigned int ret) {
+      CRC32CX(ret, 0);
+      return ret;
+    }
+    int main() { foo(0); }" ARROW_HAVE_ARMCE)
+  if (ARROW_HAVE_ARMCE)
+    message(STATUS " aarch64 CRC32 supported")
+  endif()
+
+  CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" HAVE_ARMV8_CRC_INTRINSICS)
+  if (HAVE_ARMV8_CRC_INTRINSICS)
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
+    message(STATUS " aarch64 CRC32 INTRINSICS supported")
+  endif()
+endif()
+
 # ----------------------------------------------------------------------
 # Setup Gold linker, if available. Code originally from Apache Kudu
diff --git a/cpp/src/arrow/util/armce-util.h b/cpp/src/arrow/util/armce-util.h
new file mode 100644
index 0000000000000..cb6b1a8b5f1e4
--- /dev/null
+++ b/cpp/src/arrow/util/armce-util.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_ARMCE_UTIL_H
+#define ARROW_UTIL_ARMCE_UTIL_H
+
+#include "my_config.h"
+
+namespace arrow {
+
+#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARMCE)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+static inline uint32_t crc32c_runtime_check(void)
+{
+  unsigned long auxv = getauxval(AT_HWCAP);
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+#ifdef HAVE_ARMV8_CRC_INTRINSICS
+// Compiler intrinsics.
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define ARMCE_crc32_u8 __crc32cb
+#define ARMCE_crc32_u16 __crc32ch
+#define ARMCE_crc32_u32 __crc32cw
+#define ARMCE_crc32_u64 __crc32cd
+
+#else
+// Request crc extension capabilities from the assembler
+asm(".arch_extension crc");
+
+// Define our own implementations of the intrinsics instead.
+static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t value) {
+  __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
+  return crc;
+}
+
+static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t value) {
+  __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
+  return crc;
+}
+
+static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t value) {
+  __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
+  return crc;
+}
+
+static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t value) {
+  uint64_t result = crc;
+  __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(result):[v]"r"(value));
+  return static_cast<uint32_t>(result);
+}
+#endif  // HAVE_ARMV8_CRC_INTRINSICS
+
+#else
+
+static inline uint32_t crc32c_runtime_check(void) {
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u8(uint32_t, uint8_t) {
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u16(uint32_t, uint16_t) {
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u32(uint32_t, uint32_t) {
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u64(uint32_t, uint64_t) {
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+#endif  // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARMCE)
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_ARMCE_UTIL_H
diff --git a/cpp/src/arrow/util/config.in.cmake b/cpp/src/arrow/util/config.in.cmake
new file mode 100644
index 0000000000000..3b2fa6bafeb44
--- /dev/null
+++ b/cpp/src/arrow/util/config.in.cmake
@@ -0,0 +1,32 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+   This module sets the following variables in your project::
+
+   arrow_FOUND - true if arrow found on the system */
+
+/* my_config.h file expanded by CMake for the build */
+
+
+#ifndef MY_CONFIG_H
+#define MY_CONFIG_H
+
+/* Support Armv8 CRC instructions */
+#cmakedefine ARROW_HAVE_ARMCE 1
+#cmakedefine HAVE_ARMV8_CRC_INTRINSICS 1
+
+#endif /* MY_CONFIG_H */
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 3f7e4048bdf10..edbaad6d86de2 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -26,25 +26,34 @@
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/sse-util.h"
+#include "arrow/util/armce-util.h"
 
 namespace arrow {
 
+enum CompHashMode {
+  USE_DEFAULT = 0,
+  USE_SSE42 = 1,
+  USE_ARMCRC = 2,
+};
+
 /// Utility class to compute hash values.
 class HashUtil {
  public:
 #ifdef ARROW_HAVE_SSE4_2
-  static constexpr bool have_hardware_crc32 = true;
+  static constexpr CompHashMode have_hardware_crc32 = USE_SSE42;
+#elif ARROW_HAVE_ARMCE
+  static constexpr CompHashMode have_hardware_crc32 = USE_ARMCRC;
 #else
-  static constexpr bool have_hardware_crc32 = false;
+  static constexpr CompHashMode have_hardware_crc32 = USE_DEFAULT;
 #endif
 
-  /// Compute the Crc32 hash for data using SSE4 instructions.  The input hash
+  /// Compute the Crc32 hash for data using SSE4/Armce instructions.  The input hash
   /// parameter is the current hash/seed value.
-  /// This should only be called if SSE is supported.
+  /// This should only be called if SSE/Armce is supported.
   /// This is ~4x faster than Fnv/Boost Hash.
   /// TODO: crc32 hashes with different seeds do not result in different hash functions.
   /// The resulting hashes are correlated.
-  static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) {
+  static uint32_t CrcHashSSE(const void* data, int32_t nbytes, uint32_t hash) {
     const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
     const uint8_t* end = p + nbytes;
 
@@ -67,6 +76,29 @@ class HashUtil {
     return hash;
   }
 
+  static uint32_t CrcHashARMCE(const void* data, int32_t nbytes, uint32_t hash) {
+    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
+    const uint8_t* end = p + nbytes;
+
+    while (p <= end - 8) {
+      hash = ARMCE_crc32_u64(hash, *reinterpret_cast<const uint64_t*>(p));
+      p += 8;
+    }
+    while (p <= end - 4) {
+      hash = ARMCE_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
+      p += 4;
+    }
+    while (p < end) {
+      hash = ARMCE_crc32_u8(hash, *p);
+      ++p;
+    }
+
+    // The lower half of the CRC hash has poor uniformity, so swap the halves
+    // for anyone who only uses the first several bits of the hash.
+    hash = (hash << 16) | (hash >> 16);
+    return hash;
+  }
+
   /// A variant of CRC32 hashing that computes two independent running CRCs
   /// over interleaved halves of the input, giving out a 64-bit integer.
   /// The result's quality should be improved by a finalization step.
   ///
   /// In addition to producing more bits of output, this should be twice
   /// faster than CrcHash on CPUs that can overlap several independent
   /// CRC computations.
-  static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) {
+  static uint64_t DoubleCrcHashSSE(const void* data, int32_t nbytes, uint64_t hash) {
     const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
 
     uint32_t h1 = static_cast<uint32_t>(hash >> 32);
@@ -115,6 +147,47 @@ class HashUtil {
     return (static_cast<uint64_t>(h1) << 32) + h2;
   }
 
+  static uint64_t DoubleCrcHashARMCE(const void* data, int32_t nbytes, uint64_t hash) {
+    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
+
+    uint32_t h1 = static_cast<uint32_t>(hash >> 32);
+    uint32_t h2 = static_cast<uint32_t>(hash);
+
+    while (nbytes >= 16) {
+      h1 = ARMCE_crc32_u64(h1, *reinterpret_cast<const uint64_t*>(p));
+      h2 = ARMCE_crc32_u64(h2, *reinterpret_cast<const uint64_t*>(p + 8));
+      nbytes -= 16;
+      p += 16;
+    }
+    if (nbytes >= 8) {
+      h1 = ARMCE_crc32_u32(h1, *reinterpret_cast<const uint32_t*>(p));
+      h2 = ARMCE_crc32_u32(h2, *reinterpret_cast<const uint32_t*>(p + 4));
+      nbytes -= 8;
+      p += 8;
+    }
+    if (nbytes >= 4) {
+      h1 = ARMCE_crc32_u16(h1, *reinterpret_cast<const uint16_t*>(p));
+      h2 = ARMCE_crc32_u16(h2, *reinterpret_cast<const uint16_t*>(p + 2));
+      nbytes -= 4;
+      p += 4;
+    }
+    switch (nbytes) {
+      case 3:
+        h1 = ARMCE_crc32_u8(h1, p[3]);
+      case 2:
+        h2 = ARMCE_crc32_u8(h2, p[2]);
+      case 1:
+        h1 = ARMCE_crc32_u8(h1, p[1]);
+      case 0:
+        break;
+      default:
+        assert(0);
+    }
+
+    // A finalization step is recommended to mix up the result's bits
+    return (static_cast<uint64_t>(h1) << 32) + h2;
+  }
+
   /// CrcHash() specialized for 1-byte data
   static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
     const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
@@ -251,10 +324,14 @@ class HashUtil {
     return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
   }
 
-  // With sse4.2
-  template <bool use_sse42>
+  // Hash template
+  template <CompHashMode use_hw>
   static inline int Hash(const void* data, int32_t bytes, uint32_t seed);
 
+  // Double Hash template
+  template <CompHashMode use_hw>
+  static inline int DoubleHash(const void* data, int32_t bytes, uint32_t seed);
+
   /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
   static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
@@ -288,18 +365,49 @@ class HashUtil {
   }
 };
 
-// With sse4.2
+// Hash specialized for Arm Crc32
+template <>
+inline int HashUtil::Hash<USE_ARMCRC>(const void* data, int32_t bytes, uint32_t seed) {
+  // Need run time check
+  if (crc32c_runtime_check())
+    return static_cast<int>(HashUtil::CrcHashARMCE(data, bytes, seed));
+  else
+    // Fall back to Murmur
+    return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
+}
+
+
+// Hash specialized for sse4.2
 template <>
-inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
-  return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
+inline int HashUtil::Hash<USE_SSE42>(const void* data, int32_t bytes, uint32_t seed) {
+  return static_cast<int>(HashUtil::CrcHashSSE(data, bytes, seed));
 }
 
-// Non-sse4 hash
+// Hash specialized for default murmur hash
 template <>
-inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
+inline int HashUtil::Hash<USE_DEFAULT>(const void* data, int32_t bytes, uint32_t seed) {
   return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
 }
 
+// DoubleHash specialized for Arm Crc32
+template <>
+inline int HashUtil::DoubleHash<USE_ARMCRC>(const void* data, int32_t bytes, uint32_t seed) {
+  // Need run time check
+  if (crc32c_runtime_check())
+    return static_cast<int>(HashUtil::DoubleCrcHashARMCE(data, bytes, seed));
+  else
+    // Fall back to Murmur
+    return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
+}
+
+
+// DoubleHash specialized for sse4.2
+template <>
+inline int HashUtil::DoubleHash<USE_SSE42>(const void* data, int32_t bytes, uint32_t seed) {
+  return static_cast<int>(HashUtil::DoubleCrcHashSSE(data, bytes, seed));
+}
+
+
 }  // namespace arrow
 
 #endif  // ARROW_UTIL_HASH_UTIL_H
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index 24325e81eb4fd..b434c17a12df0 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -155,16 +155,21 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
     return n ^ hx ^ hy;
   }
 
-  if (HashUtil::have_hardware_crc32) {
-    // DoubleCrcHash is faster that Murmur2.
-    auto h = HashUtil::DoubleCrcHash(data, static_cast<int32_t>(length), AlgNum);
+  // DoubleCrcHash is faster than Murmur2.
+  if (HashUtil::have_hardware_crc32 == USE_SSE42) {
+    // SSE4.
+    auto h = HashUtil::DoubleHash<USE_SSE42>(data, static_cast<int32_t>(length), AlgNum);
+    return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
+  } else if (HashUtil::have_hardware_crc32 == USE_ARMCRC) {
+    // Armce.
+    auto h = HashUtil::DoubleHash<USE_ARMCRC>(data, static_cast<int32_t>(length), AlgNum);
     return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
   } else {
     // Fall back on 64-bit Murmur2 for longer strings.
     // It has decent speed for medium-sized strings.  There may be faster
     // hashes on long strings such as xxHash, but that may not matter much
     // for the typical length distribution of hash keys.
-    return HashUtil::MurmurHash2_64(data, static_cast<int32_t>(length), AlgNum);
+    return HashUtil::Hash<USE_DEFAULT>(data, static_cast<int32_t>(length), AlgNum);
   }
 }
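
Reviewer note: for anyone who wants to sanity-check the aarch64 path outside of the Arrow build, below is a minimal standalone probe. It is not part of this patch; the file name, messages, and the g++ invocation are illustrative. It exercises the same two ingredients armce-util.h relies on: the getauxval(AT_HWCAP) runtime check and the CRC32C instructions reached via the arm_acle.h intrinsics.

// crc_probe.cc -- hypothetical standalone sketch, not part of this patch.
// Build on an aarch64 Linux host with: g++ -march=armv8-a+crc crc_probe.cc
#include <cstdint>
#include <cstdio>

#if defined(__aarch64__) && defined(__linux__)
#include <sys/auxv.h>   // getauxval, AT_HWCAP
#include <asm/hwcap.h>  // HWCAP_CRC32
#include <arm_acle.h>   // __crc32cb and friends

int main() {
  // Same kernel capability probe as crc32c_runtime_check() in the patch.
  if ((getauxval(AT_HWCAP) & HWCAP_CRC32) == 0) {
    std::printf("CRC32 extension not reported by the kernel; Arrow would fall back to Murmur.\n");
    return 0;
  }
  // Feed a few bytes through the CRC32C intrinsic one byte at a time -- the
  // same instruction family CrcHashARMCE uses in its 8/4/1-byte loops.
  const unsigned char data[] = {'a', 'r', 'r', 'o', 'w'};
  uint32_t crc = 0xFFFFFFFFu;
  for (unsigned char c : data) {
    crc = __crc32cb(crc, c);
  }
  std::printf("crc32c(\"arrow\") = 0x%08x\n", crc);
  return 0;
}
#else
int main() {
  std::printf("This probe only runs on aarch64 Linux.\n");
  return 0;
}
#endif

The probe mirrors the patch's two-level gating: the -march=armv8-a+crc build flag corresponds to the compile-time HAVE_ARMV8_CRC_INTRINSICS/ARROW_HAVE_ARMCE checks, while the HWCAP test corresponds to the crc32c_runtime_check() call that guards the CRC path at run time.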