Skip to content

Commit

Permalink
Rebase the patch to master
Browse files Browse the repository at this point in the history
Change-Id: I3044a89bae619968e340636996f014a0134f1030
Signed-off-by: Yuqi Gu <[email protected]>
  • Loading branch information
guyuqi committed Nov 25, 2018
1 parent e9c1317 commit bdcf7b7
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 17 deletions.
20 changes: 20 additions & 0 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,26 @@ if (APPLE)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -stdlib=libc++")
endif()

if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
CHECK_CXX_SOURCE_COMPILES("
#define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
asm(\".arch_extension crc\");
unsigned int foo(unsigned int ret) {
CRC32CX(ret, 0);
return ret;
}
int main() { foo(0); }" ARROW_HAVE_ARMCE)
if (ARROW_HAVE_ARMCE)
message(STATUS " aarch64 CRC32 supported")
endif()

CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" HAVE_ARMV8_CRC_INTRINSICS)
if (HAVE_ARMV8_CRC_INTRINSICS)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")
message(STATUS " aarch64 CRC32 INTRINSICS supported")
endif()
endif()

# ----------------------------------------------------------------------
# Setup Gold linker, if available. Code originally from Apache Kudu

Expand Down
106 changes: 106 additions & 0 deletions cpp/src/arrow/util/armce-util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_UTIL_ARMCE_UTIL_H
#define ARROW_UTIL_ARMCE_UTIL_H

#include "my_config.h"

namespace arrow {

#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARMCE)

#include <sys/auxv.h>
#include <asm/hwcap.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
static inline uint32_t crc32c_runtime_check(void)
{
unsigned long auxv = getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
}

#ifdef HAVE_ARMV8_CRC_INTRINSICS
// compiler intrinsics.
#include <arm_acle.h>
#include <arm_neon.h>

#define ARMCE_crc32_u8 __crc32cb
#define ARMCE_crc32_u16 __crc32ch
#define ARMCE_crc32_u32 __crc32cw
#define ARMCE_crc32_u64 __crc32cd

#else
// Request crc extension capabilities from the assembler
asm(".arch_extension crc");

// define our own implementations of the intrinsics instead.
static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t value) {
__asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
return crc;
}

static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t value) {
__asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
return crc;
}

static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t value) {
__asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value));
return crc;
}

static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t value) {
uint64_t result = crc;
__asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value));
return static_cast<uint32_t>(result);
}
#endif // HAVE_ARMV8_CRC_INTRINSICS

#else

static inline uint32_t crc32c_runtime_check(void) {
DCHECK(false) << "Arm crc32 support is not enabled";
return 0;
}

static inline uint32_t ARMCE_crc32_u8(uint32_t, uint8_t) {
DCHECK(false) << "Arm crc32 support is not enabled";
return 0;
}

static inline uint32_t ARMCE_crc32_u16(uint32_t, uint16_t) {
DCHECK(false) << "Arm crc32 is not enabled";
return 0;
}

static inline uint32_t ARMCE_crc32_u32(uint32_t, uint32_t) {
DCHECK(false) << "Arm crc32 support is not enabled";
return 0;
}

static inline uint32_t ARMCE_crc32_u64(uint32_t, uint64_t) {
DCHECK(false) << "Arm crc32 support is not enabled";
return 0;
}

#endif // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARMCE)

} // namespace arrow

#endif // ARROW_UTIL_ARMCE_UTIL_H
32 changes: 32 additions & 0 deletions cpp/src/arrow/util/config.in.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/* Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.

This module sets the following variables in your project::

arrow_FOUND - true if arrow found on the system */

/* my_config.h file expanded by Cmake for build */


#ifndef MY_CONFIG_H
#define MY_CONFIG_H

/* Support Armv8 CRC instructions */
#cmakedefine ARROW_HAVE_ARMCE 1
#cmakedefine HAVE_ARMV8_CRC_INTRINSICS 1

#endif /* MY_CONFIG_H */
134 changes: 121 additions & 13 deletions cpp/src/arrow/util/hash-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,34 @@
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/sse-util.h"
#include "arrow/util/armce-util.h"

namespace arrow {

enum CompHashMode {
USE_DEFAULT = 0,
USE_SSE42 = 1,
USE_ARMCRC = 2,
};

/// Utility class to compute hash values.
class HashUtil {
public:
#ifdef ARROW_HAVE_SSE4_2
static constexpr bool have_hardware_crc32 = true;
static constexpr CompHashMode have_hardware_crc32 = USE_SSE42;
#elif ARROW_HAVE_ARMCE
static constexpr CompHashMode have_hardware_crc32 = USE_ARMCRC;
#else
static constexpr bool have_hardware_crc32 = false;
static constexpr CompHashMode have_hardware_crc32 = USE_DEFAULT;
#endif

/// Compute the Crc32 hash for data using SSE4 instructions. The input hash
/// Compute the Crc32 hash for data using SSE4/Armce instructions. The input hash
/// parameter is the current hash/seed value.
/// This should only be called if SSE is supported.
/// This should only be called if SSE/Armce is supported.
/// This is ~4x faster than Fnv/Boost Hash.
/// TODO: crc32 hashes with different seeds do not result in different hash functions.
/// The resulting hashes are correlated.
static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) {
static uint32_t CrcHashSSE(const void* data, int32_t nbytes, uint32_t hash) {
const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
const uint8_t* end = p + nbytes;

Expand All @@ -67,14 +76,37 @@ class HashUtil {
return hash;
}

static uint32_t CrcHashARMCE(const void* data, int32_t nbytes, uint32_t hash) {
const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
const uint8_t* end = p + nbytes;

while (p <= end - 8) {
hash = ARMCE_crc32_u64(hash, *reinterpret_cast<const uint64_t*>(p));
p += 8;
}
while (p <= end - 4) {
hash = ARMCE_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
p += 4;
}
while (p < end) {
hash = ARMCE_crc32_u8(hash, *p);
++p;
}

// The lower half of the CRC hash has has poor uniformity, so swap the halves
// for anyone who only uses the first several bits of the hash.
hash = (hash << 16) | (hash >> 16);
return hash;
}

/// A variant of CRC32 hashing that computes two independent running CRCs
/// over interleaved halves of the input, giving out a 64-bit integer.
/// The result's quality should be improved by a finalization step.
///
/// In addition to producing more bits of output, this should be twice
/// faster than CrcHash on CPUs that can overlap several independent
/// CRC computations.
static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) {
static uint64_t DoubleCrcHashSSE(const void* data, int32_t nbytes, uint64_t hash) {
const uint8_t* p = reinterpret_cast<const uint8_t*>(data);

uint32_t h1 = static_cast<uint32_t>(hash >> 32);
Expand Down Expand Up @@ -115,6 +147,47 @@ class HashUtil {
return (static_cast<uint64_t>(h1) << 32) + h2;
}

static uint64_t DoubleCrcHashARMCE(const void* data, int32_t nbytes, uint64_t hash) {
const uint8_t* p = reinterpret_cast<const uint8_t*>(data);

uint32_t h1 = static_cast<uint32_t>(hash >> 32);
uint32_t h2 = static_cast<uint32_t>(hash);

while (nbytes >= 16) {
h1 = ARMCE_crc32_u64(h1, *reinterpret_cast<const uint64_t*>(p));
h2 = ARMCE_crc32_u64(h2, *reinterpret_cast<const uint64_t*>(p + 8));
nbytes -= 16;
p += 16;
}
if (nbytes >= 8) {
h1 = ARMCE_crc32_u32(h1, *reinterpret_cast<const uint32_t*>(p));
h2 = ARMCE_crc32_u32(h2, *reinterpret_cast<const uint32_t*>(p + 4));
nbytes -= 8;
p += 8;
}
if (nbytes >= 4) {
h1 = ARMCE_crc32_u16(h1, *reinterpret_cast<const uint16_t*>(p));
h2 = ARMCE_crc32_u16(h2, *reinterpret_cast<const uint16_t*>(p + 2));
nbytes -= 4;
p += 4;
}
switch (nbytes) {
case 3:
h1 = ARMCE_crc32_u8(h1, p[3]);
case 2:
h2 = ARMCE_crc32_u8(h2, p[2]);
case 1:
h1 = ARMCE_crc32_u8(h1, p[1]);
case 0:
break;
default:
assert(0);
}

// A finalization step is recommended to mix up the result's bits
return (static_cast<uint64_t>(h1) << 32) + h2;
}

/// CrcHash() specialized for 1-byte data
static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
Expand Down Expand Up @@ -251,10 +324,14 @@ class HashUtil {
return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
}

// With sse4.2
template <bool use_sse42 = true>
// Hash template
template <CompHashMode comp_mode>
static inline int Hash(const void* data, int32_t bytes, uint32_t seed);

// Double Hash template
template <CompHashMode comp_mode>
static inline int DoubleHash(const void* data, int32_t bytes, uint32_t seed);

/// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;

Expand Down Expand Up @@ -288,18 +365,49 @@ class HashUtil {
}
};

// With sse4.2
// Hash specialized for Arm Crc32
template <>
inline int HashUtil::Hash<USE_ARMCRC>(const void* data, int32_t bytes, uint32_t seed) {
// Need run time check
if (crc32c_runtime_check())
return static_cast<int>(HashUtil::CrcHashARMCE(data, bytes, seed));
else
// Fall back to Murmur
return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
}


// Hash specialized for sse4.2
template <>
inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
inline int HashUtil::Hash<USE_SSE42>(const void* data, int32_t bytes, uint32_t seed) {
return static_cast<int>(HashUtil::CrcHashSSE(data, bytes, seed));
}

// Non-sse4 hash
// Hash specialized for default murmur hash
template <>
inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
inline int HashUtil::Hash<USE_DEFAULT>(const void* data, int32_t bytes, uint32_t seed) {
return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
}

// DoubleHash specialized for Arm Crc32
template <>
inline int HashUtil::DoubleHash<USE_ARMCRC>(const void* data, int32_t bytes, uint32_t seed) {
// Need run time check
if (crc32c_runtime_check())
return static_cast<int>(HashUtil::DoubleCrcHashARMCE(data, bytes, seed));
else
// Fall back to Murmur
return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
}


// DoubleHash specialized for sse4.2
template <>
inline int HashUtil::DoubleHash<USE_SSE42>(const void* data, int32_t bytes, uint32_t seed) {
return static_cast<int>(HashUtil::DoubleCrcHashSSE(data, bytes, seed));
}


} // namespace arrow

#endif // ARROW_UTIL_HASH_UTIL_H
13 changes: 9 additions & 4 deletions cpp/src/arrow/util/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,16 +155,21 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
return n ^ hx ^ hy;
}

if (HashUtil::have_hardware_crc32) {
// DoubleCrcHash is faster that Murmur2.
auto h = HashUtil::DoubleCrcHash(data, static_cast<int32_t>(length), AlgNum);
// DoubleCrcHash is faster that Murmur2.
if (HashUtil::have_hardware_crc32 == USE_SSE42) {
// SSE4.
auto h = HashUtil::DoubleHash<USE_SSE42>(data, static_cast<int32_t>(length), AlgNum);
return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
} else if (HashUtil::have_hardware_crc32 == USE_ARMCRC) {
// Armce.
auto h = HashUtil::DoubleHash<USE_ARMCRC>(data, static_cast<int32_t>(length), AlgNum);
return ScalarHelper<uint64_t, AlgNum>::ComputeHash(h);
} else {
// Fall back on 64-bit Murmur2 for longer strings.
// It has decent speed for medium-sized strings. There may be faster
// hashes on long strings such as xxHash, but that may not matter much
// for the typical length distribution of hash keys.
return HashUtil::MurmurHash2_64(data, static_cast<int>(length), AlgNum);
return HashUtil::Hash<USE_DEFAULT>(data, static_cast<int>(length), AlgNum);
}
}

Expand Down

0 comments on commit bdcf7b7

Please sign in to comment.