Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better vectorization and crc64. Cleaned up cmake and added better runtime cpu detection #1083

Merged
merged 23 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
11e0c23
Modularized the simd flags so they can be used individually by files …
JonathanHenson Jan 22, 2024
08092b5
Fix cmake.
JonathanHenson Jan 22, 2024
86128bc
try it this way?
JonathanHenson Jan 22, 2024
9e0d6a4
use flags instead?
JonathanHenson Jan 22, 2024
923d6ca
Added more flags.
JonathanHenson Jan 22, 2024
c9bcc2e
Fix typo for clmul.
JonathanHenson Jan 23, 2024
e782c37
move flags around.
JonathanHenson Jan 23, 2024
54911e9
Fix typo on compiler flag
JonathanHenson Jan 23, 2024
f859d7c
Add space to compiler flags maybe?:
JonathanHenson Jan 23, 2024
8419ad6
clean up the simd function macros to one.
JonathanHenson Jan 23, 2024
b53bc0b
add the sse 4.2 flag back.
JonathanHenson Jan 23, 2024
4ccdc39
hopefully fix the incorrect arm feature detectin.
JonathanHenson Jan 23, 2024
8fdfe3a
make sure werror is passed to cross compile.
JonathanHenson Jan 23, 2024
2562d5f
turn on SSE2 for SSE4.2
JonathanHenson Jan 23, 2024
88143c1
Undo the new ss2 flag.
JonathanHenson Jan 23, 2024
f1075a2
Full arm cpu feature detection.
JonathanHenson Jan 24, 2024
d6b9399
Add in the other flags for the auxv implementation.
JonathanHenson Jan 25, 2024
ddb7d02
Added intel and arm detection flags to config.
JonathanHenson Jan 29, 2024
8cac3d7
Fix bad vcplmulq detection and update simd feature tests.
JonathanHenson Feb 5, 2024
ca909c0
Make sure the config file export is correct.
JonathanHenson Feb 5, 2024
7da0e04
Merge branch 'main' into better_vectorization_and_crc64
alfred2g Mar 18, 2024
faa858c
Style: clang-format
alfred2g Mar 18, 2024
b3a150f
Merge branch 'main' into better_vectorization_and_crc64
alfred2g Mar 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,17 @@ if (USE_CPU_EXTENSIONS)
)
endif()
elseif (AWS_ARCH_ARM64 OR AWS_ARCH_ARM32)
if (MSVC)
if (WINDOWS)
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/msvc/*.c"
"source/arch/arm/windows/*.c"
)
elseif (AWS_HAVE_AUXV)
elseif(APPLE)
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/darwin/*.c"
)
else()
file(GLOB AWS_COMMON_ARCH_SRC
"source/arch/arm/asm/*.c"
"source/arch/arm/auxv/*.c"
)
endif()
endif()
Expand Down Expand Up @@ -221,7 +225,7 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE -DCJSON_HIDE_SYMBOLS)

if (AWS_HAVE_AVX2_INTRINSICS)
target_compile_definitions(${PROJECT_NAME} PRIVATE -DUSE_SIMD_ENCODING)
simd_add_source_avx(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c")
simd_append_source_and_features(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c" ${AWS_AVX2_FLAG})
message(STATUS "Building SIMD base64 decoder")
endif()

Expand Down
14 changes: 14 additions & 0 deletions bin/system_info/print_system_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <aws/common/byte_buf.h>
#include <aws/common/logging.h>
#include <aws/common/system_info.h>
#include <aws/common/cpuid.h>

int main(void) {
struct aws_allocator *allocator = aws_default_allocator();
Expand Down Expand Up @@ -39,6 +40,19 @@ int main(void) {
fprintf(stdout, " 'numa architecture': 'false'\n");
}

fprintf(stdout, " 'cpu_capabilities': {\n");
fprintf(stdout, " 'arm_crc': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC) ? "true" : "false");
fprintf(stdout, " 'arm_pmull': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL) ? "true" : "false");
fprintf(stdout, " 'arm_crypto': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) ? "true" : "false");
fprintf(stdout, " 'amd_sse4_1': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1) ? "true" : "false");
fprintf(stdout, " 'amd_sse4_2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) ? "true" : "false");
fprintf(stdout, " 'amd_clmul': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) ? "true" : "false");
fprintf(stdout, " 'amd_vpclmulqdq': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ) ? "true" : "false");
fprintf(stdout, " 'amd_avx2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2) ? "true" : "false");
fprintf(stdout, " 'amd_avx512': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) ? "true" : "false");
fprintf(stdout, " 'amd_bmi2': %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_BMI2) ? "true" : "false");
fprintf(stdout, " }\n");

fprintf(stdout, "}\n");
aws_system_environment_release(env);
aws_logger_clean_up(&logger);
Expand Down
13 changes: 13 additions & 0 deletions cmake/AwsFeatureTests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ if(MINGW)
set(USE_CPU_EXTENSIONS OFF)
endif()

if (USE_CPU_EXTENSIONS)
set(AWS_USE_CPU_EXTENSIONS ON)
endif()

if(NOT CMAKE_CROSSCOMPILING)
check_c_source_runs("
#include <stdbool.h>
Expand Down Expand Up @@ -54,6 +58,15 @@ check_c_source_compiles("
}
" AWS_ARCH_INTEL)

check_c_source_compiles("
int main() {
#if !(defined(__x86_64__) || defined(_M_X64))
# error \"not intel\"
#endif
return 0;
}
" AWS_ARCH_INTEL_X64)

check_c_source_compiles("
int main() {
#if !(defined(__aarch64__) || defined(_M_ARM64))
Expand Down
97 changes: 73 additions & 24 deletions cmake/AwsSIMD.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,39 @@
include(CheckCCompilerFlag)
include(CheckIncludeFile)

if (MSVC)
set(AWS_AVX2_FLAG "/arch:AVX2")
set(AWS_AVX512_FLAG "/arch:AVX512")
set(AWS_AVX512vL_FLAG "")
set(AWS_CLMUL_FLAG "")
set(AWS_SSE4_2_FLAG "")
set(AWS_ARMv8_1_FLAG "/arch:arm8.1")
set(WERROR_FLAG "")
else()
set(AWS_AVX2_FLAG "-mavx -mavx2")
set(AWS_AVX512_FLAG "-mavx512f -mvpclmulqdq")
set(AWS_AVX512vL_FLAG "-mavx512vl")
set(AWS_CLMUL_FLAG "-mpclmul")
set(AWS_SSE4_2_FLAG "-msse4.2")
set(AWS_ARMv8_1_FLAG "-march=armv8-a+crc+crypto -mtune=neoverse-v1")
set(WERROR_FLAG "-Werror")
endif()

if (USE_CPU_EXTENSIONS)
if (MSVC)
check_c_compiler_flag("/arch:AVX2" HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "/arch:AVX2")
endif()
else()
check_c_compiler_flag(-mavx2 HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "-mavx -mavx2")
endif()
set(AVX_CFLAGS ${AWS_SSE4_2_FLAG})

check_c_compiler_flag(${AWS_AVX2_FLAG} HAVE_M_AVX2_FLAG)
if (HAVE_M_AVX2_FLAG)
set(AVX_CFLAGS "${AWS_AVX2_FLAG} ${AVX_CFLAGS}")
endif()

if (MSVC)
check_c_compiler_flag("/arch:AVX512" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
# docs imply AVX512 brings in AVX2. And it will compile, but it will break at runtime on
# instructions such as _mm256_load_si256(). Leave it on.
set(AVX_CFLAGS "/arch:AVX512 /arch:AVX2")
endif()
else()
check_c_compiler_flag("-mavx512f -mvpclmulqdq" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
set(AVX_CFLAGS "-mavx512f -mvpclmulqdq -mpclmul -mavx -mavx2 -msse4.2")
endif()
check_c_compiler_flag("${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG}" HAVE_M_AVX512_FLAG)
if (HAVE_M_AVX512_FLAG)
set(AVX_CFLAGS "${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG} ${AVX_CFLAGS}")
endif()

set(old_flags "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS}")
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS} ${WERROR_FLAG}")

check_c_source_compiles("
#include <immintrin.h>
Expand Down Expand Up @@ -68,7 +72,35 @@ if (USE_CPU_EXTENSIONS)
return (int)_mm256_extract_epi64(vec, 2);
}" AWS_HAVE_MM256_EXTRACT_EPI64)

check_c_source_compiles("
#include <wmmintrin.h>
#include <emmintrin.h>
int main() {
__m128i a = _mm_setzero_si128();
__m128i b = _mm_setzero_si128();
__m128i result = _mm_clmulepi64_si128(a, b, 0x00);
(void)result;
return 0;
}" AWS_HAVE_CLMUL)

set(CMAKE_REQUIRED_FLAGS "${old_flags} ${AWS_ARMv8_1_FLAG} ${WERROR_FLAG}")
check_c_source_compiles("
#include <arm_acle.h>
int main() {
int crc = __crc32d(0, 1);
return 0;
}" AWS_HAVE_ARM32_CRC)

check_c_source_compiles("
#include <stdatomic.h>
int main() {
_Atomic int var = 0;
atomic_fetch_add_explicit(&var, 1, memory_order_relaxed);
return 0;
}" AWS_HAVE_ARMv8_1)

set(CMAKE_REQUIRED_FLAGS "${old_flags}")

endif() # USE_CPU_EXTENSIONS

# The part where the definition is added to the compiler flags has been moved to config.h.in
Expand All @@ -80,6 +112,23 @@ endif() # USE_CPU_EXTENSIONS
function(simd_add_source_avx target)
foreach(file ${ARGN})
target_sources(${target} PRIVATE ${file})
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS "${AVX_CFLAGS}")
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${AVX_CFLAGS}")
endforeach()
endfunction(simd_add_source_avx)

# The part where the definition is added to the compiler flags has been moved to config.h.in
# see git history for more details.

# Adds compiler flags to the source and adds the source to target.
# Unfortunately the flags have to be passed as strings. Predefined flags are
# at the top of this file.
# Usage: simd_append_source_and_features(target file1.c ${AWS_AVX512_FLAG} ${AWS_AVX2_FLAG} ...)
function(simd_append_source_and_features target file)
set(CC_FLAGS "")
foreach(flag ${ARGN})
set(CC_FLAGS "${CC_FLAGS} ${flag}")
endforeach()

target_sources(${target} PRIVATE ${file})
set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${CC_FLAGS}")
endfunction(simd_append_source_and_features)
7 changes: 7 additions & 0 deletions include/aws/common/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,12 @@
#cmakedefine AWS_HAVE_AVX2_INTRINSICS
#cmakedefine AWS_HAVE_AVX512_INTRINSICS
#cmakedefine AWS_HAVE_MM256_EXTRACT_EPI64
#cmakedefine AWS_HAVE_CLMUL
#cmakedefine AWS_HAVE_ARM32_CRC
#cmakedefine AWS_HAVE_ARMv8_1
#cmakedefine AWS_ARCH_ARM64
#cmakedefine AWS_ARCH_INTEL
#cmakedefine AWS_ARCH_INTEL_X64
#cmakedefine AWS_USE_CPU_EXTENSIONS

#endif
2 changes: 2 additions & 0 deletions include/aws/common/cpuid.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ enum aws_cpu_feature_name {
AWS_CPU_FEATURE_ARM_CRC,
AWS_CPU_FEATURE_BMI2,
AWS_CPU_FEATURE_VPCLMULQDQ,
AWS_CPU_FEATURE_ARM_PMULL,
AWS_CPU_FEATURE_ARM_CRYPTO,
AWS_CPU_FEATURE_COUNT,
};

Expand Down
8 changes: 7 additions & 1 deletion source/arch/arm/asm/cpuid.c → source/arch/arm/auxv/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ struct cap_bits {

# if (defined(__aarch64__))
struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = {
[AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC */},
[AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC32 */},
[AWS_CPU_FEATURE_ARM_PMULL] = {0, 1 << 4 /* HWCAP_PMULL */},
[AWS_CPU_FEATURE_ARM_CRYPTO] = {0, 1 << 3 /* HWCAP_AES */},
};
# else
struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = {
Expand Down Expand Up @@ -67,6 +69,10 @@ bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {

switch (feature_name) {
case AWS_CPU_FEATURE_ARM_CRC:
# if (defined(__aarch64__))
case AWS_CPU_FEATURE_ARM_PMULL:
case AWS_CPU_FEATURE_ARM_CRYPTO:
# endif // (defined(__aarch64__))
return s_hwcap[s_check_cap[feature_name].cap] & s_check_cap[feature_name].bit;
default:
return false;
Expand Down
40 changes: 40 additions & 0 deletions source/arch/arm/darwin/cpuid.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://aws.amazon.com/apache2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

#include <aws/common/cpuid.h>

#include <sys/sysctl.h>

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
int64_t ret = 0;
size_t size = sizeof(ret);

switch (feature_name) {
case AWS_CPU_FEATURE_ARM_PMULL:
if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
case AWS_CPU_FEATURE_ARM_CRC:
if (sysctlbyname("hw.optional.armv8_crc32", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
case AWS_CPU_FEATURE_ARM_CRYPTO:
if (sysctlbyname("hw.optional.arm.FEAT_AES", &ret, &size, NULL, 0) != -1) {
return ret == 1;
}
default:
return false;
}
}
13 changes: 11 additions & 2 deletions source/arch/arm/msvc/cpuid.c → source/arch/arm/windows/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,18 @@
* permissions and limitations under the License.
*/

#include <Windows.h>
#include <aws/common/cpuid.h>
#include <stdlib.h>

bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) {
return false;
switch (feature_name) {
case AWS_CPU_FEATURE_ARM_CRC:
return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0;
// this is the best we've got on windows as they don't separate PMULL and AES from each other.
case AWS_CPU_FEATURE_ARM_PMULL:
case AWS_CPU_FEATURE_ARM_CRYPTO:
return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0;
default:
return false;
}
}
4 changes: 2 additions & 2 deletions source/arch/intel/cpuid.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ static bool s_has_bmi2(void) {
static bool s_has_vpclmulqdq(void) {
uint32_t abcd[4];
/* Check VPCLMULQDQ:
* CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 20]==1 */
uint32_t vpclmulqdq_mask = (1 << 20);
* CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 10]==1 */
uint32_t vpclmulqdq_mask = (1 << 10);
aws_run_cpuid(7, 0, abcd);
if ((abcd[2] & vpclmulqdq_mask) != vpclmulqdq_mask) {
return false;
Expand Down
Loading