Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up CRC32 calculation on LoongArch #86

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions CMakeLists.txt
xry111 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ add_library(liblzma
src/liblzma/check/crc_common.h
src/liblzma/check/crc_x86_clmul.h
src/liblzma/check/crc32_arm64.h
src/liblzma/check/crc32_loongarch.h
src/liblzma/common/block_util.c
src/liblzma/common/common.c
src/liblzma/common/common.h
Expand Down Expand Up @@ -1156,6 +1157,33 @@ if(ALLOW_ARM64_CRC32)
endif()
endif()

# LoongArch CRC32 Intrinsics are in larchintrin.h.
# These are supported by at least GCC and Clang.
option(ALLOW_LOONGARCH_CRC32 "Allow LoongArch CRC32 instruction if \
supported by the system" ON)

if(ALLOW_LOONGARCH_CRC32)
check_c_source_compiles("
// As at now only 64-bit LoongArch is supported.
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
# error LoongArch CRC32 only supported on 64-bit LoongArch
#endif

#include <larchintrin.h>

int my_crc(int word, int crc)
{
return __crc_w_w_w(word, crc);
}
int main(void) { return 0; }
"
HAVE_LOONGARCH_CRC32)
xry111 marked this conversation as resolved.
Show resolved Hide resolved

if(HAVE_LOONGARCH_CRC32)
target_compile_definitions(liblzma PRIVATE HAVE_LOONGARCH_CRC32)
endif()
endif()


# Symbol visibility support:
#
Expand Down
47 changes: 47 additions & 0 deletions configure.ac
xry111 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,16 @@ AC_ARG_ENABLE([arm64-crc32], AS_HELP_STRING([--disable-arm64-crc32],
[], [enable_arm64_crc32=yes])

xry111 marked this conversation as resolved.
Show resolved Hide resolved

################################
# LoongArch CRC32 Instructions #
################################

AC_ARG_ENABLE([loongarch-crc32], AS_HELP_STRING([--disable-loongarch-crc32],
[Do not use LoongArch CRC32 instructions even if support for it
is detected.]),
[], [enable_loongarch_crc32=yes])


#####################
# Size optimization #
#####################
Expand Down Expand Up @@ -1152,6 +1162,43 @@ AS_IF([test "x$enable_arm64_crc32" = xyes], [
AC_CHECK_FUNCS([getauxval elf_aux_info sysctlbyname])
])

# LoongArch Intrinsics define CRC32 functions in larchintrin.h.
# These are supported by at least GCC and Clang.
AC_MSG_CHECKING([if LoongArch CRC32 instruction is usable])
AS_IF([test "x$enable_loongarch_crc32" = xno], [
AC_MSG_RESULT([no, --disable-loongarch-crc32 was used])
], [
# Set -Werror here because -Wimplicit-function-declaration was
# only a warning in GCC <= 13. This does not need to be done
# with CMake because tests will attempt to link and the error
# will be reported then.
OLD_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -Werror"

AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
// As at now only 64-bit LoongArch is supported.
#if !(defined(__loongarch__) && __loongarch_grlen >= 64)
# error
#endif

#include <larchintrin.h>
int my_crc(int word, int crc)
{
return __crc_w_w_w(word, crc);
}
]])], [
AC_DEFINE([HAVE_LOONGARCH_CRC32], [1],
[Define to 1 if LoongArch CRC32 instruction is supported.
See configure.ac for details.])
enable_loongarch_crc32=yes
], [
enable_loongarch_crc32=no
])
AC_MSG_RESULT([$enable_loongarch_crc32])

CFLAGS="$OLD_CFLAGS"
])


# Check for sandbox support. If one is found, set enable_sandbox=found.
#
Expand Down
3 changes: 2 additions & 1 deletion src/liblzma/check/Makefile.inc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ liblzma_la_SOURCES += \
check/check.h \
check/crc_common.h \
check/crc_x86_clmul.h \
check/crc32_arm64.h
check/crc32_arm64.h \
check/crc32_loongarch.h

if COND_SMALL
liblzma_la_SOURCES += check/crc32_small.c
Expand Down
2 changes: 2 additions & 0 deletions src/liblzma/check/crc32_fast.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
# include "crc_x86_clmul.h"
#elif defined(CRC32_ARM64)
# include "crc32_arm64.h"
#elif defined(CRC32_LOONGARCH)
# include "crc32_loongarch.h"
#endif


Expand Down
71 changes: 71 additions & 0 deletions src/liblzma/check/crc32_loongarch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// SPDX-License-Identifier: 0BSD

///////////////////////////////////////////////////////////////////////////////
//
/// \file crc32_loongarch.h
/// \brief CRC32 calculation with LoongArch optimization
//
// Authors: Xi Ruoyao
//
///////////////////////////////////////////////////////////////////////////////


#ifndef LZMA_CRC32_LOONGARCH_H
#define LZMA_CRC32_LOONGARCH_H

#include <larchintrin.h>

static uint32_t
xry111 marked this conversation as resolved.
Show resolved Hide resolved
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc_unsigned)
{
int32_t crc = (int32_t)~crc_unsigned;

// Align the input buffer because this was shown to be
// significantly faster than unaligned accesses.
const size_t align_amount = my_min(size,
(8 - (uintptr_t)buf) & (8 - 1));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(0 - (uintptr_t)buf) does the same thing and should map to a negation instruction. Possibly a compiler will do the same from the current code too but using a zero makes it explicit. A similar change was done in 2337f70. Using 0U instead of 0 isn't important here as uintptr_t will promote the math to unsigned anyway so it's just what one prefers.

Some would write just (-(uintptr_t)buf) but that can cause warnings due to sign conversion, thus I prefer writing it as a subtraction from zero.


if (align_amount & 1)
crc = __crc_w_b_w((int8_t)*buf++, crc);

if (align_amount & 2) {
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
buf += 2;
}

if (align_amount & 4) {
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
buf += 4;
}

size -= align_amount;

// Process 8 bytes at a time. The end point is determined by
// ignoring the least significant 3 bits of size to ensure
// we do not process past the bounds of the buffer. This guarantees
// that limit is a multiple of 8 and is strictly less than size.
for (const uint8_t *limit = buf + (size & ~(size_t)7);
buf < limit; buf += 8)
crc = __crc_w_d_w((int64_t)aligned_read64le(buf), crc);

// Process the remaining bytes that are not 8 byte aligned.
xry111 marked this conversation as resolved.
Show resolved Hide resolved
const size_t remaining_amount = size & 7;

if (remaining_amount & 4) {
crc = __crc_w_w_w((int32_t)aligned_read32le(buf), crc);
buf += 4;
}

if (remaining_amount & 2) {
crc = __crc_w_h_w((int16_t)aligned_read16le(buf), crc);
buf += 2;
}

if (remaining_amount & 1)
crc = __crc_w_b_w((int8_t)*buf, crc);

return (uint32_t)~crc;
}

xry111 marked this conversation as resolved.
Show resolved Hide resolved

#endif // LZMA_CRC32_LOONGARCH_H
7 changes: 6 additions & 1 deletion src/liblzma/check/crc32_table.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,14 @@
# define ARM64_CRC32_NO_TABLE 1
#endif

#if defined(HAVE_LOONGARCH_CRC32)
# define LOONGARCH_CRC32_NO_TABLE 1
#endif


#if !defined(HAVE_ENCODERS) && (defined(X86_CLMUL_NO_TABLE) \
|| defined(ARM64_CRC32_NO_TABLE_))
|| defined(ARM64_CRC32_NO_TABLE_) \
|| defined(LOONGARCH_CRC32_NO_TABLE))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for remembering this! I had forgotten this was needed in the first round of review

// No table needed. Use a typedef to avoid an empty translation unit.
typedef void lzma_crc32_dummy;

Expand Down
11 changes: 11 additions & 0 deletions src/liblzma/check/crc_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
#undef CRC32_ARM64
#undef CRC64_ARM64_CLMUL

#undef CRC32_LOONGARCH

#undef CRC_USE_IFUNC

#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
Expand Down Expand Up @@ -128,6 +130,15 @@
# endif
#endif

// Only 64-bit LoongArch is supported now, thus no runtime detection is
// needed because the LoongArch specification says CRC32 instructions are
// a part of the Basic Integer Instructions and they shall be implemented
// by 64-bit LoongArch implementations.
#if defined(HAVE_LOONGARCH_CRC32)
# define CRC32_ARCH_OPTIMIZED 1
# define CRC32_LOONGARCH 1
#endif

// For CRC32 use the generic slice-by-eight implementation if no optimized
// version is available.
#if !defined(CRC32_ARCH_OPTIMIZED) && !defined(CRC32_GENERIC)
Expand Down