Skip to content

Commit

Permalink
Make SIMD tests work when building multiarch binaries
Browse files Browse the repository at this point in the history
MacOS multiarch binaries compile source code for each architecture
and then join them together using 'lipo'.  This means architecture
specific code both in the actual source and configure tests need
to be compilable on both architectures.

Switch the configure tests and hts_probe_cc.sh so that they check
if a given flag is needed to compile the test code instead of
just testing to see if the flag works.  By adding #ifdef __x86_64__
guards around the test code, compilation will work on non-x86_64
returning the result that no special compiler flag is needed.
Similar #ifdef guards are added to the source files so that the
SIMD-specific code only gets compiled for x86_64.  The
htscodecs submodule is updated to pull in these source file
changes.

The SIMD parts of built-in htscodecs are now compiled
unconditionally.  Tests for NEON have also been removed as they
weren't really doing anything.

The configure and hts_probe_cc.sh are adjusted to exactly match
those used by htscodecs' configure, for ease of maintenance.
  • Loading branch information
daviesrob authored and whitwham committed Mar 31, 2023
1 parent 7ed911e commit 26f4d44
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 149 deletions.
16 changes: 13 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,18 @@ srcdir = .
srcprefix =
HTSPREFIX =

# Flags for SIMD code
HTS_CFLAGS_AVX2 =
HTS_CFLAGS_AVX512 =
HTS_CFLAGS_SSE4 =

# Control building of SIMD code. Not used if configure has been run.
HTS_BUILD_AVX2 =
HTS_BUILD_AVX512 =
HTS_BUILD_SSSE3 =
HTS_BUILD_POPCNT =
HTS_BUILD_SSE4_1 =

include htslib_vars.mk
include htscodecs.mk

Expand Down Expand Up @@ -274,18 +282,20 @@ config.h:
echo '#endif' >> $@
echo '#define HAVE_DRAND48 1' >> $@
echo '#define HAVE_LIBCURL 1' >> $@
if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \
if [ "x$(HTS_BUILD_POPCNT)" != "x" ] && \
[ "x$(HTS_BUILD_SSE4_1)" != "x" ] && \
[ "x$(HTS_BUILD_SSSE3)" != "x" ]; then \
echo '#define HAVE_POPCNT 1' >> $@ ; \
echo '#define HAVE_SSE4_1 1' >> $@ ; \
echo '#define HAVE_SSSE3 1' >> $@ ; \
echo '#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0' >> $@ ; \
echo '#define UBSAN 1' >> $@ ; \
echo '#endif' >> $@ ; \
fi
if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \
if [ "x$(HTS_BUILD_AVX2)" != "x" ] ; then \
echo '#define HAVE_AVX2 1' >> $@ ; \
fi
if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \
if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
echo '#define HAVE_AVX512 1' >> $@ ; \
fi

Expand Down
122 changes: 71 additions & 51 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ AC_CONFIG_SRCDIR(hts.c)
AC_CONFIG_HEADERS(config.h)

m4_include([m4/hts_prog_cc_warnings.m4])
m4_include([m4/ax_check_compile_flag.m4])
m4_include([m4/hts_check_compile_flags_needed.m4])
m4_include([m4/hts_hide_dynamic_syms.m4])
m4_include([m4/pkg.m4])

Expand Down Expand Up @@ -71,16 +71,53 @@ dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS)
HTS_PROG_CC_WERROR(hts_late_cflags)

dnl Check for various compiler flags to enable SIMD features
dnl Options for rANS32x16 sse4.1 version
AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [
hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1"
AC_SUBST([hts_cflags_sse4])
AC_DEFINE([HAVE_SSSE3],1,
[Defined to 1 if the compiler can issue SSSE3 instructions.])
AC_DEFINE([HAVE_POPCNT],1,
[Defined to 1 if the compiler can issue popcnt instructions.])
AC_DEFINE([HAVE_SSE4_1],1,
[Defined to 1 if the compiler can issue SSE4.1 instructions.])
dnl Options for rANS32x16 sse4.1 version - ssse3
hts_cflags_sse4=""
HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[
#ifdef __x86_64__
#include "x86intrin.h"
#endif
]],[[
#ifdef __x86_64__
__m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
__m128i c = _mm_shuffle_epi8(a, b);
return *((char *) &c);
#endif
]])], [
hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.])
])

dnl Options for rANS32x16 sse4.1 version - popcnt
HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[
#ifdef __x86_64__
#include "x86intrin.h"
#endif
]],[[
#ifdef __x86_64__
unsigned int i = _mm_popcnt_u32(1);
return i != 1;
#endif
]])], [
hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
])

dnl Options for rANS32x16 sse4.1 version - sse4.1
HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[
#ifdef __x86_64__
#include "x86intrin.h"
#endif
]],[[
#ifdef __x86_64__
__m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
__m128i c = _mm_max_epu32(a, b);
return *((char *) &c);
#endif
]])], [
hts_cflags_sse4="$flags_needed $hts_cflags_sse4"
AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled.
])
dnl Propagate HTSlib's unaligned access preference to htscodecs
AH_VERBATIM([UBSAN],[
/* Prevent unaligned access in htscodecs SSE4 rANS codec */
Expand All @@ -89,60 +126,43 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs
#endif
])
AC_DEFINE([UBSAN],1,[])
], [], [], [AC_LANG_PROGRAM([[
#include "x86intrin.h"
]],[[
unsigned int i = _mm_popcnt_u32(1);
__m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1);
__m128i c = _mm_max_epu32(a, b);
b = _mm_shuffle_epi8(a, c);
return *((char *) &b);
]])])
])
AC_SUBST([hts_cflags_sse4])

dnl Options for rANS32x16 avx2 version
AX_CHECK_COMPILE_FLAG([-mavx2], [
hts_cflags_avx2="-mavx2"
AC_SUBST([hts_cflags_avx2])
AC_DEFINE([HAVE_AVX2],1,
[Defined to 1 if the compiler can issue AVX2 instructions.])
], [], [], [AC_LANG_PROGRAM([[
#include "x86intrin.h"
HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[
#ifdef __x86_64__
#include "x86intrin.h"
#endif
]],[[
#ifdef __x86_64__
__m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
__m256i b = _mm256_add_epi32(a, a);
long long c = _mm256_extract_epi64(b, 0);
return (int) c;
]])])
#endif
]])], [
hts_cflags_avx2="$flags_needed"
AC_SUBST([hts_cflags_avx2])
AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.])
])

dnl Options for rANS32x16 avx512 version
AX_CHECK_COMPILE_FLAG([-mavx512f], [
hts_cflags_avx512="-mavx512f"
AC_SUBST([hts_cflags_avx512])
AC_DEFINE([HAVE_AVX512],1,
[Defined to 1 if the compiler can issue AVX512 instructions.])
], [], [], [AC_LANG_PROGRAM([[
HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[
#ifdef __x86_64__
#include "x86intrin.h"
#endif
]],[[
#ifdef __x86_64__
__m512i a = _mm512_set1_epi32(1);
__m512i b = _mm512_add_epi32(a, a);
return *((char *) &b);
]])])

dnl Detect ARM Neon availability
AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [
AC_COMPILE_IFELSE([
AC_LANG_PROGRAM([[
#include "arm_neon.h"
]], [[
int32x4_t a = vdupq_n_s32(1);
int32x4_t b = vaddq_s32(a, a);
return *((char *) &b);
]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])])
if test "$hts_cv_have_neon" = yes; then
hts_have_neon=yes
AC_SUBST([hts_have_neon])
fi

#endif
]])], [
hts_cflags_avx512="$flags_needed"
AC_SUBST([hts_cflags_avx512])
AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
])

dnl Avoid chicken-and-egg problem where pkg-config supplies the
dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
Expand Down
103 changes: 66 additions & 37 deletions hts_probe_cc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,72 +43,101 @@ run_compiler ()
return $retval
}

# Run a test. $1 is the flag to try, $2 is the Makefile variable to set
# with the flag probe result, $3 is a Makefile variable which will be
# set to 1 if the code was built successfully. The code to test should
# be passed in via fd 0.
# First try compiling conftest.c without the flag. If that fails, try
# again with it to see if the flag is needed.
run_test ()
{
rm -f conftest conftest.err conftest.c
cat - > conftest.c
if run_compiler ; then
echo "$2 ="
echo "$3 = 1"
elif run_compiler "$1" ; then
echo "$2 = $1"
echo "$3 = 1"
else
echo "$3 ="
fi
}

echo "# Compiler probe results, generated by $0"

# Check for sse4.1 etc. support
# Check for ssse3
run_test "-mssse3" HTS_CFLAGS_SSSE3 HTS_BUILD_SSSE3 <<'EOF'
#ifdef __x86_64__
#include "x86intrin.h"
int main(int argc, char **argv) {
__m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
__m128i c = _mm_shuffle_epi8(a, b);
return *((char *) &c);
}
#else
int main(int argc, char **argv) { return 0; }
#endif
EOF

rm -f conftest conftest.err conftest.c
cat - <<'EOF' > conftest.c
# Check for popcnt
run_test "-mpopcnt" HTS_CFLAGS_POPCNT HTS_BUILD_POPCNT <<'EOF'
#ifdef __x86_64__
#include "x86intrin.h"
int main(int argc, char **argv) {
unsigned int i = _mm_popcnt_u32(1);
__m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1);
return i != 1;
}
#else
int main(int argc, char **argv) { return 0; }
#endif
EOF

# Check for sse4.1 etc. support
run_test "-msse4.1" HTS_CFLAGS_SSE4_1 HTS_BUILD_SSE4_1 <<'EOF'
#ifdef __x86_64__
#include "x86intrin.h"
int main(int argc, char **argv) {
__m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1);
__m128i c = _mm_max_epu32(a, b);
b = _mm_shuffle_epi8(a, c);
return *((char *) &b);
return *((char *) &c);
}
#else
int main(int argc, char **argv) { return 0; }
#endif
EOF
FLAGS="-mpopcnt -msse4.1 -mssse3"
if run_compiler "$FLAGS" ; then
echo "HTS_CFLAGS_SSE4 = $FLAGS"
fi

echo 'HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSSE3) $(HTS_CFLAGS_POPCNT) $(HTS_CFLAGS_SSE4_1)'

# Check for avx2

rm -f conftest.c
cat - <<'EOF' > conftest.c
run_test -mavx2 HTS_CFLAGS_AVX2 HTS_BUILD_AVX2 <<'EOF'
#ifdef __x86_64__
#include "x86intrin.h"
int main(int argc, char **argv) {
__m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
__m256i b = _mm256_add_epi32(a, a);
long long c = _mm256_extract_epi64(b, 0);
return (int) c;
}
#else
int main(int argc, char **argv) { return 0; }
#endif
EOF
FLAGS="-mavx2"
if run_compiler "$FLAGS" ; then
echo "HTS_CFLAGS_AVX2 = $FLAGS"
fi

# Check for avx512

rm -f conftest.c
cat - <<'EOF' > conftest.c
run_test -mavx512f HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF'
#ifdef __x86_64__
#include "x86intrin.h"
int main(int argc, char **argv) {
__m512i a = _mm512_set1_epi32(1);
__m512i b = _mm512_add_epi32(a, a);
return *((char *) &b);
}
#else
int main(int argc, char **argv) { return 0; }
#endif
EOF
FLAGS="-mavx512f"
if run_compiler "$FLAGS" ; then
echo "HTS_CFLAGS_AVX512 = $FLAGS"
fi

# Check for neon

rm -f conftest.c
cat - <<'EOF' > conftest.c
#include "arm_neon.h"
int main(int argc, char **argv) {
int32x4_t a = vdupq_n_s32(1);
int32x4_t b = vaddq_s32(a, a);
return *((char *) &b);
}
EOF
if run_compiler "" ; then
echo "HTS_HAVE_NEON = yes"
fi

rm -f conftest.c
8 changes: 4 additions & 4 deletions htscodecs_bundled.mk
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \
$(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \
$(HTSPREFIX)htscodecs/htscodecs/pack.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \
$(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \
$(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \
$(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \
$(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \
$(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \
$(HTSPREFIX)htscodecs/htscodecs/rle.c \
Expand Down
Loading

0 comments on commit 26f4d44

Please sign in to comment.