Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenBLAS_jll] Update to new build with BFloat16 kernels #53059

Merged
merged 2 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 92 additions & 92 deletions deps/checksums/openblas

Large diffs are not rendered by default.

19 changes: 16 additions & 3 deletions deps/openblas.mk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ endif
# don't touch scheduler affinity since we manage this ourselves
OPENBLAS_BUILD_OPTS += NO_AFFINITY=1

# Build BFloat16 kernels
OPENBLAS_BUILD_OPTS += BUILD_BFLOAT16=1

# Build for all architectures - required for distribution
ifeq ($(SANITIZE_MEMORY),1)
OPENBLAS_BUILD_OPTS += TARGET=GENERIC
Expand Down Expand Up @@ -95,12 +98,22 @@ $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied: $(BUILDDIR)/
patch -p1 -f < $(SRCDIR)/patches/openblas-ofast-power.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/openblas-avx512bf-kernels.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/openblas-gemv-multithreading.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/neoverse-generic-kernels.patch
patch -p1 -f < $(SRCDIR)/patches/openblas-darwin-sve.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-compiled: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured
Expand Down
19 changes: 0 additions & 19 deletions deps/patches/neoverse-generic-kernels.patch

This file was deleted.

107 changes: 107 additions & 0 deletions deps/patches/openblas-avx512bf-kernels.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Fri, 12 Jan 2024 00:10:56 +0100
Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability

---
c_check | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)

diff --git a/c_check b/c_check
index b5e4a9ad00..3e507be818 100755
--- a/c_check
+++ b/c_check
@@ -244,6 +244,7 @@ case "$data" in
esac

no_avx512=0
+no_avx512bf=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
}

rm -rf "$tmpd"
+ if [ "$no_avx512" -eq 0 ]; then
+ tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+ tmpf="$tmpd/a.c"
+ code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
+ printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
+ if [ "$compiler" = "PGI" ]; then
+ args=" -tp cooperlake -c -o $tmpf.o $tmpf"
+ else
+ args=" -march=cooperlake -c -o $tmpf.o $tmpf"
+ fi
+ no_avx512bf=0
+ {
+ $compiler_name $flags $args >/dev/null 2>&1
+ } || {
+ no_avx512bf=1
+ }
+
+ rm -rf "$tmpd"
+ fi
fi

no_rv64gv=0
@@ -409,6 +429,7 @@ done
[ "$makefile" = "-" ] && {
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
exit 0
@@ -437,6 +458,7 @@ done
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"

From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Fri, 12 Jan 2024 00:12:46 +0100
Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler
capability

---
kernel/x86_64/KERNEL.COOPERLAKE | 3 ++-
kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
index dba94aea86..22b042029f 100644
--- a/kernel/x86_64/KERNEL.COOPERLAKE
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
@@ -1,5 +1,5 @@
include $(KERNELDIR)/KERNEL.SKYLAKEX
-
+ifneq ($(NO_AVX512BF16), 1)
SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 3a832e9174..0ab2b4ddcf 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,5 +1,6 @@
include $(KERNELDIR)/KERNEL.COOPERLAKE

+ifneq ($(NO_AVX512BF16), 1)
SBGEMM_SMALL_M_PERMIT =
SBGEMM_SMALL_K_NN =
SBGEMM_SMALL_K_B0_NN =
@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
34 changes: 34 additions & 0 deletions deps/patches/openblas-darwin-sve.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001
From: Ian McInerney <[email protected]>
Date: Tue, 23 Jan 2024 10:29:57 +0000
Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin

We aren't affected by the problems in AppleClang that prompted this
fallback to an older architecture.
---
Makefile.arm64 | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index ed52a9424..a8f3cb0f0 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
-ifneq ($(OSNAME), Darwin)
+#ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
-else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
-endif
+#else
+#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+#endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif
--
2.43.0

22 changes: 22 additions & 0 deletions deps/patches/openblas-gemv-multithreading.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Wed, 17 Jan 2024 20:59:24 +0100
Subject: [PATCH] Increase multithreading threshold by a factor of 50

---
interface/gemv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/gemv.c b/interface/gemv.c
index 1f07635799..2c121f1308 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,

#ifdef SMP

- if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
+ if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = num_cpu_avail(2);
28 changes: 20 additions & 8 deletions deps/patches/openblas-ofast-power.patch
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
diff --git a/Makefile.power b/Makefile.power
index 28a0bae0..b4869fbd 100644
index aa1ca080a..42c417a78 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,7 @@ endif

ifeq ($(CORE), POWER10)
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
ifeq ($(C_COMPILER), GCC))
ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
else
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
endif
else
@@ -22,7 +22,7 @@ endif
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
endif
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif

ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)
Expand All @@ -20,7 +32,7 @@ index 28a0bae0..b4869fbd 100644
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -59,7 +59,7 @@ endif
@@ -70,7 +70,7 @@ endif

ifeq ($(CORE), POWER8)
ifneq ($(C_COMPILER), PGI)
Expand Down
9 changes: 9 additions & 0 deletions stdlib/LinearAlgebra/test/blas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ Random.seed!(100)
@test BLAS.iamax(b) == findmax(fabs, b)[2] * (step(ind) >= 0)
end
end
@testset "nrm2 with non-finite elements" begin
# These tests would have caught <https://github.com/OpenMathLib/OpenBLAS/issues/2998>
# when running on appropriate hardware.
a = zeros(elty,n)
a[begin] = elty(-Inf)
@test BLAS.nrm2(a) === abs2(elty(Inf))
a[begin] = elty(NaN)
@test BLAS.nrm2(a) === abs2(elty(NaN))
end
@testset "scal" begin
α = rand(elty)
a = rand(elty,n)
Expand Down
2 changes: 1 addition & 1 deletion stdlib/OpenBLAS_jll/Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "OpenBLAS_jll"
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.26+1"
version = "0.3.26+2"

[deps]
# See note in `src/OpenBLAS_jll.jl` about this dependency.
Expand Down