marian-nmt · emjotde · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021 · Apr 9, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+- Allow for fine-grained CPU intrinsics overrides when BUILD_ARCH != native, e.g. -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off
 - Better suppression of unwanted output symbols, specifically "\n" from SentencePiece with byte-fallback. Can be deactivated with --allow-special
 - Display decoder time statistics with marian-decoder --stat-freq 10 ...
 - Support for MS-internal binary shortlist
@@ -34,6 +35,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Broken links to MNIST data sets
 
 ### Changed
+- For BUILD_ARCH != native, enable all intrinsics types by default; individual types can be disabled like this: -DCOMPILE_AVX512=off
 - Moved FBGEMM pointer to commit c258054 for gcc 9.3+ fix
 - Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL,
 -DCOMPILE_PASCAL, -DCOMPILE_VOLTA, -DCOMPILE_TURING and -DCOMPILE_AMPERE

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -124,50 +124,95 @@ else(MSVC)
 
   # Detect support CPU instrinsics for the current platform. This will
   # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we
-  # minimally use -msse4.1. This seems to work with MKL.
+  # force intrinsics as set in the options.
   set(INTRINSICS "")
   list(APPEND INTRINSICS_NVCC)
 
+  option(COMPILE_SSE2   "Compile CPU code with SSE2 support"   ON)
+  option(COMPILE_SSE3   "Compile CPU code with SSE3 support"   ON)
+  option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON)
+  option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON)
+  option(COMPILE_AVX    "Compile CPU code with AVX support"    ON)
+  option(COMPILE_AVX2   "Compile CPU code with AVX2 support"   ON)
+  option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON)
+
   if(BUILD_ARCH STREQUAL "native")
+    # @TODO: if we are building "-march=native" anyway is the whole shebang here even useful?
     message(STATUS "Checking support for CPU intrinsics")
     include(FindSSE)
-    if(SSE2_FOUND)
-      message(STATUS "SSE2 support found")
+    if(SSE2_FOUND AND COMPILE_SSE2)
+      message(STATUS "SSE2 support requested and found")
       set(INTRINSICS "${INTRINSICS} -msse2")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse2)
-    endif(SSE2_FOUND)
-    if(SSE3_FOUND)
-      message(STATUS "SSE3 support found")
+    endif(SSE2_FOUND AND COMPILE_SSE2)
+    if(SSE3_FOUND AND COMPILE_SSE3)
+      message(STATUS "SSE3 support requested and found")
       set(INTRINSICS "${INTRINSICS} -msse3")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse3)
-    endif(SSE3_FOUND)
-    if(SSE4_1_FOUND)
-      message(STATUS "SSE4.1 support found")
+    endif(SSE3_FOUND AND COMPILE_SSE3)
+    if(SSE4_1_FOUND AND COMPILE_SSE4_1)
+      message(STATUS "SSE4.1 support requested and found")
       set(INTRINSICS "${INTRINSICS} -msse4.1")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1)
-    endif(SSE4_1_FOUND)
-    if(SSE4_2_FOUND)
-      message(STATUS "SSE4.2 support found")
+    endif(SSE4_1_FOUND AND COMPILE_SSE4_1)
+    if(SSE4_2_FOUND AND COMPILE_SSE4_2)
+      message(STATUS "SSE4.2 support requested and found")
       set(INTRINSICS "${INTRINSICS} -msse4.2")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.2)
-    endif(SSE4_2_FOUND)
-    if(AVX_FOUND)
-      message(STATUS "AVX support found")
+    endif(SSE4_2_FOUND AND COMPILE_SSE4_2)
+    if(AVX_FOUND AND COMPILE_AVX)
+      message(STATUS "AVX support requested and found")
       set(INTRINSICS "${INTRINSICS} -mavx")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx)
-    endif(AVX_FOUND)
-    if(AVX2_FOUND)
-      message(STATUS "AVX2 support found")
+    endif(AVX_FOUND AND COMPILE_AVX)
+    if(AVX2_FOUND AND COMPILE_AVX2)
+      message(STATUS "AVX2 support requested and found")
       set(INTRINSICS "${INTRINSICS} -mavx2")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx2)
-    endif(AVX2_FOUND)
-    if(AVX512_FOUND)
-      message(STATUS "AVX512 support found")
+    endif(AVX2_FOUND AND COMPILE_AVX2)
+    if(AVX512_FOUND AND COMPILE_AVX512)
+      message(STATUS "AVX512 support requested and found")
       set(INTRINSICS "${INTRINSICS} -mavx512f")
       list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx512f)
-    endif(AVX512_FOUND)
+    endif(AVX512_FOUND AND COMPILE_AVX512)
   else()
-    set(INTRINSICS "-msse4.1")
+    # force building with the requested intrinsics; this requires compiler support
+    message(STATUS "Building for ${BUILD_ARCH} and forcing intrisics as requested")
+    if(COMPILE_SSE2)
+      message(STATUS "SSE2 support requested")
+      set(INTRINSICS "${INTRINSICS} -msse2")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse2)
+    endif(COMPILE_SSE2)
+    if(COMPILE_SSE3)
+      message(STATUS "SSE3 support requested")
+      set(INTRINSICS "${INTRINSICS} -msse3")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse3)
+    endif(COMPILE_SSE3)
+    if(COMPILE_SSE4_1)
+      message(STATUS "SSE4.1 support requested")
+      set(INTRINSICS "${INTRINSICS} -msse4.1")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1)
+    endif(COMPILE_SSE4_1)
+    if(COMPILE_SSE4_2)
+      message(STATUS "SSE4.2 support requested")
+      set(INTRINSICS "${INTRINSICS} -msse4.2")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.2)
+    endif(COMPILE_SSE4_2)
+    if(COMPILE_AVX)
+      message(STATUS "AVX support requested")
+      set(INTRINSICS "${INTRINSICS} -mavx")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx)
+    endif(COMPILE_AVX)
+    if(COMPILE_AVX2)
+      message(STATUS "AVX2 support requested")
+      set(INTRINSICS "${INTRINSICS} -mavx2")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx2)
+    endif(COMPILE_AVX2)
+    if(COMPILE_AVX512)
+      message(STATUS "AVX512 support requested")
+      set(INTRINSICS "${INTRINSICS} -mavx512f")
+      list(APPEND INTRINSICS_NVCC -Xcompiler\ -mavx512f)
+    endif(COMPILE_AVX512)
   endif()
 
   if(USE_FBGEMM)