diff --git a/.azure_pipeline.yml b/.azure_pipeline.yml index 7f51aa13..6f3e26ba 100644 --- a/.azure_pipeline.yml +++ b/.azure_pipeline.yml @@ -73,6 +73,31 @@ jobs: VERSION_PYTHON: '*' CC_OUTER_LOOP: 'gcc' CC_INNER_LOOP: 'clang-8' + # Linux environment with numpy linked to BLIS + pylatest_blis_gcc_clang: + PACKAGER: 'conda' + VERSION_PYTHON: '*' + INSTALL_BLIS: 'true' + BLIS_NUM_THREADS: '4' + CC_OUTER_LOOP: 'gcc' + CC_INNER_LOOP: 'gcc' + BLIS_CC: 'clang-8' + pylatest_blis_clang_gcc: + PACKAGER: 'conda' + VERSION_PYTHON: '*' + INSTALL_BLIS: 'true' + BLIS_NUM_THREADS: '4' + CC_OUTER_LOOP: 'clang-8' + CC_INNER_LOOP: 'clang-8' + BLIS_CC: 'gcc' + pylatest_blis_single_threaded: + PACKAGER: 'conda' + VERSION_PYTHON: '*' + INSTALL_BLIS: 'true' + BLIS_NUM_THREADS: '1' + CC_OUTER_LOOP: 'gcc' + CC_INNER_LOOP: 'gcc' + BLIS_CC: 'gcc' - template: continuous_integration/posix.yml parameters: diff --git a/continuous_integration/install_with_blis.sh b/continuous_integration/install_with_blis.sh new file mode 100755 index 00000000..010c3b64 --- /dev/null +++ b/continuous_integration/install_with_blis.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +set -e + +pushd .. +ABS_PATH=$(pwd) +popd + +# Assume Ubuntu: install a recent version of clang and libomp +echo "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-8 main" | sudo tee -a /etc/apt/sources.list.d/llvm.list +echo "deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-8 main" | sudo tee -a /etc/apt/sources.list.d/llvm.list +wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - +sudo apt update +sudo apt install -y clang-8 libomp-8-dev + +# create conda env +conda create -n $VIRTUALENV -q --yes python=$VERSION_PYTHON pip cython +source activate $VIRTUALENV + +pushd .. 
+ +# build & install blis +mkdir BLIS_install +git clone https://github.com/flame/blis.git +pushd blis +./configure --prefix=$ABS_PATH/BLIS_install --enable-cblas --enable-threading=openmp CC=$BLIS_CC auto +make -j4 +make install +popd + +# build & install numpy +git clone https://github.com/numpy/numpy.git +pushd numpy +echo "[blis] +libraries = blis +library_dirs = $ABS_PATH/BLIS_install/lib +include_dirs = $ABS_PATH/BLIS_install/include/blis +runtime_library_dirs = $ABS_PATH/BLIS_install/lib" > site.cfg +python setup.py build_ext -i +pip install -e . +popd + +popd + +python -m pip install -q -r dev-requirements.txt +CFLAGS=-I$ABS_PATH/BLIS_install/include/blis LDFLAGS=-L$ABS_PATH/BLIS_install/lib \ + bash ./continuous_integration/build_test_ext.sh + +python --version +python -c "import numpy; print('numpy %s' % numpy.__version__)" || echo "no numpy" +python -c "import scipy; print('scipy %s' % scipy.__version__)" || echo "no scipy" + +python -m flit install --symlink diff --git a/continuous_integration/posix.yml b/continuous_integration/posix.yml index 54edf669..fff39a3d 100644 --- a/continuous_integration/posix.yml +++ b/continuous_integration/posix.yml @@ -22,7 +22,12 @@ jobs: condition: eq('${{ parameters.name }}', 'macOS') - script: | continuous_integration/install.sh - displayName: 'Install' + displayName: 'Install without BLIS' + condition: ne(variables['INSTALL_BLIS'], 'true') + - script: | + continuous_integration/install_with_blis.sh + displayName: 'Install with BLIS' + condition: eq(variables['INSTALL_BLIS'], 'true') - script: | continuous_integration/test_script.sh displayName: 'Test Library' diff --git a/tests/_openmp_test_helper/nested_prange_blas.pyx b/tests/_openmp_test_helper/nested_prange_blas.pyx index 79a19f12..a6615fe5 100644 --- a/tests/_openmp_test_helper/nested_prange_blas.pyx +++ b/tests/_openmp_test_helper/nested_prange_blas.pyx @@ -1,8 +1,24 @@ cimport openmp -from cython.parallel import prange +from cython.parallel import parallel, 
prange import numpy as np -from scipy.linalg.cython_blas cimport dgemm + +IF USE_BLIS: + cdef extern from 'cblas.h' nogil: + ctypedef enum CBLAS_ORDER: + CblasRowMajor=101 + CblasColMajor=102 + ctypedef enum CBLAS_TRANSPOSE: + CblasNoTrans=111 + CblasTrans=112 + CblasConjTrans=113 + void dgemm 'cblas_dgemm' ( + CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, int M, int N, + int K, double alpha, double *A, int lda, + double *B, int ldb, double beta, double *C, int ldc) +ELSE: + from scipy.linalg.cython_blas cimport dgemm from threadpoolctl import threadpool_info @@ -25,18 +41,25 @@ def check_nested_prange_blas(double[:, ::1] A, double[:, ::1] B, int nthreads): int i int prange_num_threads + int *prange_num_threads_ptr = &prange_num_threads - threadpool_infos = None + threadpool_infos = [None] - for i in prange(n_chunks, num_threads=nthreads, nogil=True): - dgemm(trans, no_trans, &n, &chunk_size, &k, - &alpha, &B[0, 0], &k, &A[i * chunk_size, 0], &k, - &beta, &C[i * chunk_size, 0], &n) + with nogil, parallel(num_threads=nthreads): + if openmp.omp_get_thread_num() == 0: + with gil: + threadpool_infos[0] = threadpool_info() - prange_num_threads = openmp.omp_get_num_threads() + prange_num_threads_ptr[0] = openmp.omp_get_num_threads() - if i == 0: - with gil: - threadpool_infos = threadpool_info() + for i in prange(n_chunks): + IF USE_BLIS: + dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + chunk_size, n, k, alpha, &A[i * chunk_size, 0], k, + &B[0, 0], k, beta, &C[i * chunk_size, 0], n) + ELSE: + dgemm(trans, no_trans, &n, &chunk_size, &k, + &alpha, &B[0, 0], &k, &A[i * chunk_size, 0], &k, + &beta, &C[i * chunk_size, 0], &n) - return np.asarray(C), prange_num_threads, threadpool_infos + return np.asarray(C), prange_num_threads, threadpool_infos[0] diff --git a/tests/_openmp_test_helper/setup_nested_prange_blas.py b/tests/_openmp_test_helper/setup_nested_prange_blas.py index 746e573d..54da058f 100644 --- 
a/tests/_openmp_test_helper/setup_nested_prange_blas.py +++ b/tests/_openmp_test_helper/setup_nested_prange_blas.py @@ -12,12 +12,16 @@ set_cc_variables("CC_OUTER_LOOP") openmp_flag = get_openmp_flag() + use_blis = os.getenv('INSTALL_BLIS', 'false').lower() == 'true' + libraries = ['blis'] if use_blis else [] + ext_modules = [ Extension( "nested_prange_blas", ["nested_prange_blas.pyx"], extra_compile_args=openmp_flag, - extra_link_args=openmp_flag + extra_link_args=openmp_flag, + libraries=libraries ) ] @@ -25,6 +29,7 @@ name='_openmp_test_helper_nested_prange_blas', ext_modules=cythonize( ext_modules, + compile_time_env={'USE_BLIS': use_blis}, compiler_directives={'language_level': 3, 'boundscheck': False, 'wraparound': False}) diff --git a/tests/test_threadpoolctl.py b/tests/test_threadpoolctl.py index 5713be76..636e9acd 100644 --- a/tests/test_threadpoolctl.py +++ b/tests/test_threadpoolctl.py @@ -245,11 +245,17 @@ def test_nested_prange_blas(nthreads_outer): blas_info = [module for module in threadpool_info() if module["user_api"] == "blas"] - for module in threadpool_info(): - if is_old_openblas(module): - # OpenBLAS 0.3.3 and older are known to cause an unrecoverable - # deadlock at process shutdown time (after pytest has exited). - pytest.skip("Old OpenBLAS: skipping test to avoid deadlock") + + blis_linked = any(module['internal_api'] == 'blis' + for module in threadpool_info()) + if not blis_linked: + # numpy can be linked to BLIS for CBLAS and OpenBLAS for LAPACK. In that + # case this test will run BLIS gemm so no need to skip. + for module in threadpool_info(): + if is_old_openblas(module): + # OpenBLAS 0.3.3 and older are known to cause an unrecoverable + # deadlock at process shutdown time (after pytest has exited). 
+ pytest.skip("Old OpenBLAS: skipping test to avoid deadlock") from ._openmp_test_helper import check_nested_prange_blas A = np.ones((1000, 10)) diff --git a/threadpoolctl.py b/threadpoolctl.py index 54bd616d..347c92df 100644 --- a/threadpoolctl.py +++ b/threadpoolctl.py @@ -74,6 +74,11 @@ class _dl_phdr_info(ctypes.Structure): "internal_api": "mkl", "filename_prefixes": ("libmkl_rt", "mkl_rt",), }, + { + "user_api": "blas", + "internal_api": "blis", + "filename_prefixes": ("libblis",), + }, ] # map a internal_api (openmp, openblas, mkl) to set and get functions @@ -88,6 +93,9 @@ class _dl_phdr_info(ctypes.Structure): "mkl": { "set_num_threads": "MKL_Set_Num_Threads", "get_num_threads": "MKL_Get_Max_Threads"}, + "blis": { + "set_num_threads": "bli_thread_set_num_threads", + "get_num_threads": "bli_thread_get_num_threads"} } # Helpers for the doc and test names @@ -110,9 +118,8 @@ def decorator(o): def _get_limit(prefix, user_api, limits): if prefix in limits: return limits[prefix] - if user_api in limits: + else: return limits[user_api] - return None @_format_docstring(ALL_PREFIXES=_ALL_PREFIXES, @@ -210,6 +217,10 @@ def threadpool_info(): modules = _load_modules(user_api=_ALL_USER_APIS) for module in modules: module['num_threads'] = module['get_num_threads']() + # by default BLIS is single-threaded and get_num_threads returns -1. + # we map it to 1 for consistency with other libraries. 
+ if module['num_threads'] == -1 and module['internal_api'] == 'blis': + module['num_threads'] = 1 # Remove the wrapper for the module and its function del module['set_num_threads'], module['get_num_threads'] del module['dynlib'] @@ -227,6 +238,8 @@ def _get_version(dynlib, internal_api): return None elif internal_api == "openblas": return _get_openblas_version(dynlib) + elif internal_api == "blis": + return _get_blis_version(dynlib) else: raise NotImplementedError("Unsupported API {}".format(internal_api)) @@ -257,6 +270,13 @@ def _get_openblas_version(openblas_dynlib): return None +def _get_blis_version(blis_dynlib): + """Return the BLIS version""" + get_version = getattr(blis_dynlib, "bli_info_get_version_str") + get_version.restype = ctypes.c_char_p + return get_version().decode('utf-8') + + # Loading utilities for dynamically linked shared objects def _load_modules(prefixes=None, user_api=None):