diff --git a/.gitmodules b/.gitmodules index 99e901f0..44e5b4ac 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "src/pfaffine"] - path = src/pfaffine - url = https://github.com/xrq-phys/Pfaffine - branch = blis [submodule "src/blis"] path = src/blis url = https://github.com/xrq-phys/blis diff --git a/CMakeLists.txt b/CMakeLists.txt index 70851aa8..3257d79d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,9 @@ -cmake_minimum_required(VERSION 2.8.0 FATAL_ERROR) +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) project(mVMC NONE) option(USE_SCALAPACK "Use Scalapack" OFF) option(PFAFFIAN_BLOCKED "Use blocked-update Pfaffian to speed up." OFF) +option(USE_GEMMT "Use GEMMT. Recommended regardless blocked-Pfaffian-update." ON) add_definitions(-D_mVMC) if(CONFIG) @@ -17,11 +18,10 @@ message(STATUS "Build type: " ${CMAKE_BUILD_TYPE}) option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(GIT_SUBMODULE_UPDATE "execute `git submodule update` automatically" ON) -enable_language(C Fortran) -if(PFAFFIAN_BLOCKED) - enable_language(CXX) - set(CMAKE_CXX_STANDARD 11) -endif(PFAFFIAN_BLOCKED) + +# First, enables C language only. +# External packages only use their C API. +enable_language(C) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") set(CMAKE_SKIP_BUILD_RPATH FALSE) @@ -29,6 +29,7 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) set(CMAKE_MACOSX_RPATH 1) +# TODO: Is this really needed? if("${CMAKE_BUILD_TYPE}" MATCHES "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") else() @@ -37,16 +38,19 @@ else() endif() if(CMAKE_C_COMPILER_ID STREQUAL "Intel") + # TODO: Really needs separation? if("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "15.0.0.20140528") set(OMP_FLAG_Intel "-openmp") else() set(OMP_FLAG_Intel "-qopenmp") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OMP_FLAG_Intel}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OMP_FLAG_Intel}") else() find_package(OpenMP) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") endif(OPENMP_FOUND) endif() @@ -57,6 +61,9 @@ if(MPI_C_FOUND) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}") endif(MPI_C_FOUND) +# BLAS/LAPACK/Pfapack77 needs Fortran interface. +enable_language(Fortran) + find_package(LAPACK) if(USE_SCALAPACK MATCHES OFF) if(LAPACK_FOUND) @@ -64,6 +71,34 @@ if(USE_SCALAPACK MATCHES OFF) endif(LAPACK_FOUND) endif() +if(PFAFFIAN_BLOCKED + OR USE_GEMMT) + set(Require_BLIS ON) +endif() + +if(BLA_VENDOR MATCHES "Intel10*" + OR BLAS_LIBRARIES MATCHES "/*mkl*") + # Don't require BLIS when MKL is used. + add_definitions(-DMKL) + add_definitions(-DBLAS_EXTERNAL) + add_definitions(-DF77_COMPLEX_RET_INTEL) + set(Require_BLIS OFF) +elseif(BLA_VENDOR MATCHES "FLA*" + OR BLAS_LIBRARIES MATCHES "/*blis*") + # Skip extra BLIS if it's already the BLAS vendor. + list(GET BLAS_LIBRARIES 0 BLIS_FIRST_LIB) + get_filename_component(BLIS_LIB_DIR ${BLIS_FIRST_LIB} DIRECTORY) + include_directories(${BLIS_LIB_DIR}/../include) + include_directories(${BLIS_LIB_DIR}/../include/blis) + set(Require_BLIS OFF) +else() + # BLAS vendor preference: + # External > BLIS > Reference + if(DEFINED BLA_VENDOR) + add_definitions(-DBLAS_EXTERNAL) + endif() +endif() + # Build and enable tests # testing setup # enable_testing() must be called in the top-level CMakeLists.txt before any add_subdirectory() is called. @@ -97,12 +132,23 @@ if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") endif() add_subdirectory("${STDFACE_DIR}") +# C++ support for pfupdates & ltl2inv. +enable_language(CXX) +set(CMAKE_CXX_STANDARD 11) + +if(Require_BLIS) + include("download_blis_artifact.cmake") +else(Require_BLIS) + # Use bundled blis.h + include_directories(src/common/deps) +endif(Require_BLIS) + if(PFAFFIAN_BLOCKED) add_definitions(-D_pf_block_update) - include("download_blis_artifact.cmake") - # Must set BLIS artifact BEFORE adding pfupdates target. add_subdirectory(src/pfupdates) - add_dependencies(pfupdates blis_include) + if(Require_BLIS) + add_dependencies(pfupdates blis_include) + endif(Require_BLIS) endif(PFAFFIAN_BLOCKED) if (Document) @@ -111,5 +157,9 @@ endif(Document) add_subdirectory(src/ComplexUHF) add_subdirectory(src/pfapack/fortran) +add_subdirectory(src/ltl2inv) + if(Require_BLIS) + add_dependencies(ltl2inv blis_include) + endif(Require_BLIS) add_subdirectory(src/mVMC) add_subdirectory(tool) diff --git a/config/aocc.cmake b/config/aocc.cmake index a53a783a..258d9772 100644 --- a/config/aocc.cmake +++ b/config/aocc.cmake @@ -9,6 +9,9 @@ set(CMAKE_C_FLAGS_RELEASE "-Wno-unknown-pragmas -O3 -DNDEBUG -DHAVE_SSE2" CACHE set(CMAKE_Fortran_COMPILER "flang" CACHE STRING "" FORCE) set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -DHAVE_SSE2" CACHE STRING "" FORCE) +# OpenMP, libatomic +set(CMAKE_EXE_LINKER_FLAGS "-fopenmp -latomic") + # for AOCL set(BLA_VENDOR "FLAME" CACHE STRING "" FORCE) diff --git a/config/apple.cmake b/config/apple.cmake index 9b197129..16ff1130 100644 --- a/config/apple.cmake +++ b/config/apple.cmake @@ -6,6 +6,7 @@ set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE) set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall -Wformat -Werror=format-security") set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses") set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE) +add_definitions(-DF77_COMPLEX_RET_INTEL) # OpenMP with libomp set(CMAKE_EXE_LINKER_FLAGS "-L/usr/local/lib -lomp") diff --git a/mVMCconfig.sh b/mVMCconfig.sh index f731ccc7..0b7e1d62 100755 --- a/mVMCconfig.sh +++ b/mVMCconfig.sh @@ -40,7 +40,7 @@ F90 = frt FFLAGS = -DNDEBUG -DFUJITSU -Kfast CFLAGS = -DNDEBUG -Ofast -fopenmp CXXFLAGS = -DNDEBUG -Ofast -fopenmp -CFLAGS += -D_lapack -D_pf_block_update # -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += -DBLAS_EXTERNAL BLIS_ROOT = ${PWD}/src/blis-install @@ -74,7 +74,7 @@ F90 = ifort FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL BLIS_ROOT = ${PWD}/src/blis-install @@ -100,7 +100,7 @@ F90 = ifort FFLAGS = -O3 -implicitnone CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL BLIS_ROOT = ${PWD}/src/blis-install @@ -124,7 +124,7 @@ F90 = flang FFLAGS = -O3 CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += BLIS_ROOT = ${PWD}/src/blis-install @@ -148,7 +148,7 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -fPIC -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += BLIS_ROOT = ${PWD}/src/blis-install @@ -172,7 +172,7 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL BLIS_ROOT = ${PWD}/src/blis-install @@ -196,7 +196,7 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += BLIS_ROOT = ${PWD}/src/blis-install @@ -220,7 +220,7 @@ F90 = gfortran FFLAGS = -O3 -fimplicit-none CFLAGS = -O3 -fopenmp CXXFLAGS = -O3 -CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine +CFLAGS += -D_lapack -D_pf_block_update CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple. BLIS_ROOT = ${PWD}/src/blis-install @@ -242,7 +242,6 @@ EOF cat src/make.sys cat src/make-ext.sys ln -s $PWD/src/make.sys src/StdFace/make.sys; - ln -s $PWD/src/make.sys src/pfaffine/make.inc; ln -s $PWD/src/make.sys src/pfapack/fortran/make.inc; cat > makefile < +#include +typedef std::complex ccscmplx; +typedef std::complex ccdcmplx; +// BLIS definitions. +#include "blis.h" +// BLAS definitions +#include "blalink_fort.h" + +// error info +typedef enum { + Pfaffine_SIGN_ERR = 1, + Pfaffine_OUT_OF_BOUND, + Pfaffine_BAD_SCRATCHPAD, + Pfaffine_BAD_REPRESENTATION, + Pfaffine_INTEGER_OVERFLOW, + Pfaffine_DOUBLE_NAN_DETECTED, + Pfaffine_NOT_IMPLEMNTED, + Pfaffine_NUM_ERROR_TYPE +} skpfa_error_t; +inline signed err_info(signed type, signed pos) +{ return type * Pfaffine_NUM_ERROR_TYPE + pos; } + +// gemm +template +inline void gemm(trans_t transa, trans_t transb, + dim_t m, dim_t n, dim_t k, + T alpha, + T *a, inc_t lda, + T *b, inc_t ldb, + T beta, + T *c, inc_t ldc); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemm(trans_t transa, trans_t transb, \ + dim_t m, dim_t n, dim_t k, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb, \ + cctype beta, \ + cctype *c, inc_t ldc) \ + { \ + bli_##cchar##gemm(transa, transb, \ + m, n, k, \ + (ctype *)&alpha, \ + (ctype *)a, 1, lda, \ + (ctype *)b, 1, ldb, \ + (ctype *)&beta, \ + (ctype *)c, 1, ldc); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemm(trans_t transa, trans_t transb, \ + dim_t m, dim_t n, dim_t k, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb, \ + cctype beta, \ + cctype *c, inc_t ldc) \ + { \ + char ta = trans2char(transa), \ + tb = trans2char(transb); \ + cchar##gemm_(&ta, &tb, &m, &n, &k, \ + (ctype *)&alpha, \ + (ctype *)a, &lda, \ + (ctype *)b, &ldb, \ + (ctype *)&beta, \ + (ctype *)c, &ldc); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + + +// ger +template +inline void ger(dim_t m, dim_t n, + T alpha, + T *x, inc_t incx, + T *y, inc_t incy, + T *a, inc_t lda); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ + template <> inline void ger(dim_t m, dim_t n, \ + cctype alpha, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy, \ + cctype *a, inc_t lda) \ + { \ + bli_##cchar##ger(BLIS_NO_CONJUGATE, \ + BLIS_NO_CONJUGATE, \ + m, n, \ + (ctype *)&alpha, \ + (ctype *)x, incx, \ + (ctype *)y, incy, \ + (ctype *)a, 1, lda); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ + template <> inline void ger(dim_t m, dim_t n, \ + cctype alpha, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy, \ + cctype *a, inc_t lda) \ + { \ + cchar##cfunc##_(&m, &n, \ + (ctype *)&alpha, \ + (ctype *)x, &incx, \ + (ctype *)y, &incy, \ + (ctype *)a, &lda); \ + } +#endif +BLALINK_MAC( float, float, s, ger ) +BLALINK_MAC( double, double, d, ger ) +BLALINK_MAC( ccscmplx, scomplex, c, geru ) +BLALINK_MAC( ccdcmplx, dcomplex, z, geru ) +#undef BLALINK_MAC + + +// gemv +template +inline void gemv(trans_t trans, + dim_t m, dim_t n, + T alpha, + T *a, inc_t lda, + T *x, inc_t incx, + T beta, + T *y, inc_t incy); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemv(trans_t trans, \ + dim_t m, dim_t n, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *x, inc_t incx, \ + cctype beta, \ + cctype *y, inc_t incy) \ + { \ + bli_##cchar##gemv(trans, \ + BLIS_NO_CONJUGATE, \ + m, n, \ + (ctype *)&alpha, \ + (ctype *)a, 1, lda, \ + (ctype *)x, incx, \ + (ctype *)&beta, \ + (ctype *)y, incy); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemv(trans_t trans, \ + dim_t m, dim_t n, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *x, inc_t incx, \ + cctype beta, \ + cctype *y, inc_t incy) \ + { \ + char t = trans2char(trans); \ + cchar##gemv_(&t, &m, &n, \ + (ctype *)&alpha, \ + (ctype *)a, &lda, \ + (ctype *)x, &incx, \ + (ctype *)&beta, \ + (ctype *)y, &incy); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + + +// trmm +template +inline void trmm(side_t sidea, + uplo_t uploa, + trans_t transa, + diag_t diaga, + dim_t m, dim_t n, + T alpha, + T *a, inc_t lda, + T *b, inc_t ldb); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void trmm(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \ + dim_t m, dim_t n, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb) \ + { \ + bli_##cchar##trmm(sidea, uploa, transa, diaga, \ + m, n, \ + (ctype *)&alpha, \ + (ctype *)a, 1, lda, \ + (ctype *)b, 1, ldb); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void trmm(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \ + dim_t m, dim_t n, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb) \ + { \ + char ul = uplo2char(uploa), \ + si = side2char(sidea), \ + tr = trans2char(transa), \ + dg = diag2char(diaga); \ + cchar##trmm_(&si, &ul, &tr, &dg, \ + &m, &n, \ + (ctype *)&alpha, \ + (ctype *)a, &lda, \ + (ctype *)b, &ldb); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + + +// trmv +template +inline void trmv(uplo_t uploa, + trans_t transa, + dim_t m, + T alpha, + T *a, inc_t lda, + T *x, inc_t incx); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void trmv(uplo_t uploa, trans_t transa, dim_t m, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *x, inc_t incx) \ + { \ + bli_##cchar##trmv(uploa, transa, \ + BLIS_NONUNIT_DIAG, m, \ + (ctype *)&alpha, \ + (ctype *)a, 1, lda, \ + (ctype *)x, incx); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void trmv(uplo_t uploa, trans_t transa, dim_t m, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *x, inc_t incx) \ + { \ + char ul = uplo2char(uploa), \ + tr = trans2char(transa), \ + dg = 'N'; \ + /* + * TRMV has alpha in its templated interface, which is absent in BLAS. + */ \ + assert(alpha == cctype(1.0)); \ + cchar##trmv_(&ul, &tr, &dg, &m, \ + (ctype *)a, &lda, \ + (ctype *)x, &incx); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + + +// swap +template +inline void swap(dim_t n, + T *x, inc_t incx, + T *y, inc_t incy); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void swap(dim_t n, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy) \ + { \ + bli_##cchar##swapv(n, \ + (ctype *)x, incx, \ + (ctype *)y, incy); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void swap(dim_t n, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy) \ + { \ + cchar##swap_(&n, \ + (ctype *)x, &incx, \ + (ctype *)y, &incy); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + + +// axpy +template +inline void axpy(dim_t n, + T alpha, + T *x, inc_t incx, + T *y, inc_t incy); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void axpy(dim_t n, \ + cctype alpha, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy) \ + { \ + bli_##cchar##axpyv(BLIS_NO_CONJUGATE, n, \ + (ctype *)&alpha, \ + (ctype *)x, incx, \ + (ctype *)y, incy); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void axpy(dim_t n, \ + cctype alpha, \ + cctype *x, inc_t incx, \ + cctype *y, inc_t incy) \ + { \ + cchar##axpy_(&n, \ + (ctype *)&alpha, \ + (ctype *)x, &incx, \ + (ctype *)y, &incy); \ + } +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + +// dot +template +inline T dot(dim_t n, + T *sx, inc_t incx, + T *sy, inc_t incy); +#ifndef BLAS_EXTERNAL +#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ + template <> inline cctype dot(dim_t n, \ + cctype *sx, inc_t incx, \ + cctype *sy, inc_t incy) \ + { \ + cctype rho; \ + bli_##cchar##dotv(BLIS_NO_CONJUGATE, \ + BLIS_NO_CONJUGATE, \ + n, \ + (ctype *)sx, incx, \ + (ctype *)sy, incy, \ + (ctype *)&rho); \ + return rho; \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ + template <> inline cctype dot(dim_t n, \ + cctype *sx, inc_t incx, \ + cctype *sy, inc_t incy) \ + { \ + ctype rho = cchar##cfunc##_(&n, \ + (ctype *)sx, &incx, \ + (ctype *)sy, &incy); \ + return *((cctype *)&rho); \ + } +#endif +BLALINK_MAC( float, float, s, dot ) +BLALINK_MAC( double, double, d, dot ) +#if defined(BLAS_EXTERNAL) && defined(F77_COMPLEX_RET_INTEL) +// Intel style complex return. +#undef BLALINK_MAC +#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \ + template <> inline cctype dot(dim_t n, \ + cctype *sx, inc_t incx, \ + cctype *sy, inc_t incy) \ + { \ + cctype rho; \ + cchar##cfunc##_((ctype *)&rho, &n, \ + (ctype *)sx, &incx, \ + (ctype *)sy, &incy); \ + return rho; \ + } +#endif +BLALINK_MAC( ccscmplx, scomplex, c, dotu ) +BLALINK_MAC( ccdcmplx, dcomplex, z, dotu ) +#undef BLALINK_MAC + + + +// [LAPACK] lacpy +template +inline void lacpy(uplo_t uploa, + dim_t m, dim_t n, + T *a, inc_t lda, + T *b, inc_t ldb); +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void lacpy(uplo_t uploa, \ + dim_t m, dim_t n, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb) \ + { \ + char ul = uplo2char(uploa); \ + cchar##lacpy_(&ul, &m, &n, a, &lda, b, &ldb); \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + +// [LAPACK] trtri +template +inline int trtri(uplo_t uploa, + diag_t diaga, + dim_t n, + T *a, inc_t lda); +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline int trtri(uplo_t uploa, \ + diag_t diaga, \ + dim_t n, \ + cctype *a, inc_t lda) \ + { \ + char ul = uplo2char(uploa); \ + char dg = diag2char(diaga); \ + int info; \ + cchar##trtri_(&ul, &dg, &n, a, &lda, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + +// [PFAPACK] sktrf +template +inline int sktrf(uplo_t uploa, + dim_t n, + T *a, inc_t lda, + int *ipiv, + T *work, dim_t lwork); +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline int sktrf(uplo_t uploa, \ + dim_t n, \ + cctype *a, inc_t lda, \ + int *ipiv, \ + cctype *work, dim_t lwork) \ + { \ + char ul = uplo2char(uploa); \ + char mode = 'N'; \ + int info; \ + cchar##sktrf_(&ul, &mode, &n, a, &lda, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + +// [PFAPACK] skpfa +template +inline int skpfa(uplo_t uploa, + dim_t n, + T *a, inc_t lda, + T *pfa, int *ipiv, + T *work, dim_t lwork); +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline int skpfa(uplo_t uploa, \ + dim_t n, \ + cctype *a, inc_t lda, \ + cctype *pfa, int *ipiv, \ + cctype *work, dim_t lwork) \ + { \ + char ul = uplo2char(uploa); \ + char mthd = 'P'; \ + int info; \ + cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, &info); \ + return info; \ + } +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +#undef BLALINK_MAC +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline int skpfa(uplo_t uploa, \ + dim_t n, \ + cctype *a, inc_t lda, \ + cctype *pfa, int *ipiv, \ + cctype *work, dim_t lwork) \ + { \ + char ul = uplo2char(uploa); \ + char mthd = 'P'; \ + int info; \ + cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, 0, &info); \ + return info; \ + } +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + diff --git a/src/common/blalink_fort.h b/src/common/blalink_fort.h new file mode 100644 index 00000000..eb0b3a39 --- /dev/null +++ b/src/common/blalink_fort.h @@ -0,0 +1,184 @@ +/** + * \file blalink_fort.hh + * Wrapper for external BLAS. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include +#include "blis.h" + + +// converter from BLIS enum to BLAS char. +BLIS_INLINE char trans2char(trans_t t) +{ + switch (t) { + case BLIS_NO_TRANSPOSE: + return 'N'; + case BLIS_TRANSPOSE: + return 'T'; + case BLIS_CONJ_NO_TRANSPOSE: + return 'C'; + default: + abort(); + } +} +BLIS_INLINE char uplo2char(uplo_t t) +{ + switch (t) { + case BLIS_UPPER: + return 'U'; + case BLIS_LOWER: + return 'L'; + default: + abort(); + } +} +BLIS_INLINE char side2char(side_t t) +{ + switch (t) { + case BLIS_LEFT: + return 'L'; + case BLIS_RIGHT: + return 'R'; + default: + abort(); + } +} +BLIS_INLINE char diag2char(diag_t t) +{ + switch (t) { + case BLIS_UNIT_DIAG: + return 'U'; + case BLIS_NONUNIT_DIAG: + return 'N'; + } +} + + +#ifdef __cplusplus +extern "C" +{ +#endif + +// BLAS { +#ifdef BLAS_EXTERNAL + +// gemm +void sgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, float *alpha, + float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc); +void dgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, double *alpha, + double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc); +void cgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha, + void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); +void zgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha, + void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); + + +// ger +void sger_(dim_t *m, dim_t *n, float *alpha, + float *x, dim_t *incx, float *y, dim_t *incy, float *a, dim_t *lda); +void dger_(dim_t *m, dim_t *n, double *alpha, + double *x, dim_t *incx, double *y, dim_t *incy, double *a, dim_t *lda); +void cgeru_(dim_t *m, dim_t *n, void *alpha, + void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda); +void zgeru_(dim_t *m, dim_t *n, void *alpha, + void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda); + + +// gemv +void sgemv_(char *trans, dim_t *m, dim_t *n, float *alpha, float *a, dim_t *lda, + float *x, dim_t *incx, float *beta, float *y, dim_t *incy); +void dgemv_(char *trans, dim_t *m, dim_t *n, double *alpha, double *a, dim_t *lda, + double *x, dim_t *incx, double *beta, double *y, dim_t *incy); +void cgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda, + void *x, dim_t *incx, void *beta, void *y, dim_t *incy); +void zgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda, + void *x, dim_t *incx, void *beta, void *y, dim_t *incy); + + +// trmm +void strmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, + float *alpha, float *a, inc_t *lda, float *b, inc_t *ldb); +void dtrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, + double *alpha, double *a, inc_t *lda, double *b, inc_t *ldb); +void ctrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, + void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb); +void ztrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n, + void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb); + + +// trmv +void strmv_(char *uplo, char *transa, char *diag, dim_t *n, + float *a, inc_t *lda, float *b, inc_t *ldb); +void dtrmv_(char *uplo, char *transa, char *diag, dim_t *n, + double *a, inc_t *lda, double *b, inc_t *ldb); +void ctrmv_(char *uplo, char *transa, char *diag, dim_t *n, + void *a, inc_t *lda, void *b, inc_t *ldb); +void ztrmv_(char *uplo, char *transa, char *diag, dim_t *n, + void *a, inc_t *lda, void *b, inc_t *ldb); + +// swap +void sswap_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy); +void dswap_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy); +void cswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +void zswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); + +// axpy +void saxpy_(dim_t *n, float *alpha, float *sx, dim_t *incx, float *sy, dim_t *incy); +void daxpy_(dim_t *n, double *alpha, double *sx, dim_t *incx, double *sy, dim_t *incy); +void caxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy); +void zaxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy); + +// dot +float sdot_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy); +double ddot_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy); +scomplex cdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +dcomplex zdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +#ifndef F77_COMPLEX_RET_INTEL +scomplex cdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +dcomplex zdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +#else +void cdotu_(scomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +void zdotu_(dcomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy); +#endif + +// } +#endif + +// LAPACK { + +// lacpy +void slacpy_(char *uplo, dim_t *m, dim_t *n, float *a, inc_t *lda, float *b, inc_t *ldb); +void dlacpy_(char *uplo, dim_t *m, dim_t *n, double *a, inc_t *lda, double *b, inc_t *ldb); +void clacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb); +void zlacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb); + +// trtri +void strtri_(char *uplo, char *diag, dim_t *n, float *a, inc_t *lda, int *info); +void dtrtri_(char *uplo, char *diag, dim_t *n, double *a, inc_t *lda, int *info); +void ctrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info); +void ztrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info); + +// } + +// PFAPACK77 { + +void ssktrf_(char *uplo, char *mode, dim_t *n, float *a, dim_t *lda, int *ipiv, float *work, dim_t *lwork, int *info); +void dsktrf_(char *uplo, char *mode, dim_t *n, double *a, dim_t *lda, int *ipiv, double *work, dim_t *lwork, int *info); +void csktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info); +void zsktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info); + +void sskpfa_(char *uplo, char *mthd, dim_t *n, float *a, dim_t *lda, float *pfa, int *ipiv, float *work, dim_t *lwork, int *info); +void dskpfa_(char *uplo, char *mthd, dim_t *n, double *a, dim_t *lda, double *pfa, int *ipiv, double *work, dim_t *lwork, int *info); +void cskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info); +void zskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info); + +// } + +#ifdef __cplusplus +} +#endif + diff --git a/src/common/colmaj.hh b/src/common/colmaj.hh new file mode 100644 index 00000000..d9a03c1e --- /dev/null +++ b/src/common/colmaj.hh @@ -0,0 +1,26 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once + +template +struct colmaj +{ + T *dat; + int ld; + + colmaj() = delete; + colmaj(T *dat_, int ld_) + : dat(dat_), ld(ld_) { } + + T &operator()(int i, int j) + { + return dat[ i + j * ld ]; + } + T &operator()() + { + return dat[ 0 ]; + } +}; diff --git a/src/common/deps/blis.h b/src/common/deps/blis.h new file mode 100644 index 00000000..0627afe2 --- /dev/null +++ b/src/common/deps/blis.h @@ -0,0 +1,552 @@ +/* + * blis.h + This header is cut out from BLIS v0.8.0, + containing necessary type definitions & macros. + User can use this file to avert BLIS dependency + if their BLAS library vendors a GEMMT implementation. + Here begins 3-clause BSD-style license of BLIS: + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2020, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_H +#define BLIS_H + + +// Allow C++ users to include this header file in their source code. However, +// we make the extern "C" conditional on whether we're using a C++ compiler, +// since regular C compilers don't understand the extern "C" construct. +#ifdef __cplusplus +extern "C" { +#endif + +// NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS +// YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. + +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped +#include // skipped + +// -- STATIC INLINE FUNCTIONS -------------------------------------------------- + +// C and C++ have different semantics for defining "inline" functions. In C, +// the keyword phrase "static inline" accomplishes this, though the "inline" +// is optional. In C++, the "inline" keyword is required and obviates "static" +// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may +// be #included by a source file that is compiled with C++. +#ifdef __cplusplus + #define BLIS_INLINE inline +#else + //#define BLIS_INLINE static inline + #define BLIS_INLINE static +#endif + +// -- Common BLIS definitions -- + +// begin bli_type_defs.h + + +#ifndef BLIS_TYPE_DEFS_H +#define BLIS_TYPE_DEFS_H + + +// +// -- BLIS basic types --------------------------------------------------------- +// + +#ifdef __cplusplus + // For C++, include stdint.h. +#include // skipped +#elif __STDC_VERSION__ >= 199901L + // For C99 (or later), include stdint.h. +#include // skipped +#include // skipped +#else + // When stdint.h is not available, manually typedef the types we will use. + #ifdef _WIN32 + typedef __int32 int32_t; + typedef unsigned __int32 uint32_t; + typedef __int64 int64_t; + typedef unsigned __int64 uint64_t; + #else + #error "Attempting to compile on pre-C99 system without stdint.h." + #endif +#endif + +// -- General-purpose integers -- + +// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. +// NOTE: This cpp guard will only meaningfully change BLIS's behavior on +// systems where the BLIS integer size would have been automatically selected +// to be 32 bits, since explicit selection of 32 bits is prohibited at +// configure-time (and explicit or automatic selection of 64 bits is fine +// and would have had the same result). +#if BLIS_BLAS_INT_SIZE == 64 + #undef BLIS_INT_TYPE_SIZE + #define BLIS_INT_TYPE_SIZE 64 +#endif + +// Define integer types depending on what size integer was requested. +#if BLIS_INT_TYPE_SIZE == 32 +typedef int32_t gint_t; +typedef uint32_t guint_t; +#elif BLIS_INT_TYPE_SIZE == 64 +typedef int64_t gint_t; +typedef uint64_t guint_t; +#else +typedef signed long int gint_t; +typedef unsigned long int guint_t; +#endif + +// -- Boolean type -- + +// NOTE: bool_t is no longer used and has been replaced with C99's bool type. +//typedef bool bool_t; + +// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we +// define these macros in terms of true and false, respectively, which are +// defined by C99 in stdbool.h. +#ifndef TRUE + #define TRUE true +#endif + +#ifndef FALSE + #define FALSE false +#endif + +// -- Special-purpose integers -- + +// This cpp guard provides a temporary hack to allow libflame +// interoperability with BLIS. +#ifndef _DEFINED_DIM_T +#define _DEFINED_DIM_T +typedef gint_t dim_t; // dimension type +#endif +typedef gint_t inc_t; // increment/stride type +typedef gint_t doff_t; // diagonal offset type +typedef guint_t siz_t; // byte size type +typedef uint32_t objbits_t; // object information bit field + +// -- Real types -- + +// Define the number of floating-point types supported, and the size of the +// largest type. +#define BLIS_NUM_FP_TYPES 4 +#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) + +// There are some places where we need to use sizeof() inside of a C +// preprocessor #if conditional, and so here we define the various sizes +// for those purposes. +#define BLIS_SIZEOF_S 4 // sizeof(float) +#define BLIS_SIZEOF_D 8 // sizeof(double) +#define BLIS_SIZEOF_C 8 // sizeof(scomplex) +#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) + +// -- Complex types -- + +#ifdef BLIS_ENABLE_C99_COMPLEX + + #if __STDC_VERSION__ >= 199901L +#include // skipped + + // Typedef official complex types to BLIS complex type names. + typedef float complex scomplex; + typedef double complex dcomplex; + #else + #error "Configuration requested C99 complex types, but C99 does not appear to be supported." + #endif + +#else // ifndef BLIS_ENABLE_C99_COMPLEX + + // This cpp guard provides a temporary hack to allow libflame + // interoperability with BLIS. + #ifndef _DEFINED_SCOMPLEX + #define _DEFINED_SCOMPLEX + typedef struct + { + float real; + float imag; + } scomplex; + #endif + + // This cpp guard provides a temporary hack to allow libflame + // interoperability with BLIS. + #ifndef _DEFINED_DCOMPLEX + #define _DEFINED_DCOMPLEX + typedef struct + { + double real; + double imag; + } dcomplex; + #endif + +#endif // BLIS_ENABLE_C99_COMPLEX + +// -- Atom type -- + +// Note: atom types are used to hold "bufferless" scalar object values. Note +// that it needs to be as large as the largest possible scalar value we might +// want to hold. Thus, for now, it is a dcomplex. +typedef dcomplex atom_t; + +// -- Fortran-77 types -- + +// Note: These types are typically only used by BLAS compatibility layer, but +// we must define them even when the compatibility layer isn't being built +// because they also occur in bli_slamch() and bli_dlamch(). + +// Define f77_int depending on what size of integer was requested. +#if BLIS_BLAS_INT_TYPE_SIZE == 32 +typedef int32_t f77_int; +#elif BLIS_BLAS_INT_TYPE_SIZE == 64 +typedef int64_t f77_int; +#else +typedef long int f77_int; +#endif + +typedef char f77_char; +typedef float f77_float; +typedef double f77_double; +typedef scomplex f77_scomplex; +typedef dcomplex f77_dcomplex; + +// -- Void function pointer types -- + +// Note: This type should be used in any situation where the address of a +// *function* will be conveyed or stored prior to it being typecast back +// to the correct function type. It does not need to be used when conveying +// or storing the address of *data* (such as an array of float or double). + +//typedef void (*void_fp)( void ); +typedef void* void_fp; + + +// +// -- BLIS info bit field offsets ---------------------------------------------- +// + + + +// info +#define BLIS_DATATYPE_SHIFT 0 +#define BLIS_DOMAIN_SHIFT 0 +#define BLIS_PRECISION_SHIFT 1 +#define BLIS_CONJTRANS_SHIFT 3 +#define BLIS_TRANS_SHIFT 3 +#define BLIS_CONJ_SHIFT 4 +#define BLIS_UPLO_SHIFT 5 +#define BLIS_UPPER_SHIFT 5 +#define BLIS_DIAG_SHIFT 6 +#define BLIS_LOWER_SHIFT 7 +#define BLIS_UNIT_DIAG_SHIFT 8 +#define BLIS_INVERT_DIAG_SHIFT 9 +#define BLIS_TARGET_DT_SHIFT 10 +#define BLIS_TARGET_DOMAIN_SHIFT 10 +#define BLIS_TARGET_PREC_SHIFT 11 +#define BLIS_EXEC_DT_SHIFT 13 +#define BLIS_EXEC_DOMAIN_SHIFT 13 +#define BLIS_EXEC_PREC_SHIFT 14 +#define BLIS_PACK_SCHEMA_SHIFT 16 +#define BLIS_PACK_RC_SHIFT 16 +#define BLIS_PACK_PANEL_SHIFT 17 +#define BLIS_PACK_FORMAT_SHIFT 18 +#define BLIS_PACK_SHIFT 22 +#define BLIS_PACK_REV_IF_UPPER_SHIFT 23 +#define BLIS_PACK_REV_IF_LOWER_SHIFT 24 +#define BLIS_PACK_BUFFER_SHIFT 25 +#define BLIS_STRUC_SHIFT 27 +#define BLIS_COMP_DT_SHIFT 29 +#define BLIS_COMP_DOMAIN_SHIFT 29 +#define BLIS_COMP_PREC_SHIFT 30 + +// info2 +#define BLIS_SCALAR_DT_SHIFT 0 +#define BLIS_SCALAR_DOMAIN_SHIFT 0 +#define BLIS_SCALAR_PREC_SHIFT 1 + +// +// -- BLIS info bit field masks ------------------------------------------------ +// + +// info +#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) +#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) +#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) +#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) +#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) +#define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) +#define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) +#define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) +#define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) +#define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) +#define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) +#define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) +#define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) +#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) +#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) +#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) +#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) +#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) +#define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) +#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) +#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) +#define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) +#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) +#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) +#define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) +#define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) +#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) +#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) + +// info2 +#define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) +#define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) +#define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) + + +// +// -- BLIS enumerated type value definitions ----------------------------------- +// + +#define BLIS_BITVAL_REAL 0x0 +#define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT +#define BLIS_BITVAL_SINGLE_PREC 0x0 +#define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT +#define BLIS_BITVAL_FLOAT_TYPE 0x0 +#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT +#define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT +#define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) +#define BLIS_BITVAL_INT_TYPE 0x04 +#define BLIS_BITVAL_CONST_TYPE 0x05 +#define BLIS_BITVAL_NO_TRANS 0x0 +#define BLIS_BITVAL_TRANS BLIS_TRANS_BIT +#define BLIS_BITVAL_NO_CONJ 0x0 +#define BLIS_BITVAL_CONJ BLIS_CONJ_BIT +#define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) +#define BLIS_BITVAL_ZEROS 0x0 +#define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) +#define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) +#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS +#define BLIS_BITVAL_NONUNIT_DIAG 0x0 +#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT +#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT +#define BLIS_BITVAL_NOT_PACKED 0x0 +#define BLIS_BITVAL_4MI ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_3MI ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_4MS ( 0x3 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_3MS ( 0x4 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RO ( 0x5 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_IO ( 0x6 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_RPI ( 0x7 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1E ( 0x8 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_1R ( 0x9 << BLIS_PACK_FORMAT_SHIFT ) +#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) +#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) +#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RO ( BLIS_PACK_BIT | BLIS_BITVAL_RO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_IO ( BLIS_PACK_BIT | BLIS_BITVAL_IO | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) +#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) +#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 +#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT +#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 +#define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT +#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 +#define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) +#define BLIS_BITVAL_GENERAL 0x0 +#define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) +#define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) + + +// +// -- BLIS enumerated type definitions ----------------------------------------- +// + +// -- Operational parameter types -- + +typedef enum +{ + BLIS_NO_TRANSPOSE = 0x0, + BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, + BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, + BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS +} trans_t; + +typedef enum +{ + BLIS_NO_CONJUGATE = 0x0, + BLIS_CONJUGATE = BLIS_BITVAL_CONJ +} conj_t; + +typedef enum +{ + BLIS_ZEROS = BLIS_BITVAL_ZEROS, + BLIS_LOWER = BLIS_BITVAL_LOWER, + BLIS_UPPER = BLIS_BITVAL_UPPER, + BLIS_DENSE = BLIS_BITVAL_DENSE +} uplo_t; + +typedef enum +{ + BLIS_LEFT = 0x0, + BLIS_RIGHT +} side_t; + +typedef enum +{ + BLIS_NONUNIT_DIAG = 0x0, + BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG +} diag_t; + +typedef enum +{ + BLIS_NO_INVERT_DIAG = 0x0, + BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG +} invdiag_t; + +typedef enum +{ + BLIS_GENERAL = BLIS_BITVAL_GENERAL, + BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, + BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, + BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR +} struc_t; + + +// -- Data type -- + +typedef enum +{ + BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, + BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, + BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, + BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, + BLIS_INT = BLIS_BITVAL_INT_TYPE, + BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, + BLIS_DT_LO = BLIS_FLOAT, + BLIS_DT_HI = BLIS_DCOMPLEX +} num_t; + +typedef enum +{ + BLIS_REAL = BLIS_BITVAL_REAL, + BLIS_COMPLEX = BLIS_BITVAL_COMPLEX +} dom_t; + +typedef enum +{ + BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, + BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC +} prec_t; + +#endif +// end bli_type_defs.h + +// begin bli_param_map.h +BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) +{ + if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; + else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; + else + { + *blis_side = BLIS_LEFT; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) +{ + if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; + else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; + else + { + *blis_uplo = BLIS_LOWER; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) +{ + if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; + else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; + else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; + else + { + *blis_trans = BLIS_NO_TRANSPOSE; + } +} + +BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) +{ + if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; + else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; + else + { + *blis_diag = BLIS_NONUNIT_DIAG; + } +} +// end bli_param_map.h + +// End extern "C" construct block. +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt new file mode 100644 index 00000000..8aa7141c --- /dev/null +++ b/src/ltl2inv/CMakeLists.txt @@ -0,0 +1,16 @@ +# include guard +cmake_minimum_required(VERSION 2.8.0 ) + +if(${CMAKE_PROJECT_NAME} STREQUAL "Project") + message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") +endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") + +if("${ARCHITECTURE}" STREQUAL "x86_64") + add_definitions(-DF77_COMPLEX_RET_INTEL) +endif() +include_directories(../common) + +# TODO: Move blalink_gemmt.c to other subprojects? +add_library(ltl2inv STATIC ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc) +target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL) + diff --git a/src/ltl2inv/blalink_gemmt.c b/src/ltl2inv/blalink_gemmt.c new file mode 100644 index 00000000..fbaee562 --- /dev/null +++ b/src/ltl2inv/blalink_gemmt.c @@ -0,0 +1,42 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "blis.h" +#include "blalink_fort.h" + +#if !( defined(BLAS_EXTERNAL) && defined(MKL) ) +// Redefine GEMMT Fortran from BLIS (which is built without BLAS API). +// Required by Pfapack77. +#define BLAGEN_MAC(cctype, ctype, cchar) \ + void cchar##gemmt_(const char *uplo_, \ + const char *transa_, \ + const char *transb_, \ + const int *m, const int *k, \ + const ctype *alpha, \ + const ctype *a, const int *lda, \ + const ctype *b, const int *ldb, \ + const ctype *beta, ctype *c, const int *ldc) \ + { \ + uplo_t uploc; \ + trans_t transa; \ + trans_t transb; \ + bli_param_map_netlib_to_blis_uplo(*uplo_, &uploc) ; \ + bli_param_map_netlib_to_blis_trans(*transa_, &transa) ; \ + bli_param_map_netlib_to_blis_trans(*transb_, &transb) ; \ + bli_##cchar##gemmt(uploc, \ + transa, transb, \ + *m, *k, \ + alpha, \ + a, 1, *lda, \ + b, 1, *ldb, \ + beta, \ + c, 1, *ldc); \ + } +BLAGEN_MAC( float, float, s ) +BLAGEN_MAC( double, double, d ) +BLAGEN_MAC( ccscmplx, scomplex, c ) +BLAGEN_MAC( ccdcmplx, dcomplex, z ) +#undef BLAGEN_MAC +#endif diff --git a/src/ltl2inv/blalink_gemmt.hh b/src/ltl2inv/blalink_gemmt.hh new file mode 100644 index 00000000..52bb2c45 --- /dev/null +++ b/src/ltl2inv/blalink_gemmt.hh @@ -0,0 +1,80 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "blalink.hh" + + +// gemmt is not part of BLAS standard. +// It's currently know as exposed by BLIS and MKL. +template +inline void gemmt(uplo_t uploc, + trans_t transa, trans_t transb, + dim_t m, dim_t k, + T alpha, + T *a, inc_t lda, + T *b, inc_t ldb, + T beta, + T *c, inc_t ldc); +#if !( defined(BLAS_EXTERNAL) && defined(MKL) ) +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemmt(uplo_t uploc, \ + trans_t transa, trans_t transb, \ + dim_t m, dim_t k, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb, \ + cctype beta, \ + cctype *c, inc_t ldc) \ + { \ + bli_##cchar##gemmt(uploc, \ + transa, transb, \ + m, k, \ + (ctype *)&alpha, \ + (ctype *)a, 1, lda, \ + (ctype *)b, 1, ldb, \ + (ctype *)&beta, \ + (ctype *)c, 1, ldc); \ + } +#else +#define BLALINK_MAC(cctype, ctype, cchar) \ + template <> inline void gemmt(uplo_t uploc, \ + trans_t transa, trans_t transb, \ + dim_t m, dim_t k, \ + cctype alpha, \ + cctype *a, inc_t lda, \ + cctype *b, inc_t ldb, \ + cctype beta, \ + cctype *c, inc_t ldc) \ + { \ + char ul = uplo2char(uploc); \ + char ta = trans2char(transa); \ + char tb = trans2char(transb); \ + cchar##gemmt_(&ul, \ + &ta, &tb, \ + &m, &k, \ + (ctype *)&alpha, \ + (ctype *)a, &lda, \ + (ctype *)b, &ldb, \ + (ctype *)&beta, \ + (ctype *)c, &ldc); \ + } +extern "C" { +void sgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, float *alpha, + float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc); +void dgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, double *alpha, + double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc); +void cgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha, + void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); +void zgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha, + void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc); +} +#endif +BLALINK_MAC( float, float, s ) +BLALINK_MAC( double, double, d ) +BLALINK_MAC( ccscmplx, scomplex, c ) +BLALINK_MAC( ccdcmplx, dcomplex, z ) +#undef BLALINK_MAC + diff --git a/src/ltl2inv/ilaenv.h b/src/ltl2inv/ilaenv.h new file mode 100644 index 00000000..add0a1ed --- /dev/null +++ b/src/ltl2inv/ilaenv.h @@ -0,0 +1,15 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#ifdef __cplusplus +extern "C" { +#endif + +int ilaenv_(const int *ispec, const char *name, const char *opts, const int *n1, const int *n2, const int *n3, const int *n4); + +#ifdef __cplusplus +} +#endif diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc new file mode 100644 index 00000000..f150384f --- /dev/null +++ b/src/ltl2inv/ilaenv_lauum.cc @@ -0,0 +1,23 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "blalink.hh" +#include "ilaenv.h" +#include "ilaenv_lauum.hh" + +#define EXPANDMAC(cctype, name) \ +template <> int ilaenv_lauum(uplo_t uplo, int n) \ +{ \ + char uplo_ = uplo2char(uplo); \ + int ispec = 1; \ + int dummy = 0; \ + return ilaenv_(&ispec, #name, &uplo_, &n, &dummy, &dummy, &dummy); \ +} +EXPANDMAC( float, SLAUUM ) +EXPANDMAC( double, DLAUUM ) +EXPANDMAC( ccscmplx, CLAUUM ) +EXPANDMAC( ccdcmplx, ZLAUUM ) +#undef EXPANDMAC + diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh new file mode 100644 index 00000000..68d89b57 --- /dev/null +++ b/src/ltl2inv/ilaenv_lauum.hh @@ -0,0 +1,11 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "blalink.hh" + +template +int ilaenv_lauum(uplo_t uplo, int n); + diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc new file mode 100644 index 00000000..0b11edf1 --- /dev/null +++ b/src/ltl2inv/invert.tcc @@ -0,0 +1,132 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "colmaj.hh" +#include "blalink.hh" +#include "trmmt.tcc" + +template +void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC) +{ + colmaj B(B_, ldB); + colmaj C(C_, ldC); + + for (int j = 0; j < n; ++j) + C(1, j) = B(0, j) / -vT[0]; + for (int i = 2; i < n; i += 2) + for (int j = 0; j < n; ++j) + C(i+1, j) = (B(i, j) - C(i-1, j) * vT[i-1]) / -vT[i]; + + for (int j = 0; j < n; ++j) + C(n-2, j) = B(n-1, j) / vT[n-2]; + for (int i = n-3; i >= 0; i -= 2) + for (int j = 0; j < n; ++j) + C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1]; +} + +template +void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +{ + colmaj A(A_, ldA); + colmaj M(M_, ldM); + + int npanel_trmm = ilaenv_lauum(BLIS_LOWER, n); + int full = npanel_trmm <= 1 || npanel_trmm >= n; + + // Set M. + for (int j = 0; j < n; ++j) + for (int i = 0; i < n; ++i) + M(i, j) = T(!(i - j)); + + trtri(BLIS_LOWER, BLIS_UNIT_DIAG, n-1, &A(1, 0), ldA); + lacpy(BLIS_LOWER, n-2, n-2, &A(2, 0), ldA, &M(2, 1), ldM); + + for (int i = 0; i < n-1; ++i) + vT[i] = A(i+1, i); + sktdsmx(n, vT, &M(0, 0), ldM, &A(0, 0), ldA); + + if (!full) { + trmmt(BLIS_LOWER, BLIS_UNIT_DIAG, n, T(1.0), M, A); + for (int j = 0; j < n; ++j) { + A(j, j) = 0.0; + for (int i = 0; i < j; ++i) + A(i, j) = -A(j, i); + } + } + + // In-place permute columns. + for (int j = n-1; j >= 0; --j) + if (iPiv[j]-1 != j) + swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1); + + if (full) + trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG, + n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA); + + // In-place permute rows. + for (int i = n-1; i >= 0; --i) + if (iPiv[i]-1 != i) + swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA); +} + +template +void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +{ + colmaj A(A_, ldA); + colmaj M(M_, ldM); + + int npanel_trmm = ilaenv_lauum(BLIS_UPPER, n); + int full = npanel_trmm <= 1 || npanel_trmm >= n; + + // Set M. + for (int j = 0; j < n; ++j) + for (int i = 0; i < n; ++i) + M(i, j) = T(!(i - j)); + + trtri(BLIS_UPPER, BLIS_UNIT_DIAG, n-1, &A(0, 1), ldA); + lacpy(BLIS_UPPER, n-2, n-2, &A(0, 2), ldA, &M(0, 1), ldM); + + for (int i = 0; i < n-1; ++i) + vT[i] = -A(i, i+1); + sktdsmx(n, vT, &M(0, 0), ldM, &A(0, 0), ldA); + + if (!full) { + trmmt(BLIS_UPPER, BLIS_UNIT_DIAG, n, T(1.0), M, A); + for (int j = 0; j < n; ++j) { + A(j, j) = 0.0; + for (int i = j + 1; i < n; ++i) + A(i, j) = -A(j, i); + } + } + + // In-place permute columns. + for (int j = 0; j < n; ++j) + if (iPiv[j]-1 != j) + swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1); + + if (full) + trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG, + n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA); + + // In-place permute rows. + for (int i = 0; i < n; ++i) + if (iPiv[i]-1 != i) + swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA); +} + +template +void ltl2inv(uplo_t uplo, int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM) +{ + switch (uplo) { + case BLIS_LOWER: + ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); break; + case BLIS_UPPER: + utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); break; + default: + break; + } +} + diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc new file mode 100644 index 00000000..19f10256 --- /dev/null +++ b/src/ltl2inv/ltl2inv.cc @@ -0,0 +1,33 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "pfaffian.tcc" +#include "invert.tcc" + + + +extern "C" { + +void ltl2inv_s(int n, float *A_, int ldA, int *iPiv, float *vT, float *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void ltl2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void ltl2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void ltl2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void utu2inv_s(int n, float *A_, int ldA, int *iPiv, float *vT, float *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void utu2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void utu2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } +void utu2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); } + + +void ltl2pfa_s(int n, float *A_, int ldA, int *iPiv, float *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } +void ltl2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } +void ltl2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } +void ltl2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = ltl2pfa(n, A_, ldA, iPiv); } +void utu2pfa_s(int n, float *A_, int ldA, int *iPiv, float *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } +void utu2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } +void utu2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } +void utu2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = utu2pfa(n, A_, ldA, iPiv); } + +} + diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv new file mode 100644 index 00000000..8c76afb1 --- /dev/null +++ b/src/ltl2inv/makefile_ltl2inv @@ -0,0 +1,29 @@ +include ../make.sys + +OBJ = ltl2inv.o blalink_gemmt.o ilaenv_lauum.o +SRC = ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc +HDR = invert.tcc pfaffian.tcc \ + ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh + +ltl2inv.a: $(OBJ) + $(AR) rvu $@ $(OBJ) + +clean: + rm -f $(OBJ) + +SUFFIXES: .o .c + +.c.o: $(HDR) + $(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \ + -Wno-incompatible-pointer-types-discards-qualifiers \ + -I../common \ + -I$(BLIS_ROOT)/include \ + -I$(BLIS_ROOT)/include/blis + +SUFFIXES: .o .cc + +.cc.o: $(HDR) + $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ + -I../common \ + -I$(BLIS_ROOT)/include \ + -I$(BLIS_ROOT)/include/blis diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc new file mode 100644 index 00000000..ed028524 --- /dev/null +++ b/src/ltl2inv/pfaffian.tcc @@ -0,0 +1,52 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "colmaj.hh" +#include "blalink.hh" + +template +T ltl2pfa(int n, T *A_, int ldA, int *iPiv) +{ + T pfa = 1.0; + colmaj A(A_, ldA); + + for (int i = 0; i < n; i += 2) { + pfa *= -A(i+1, i); + + if (iPiv[i ]-1 != i ) pfa = -pfa; + if (iPiv[i+1]-1 != i+1) pfa = -pfa; + } + return pfa; +} + +template +T utu2pfa(int n, T *A_, int ldA, int *iPiv) +{ + T pfa = 1.0; + colmaj A(A_, ldA); + + for (int i = 0; i < n; i += 2) { + pfa *= A(i, i+1); + + if (iPiv[i ]-1 != i ) pfa = -pfa; + if (iPiv[i+1]-1 != i+1) pfa = -pfa; + } + return pfa; +} + +template +T ltl2pfa(uplo_t uplo, int n, T *A_, int ldA, int *iPiv) +{ + switch (uplo) { + case BLIS_LOWER: + return ltl2pfa(n, A_, ldA, iPiv); + case BLIS_UPPER: + return utu2pfa(n, A_, ldA, iPiv); + default: + return 0.0; + } +} + diff --git a/src/ltl2inv/testinv.cc b/src/ltl2inv/testinv.cc new file mode 100644 index 00000000..c7cbc358 --- /dev/null +++ b/src/ltl2inv/testinv.cc @@ -0,0 +1,125 @@ +#include +#include +#include +#include + +#include "colmaj.hh" +#include "invert.tcc" + +struct info_t { + double err; + long long tim; +}; + +template +info_t test_decomp2inv(uplo_t uplo, int n) +{ + using namespace std::chrono; + + colmaj A(new T[n * n], n); + colmaj M(new T[n * n], n); + colmaj X(new T[n * n], n); + int *ipiv = new int[n]; + T *vT = new T[n-1]; + + assert(&A(0, 0)); + assert(&M(0, 0)); + assert(&X(0, 0)); + assert(ipiv); + assert(vT); + + for (int j = 0; j < n; ++j) { + A(j, j) = 0.0; + X(j, j) = 0.0; + + for (int i = j+1; i < n; ++i) { + A(i, j) = rand(); + A(i, j) /= RAND_MAX; + + A(j, i) = -A(i, j); + switch (uplo) { + case BLIS_LOWER: + X(i, j) = A(i, j); break; + case BLIS_UPPER: + X(j, i) = A(j, i); break; + default: + break; + } + } + } + + auto start = high_resolution_clock::now(); + + sktrf(uplo, n, &X(0, 0), n, ipiv, &M(0, 0), n*n); + switch (uplo) { + case BLIS_LOWER: + ltl2inv(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break; + case BLIS_UPPER: + utu2inv(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break; + default: + break; + } + + auto tim = duration_cast(high_resolution_clock::now() - start).count(); + + gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, + n, n, n, + 1.0, + &A(0, 0), n, + &X(0, 0), n, + 0.0, + &M(0, 0), n); + +#ifdef VERBOSE + printf("iPiv = [ "); + for (int i = 0; i < n; ++i) + printf("%d ", ipiv[i]); + printf("]\n"); + printf("vT = [ "); + for (int i = 0; i < n-1; ++i) + printf("%f ", vT[i]); + printf("]\n"); + + printf("A*inv(A) =\n"); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) + printf("%10.6f ", M(i, j)); + printf("\n"); + } +#endif + + double err = 0.0; + for (int j = 0; j < n; ++j) + for (int i = 0; i < n; ++i) + err += std::abs(M(i, j) - double(!(i - j))); + err /= n; // sqrt(n*n); + + delete[] &A(0, 0); + delete[] &M(0, 0); + delete[] &X(0, 0); + delete[] ipiv; + delete[] vT; + + return info_t{ err, tim }; +} + +int main(void) +{ + info_t info; int n; + n= 10; info = test_decomp2inv (BLIS_LOWER, n); printf("double n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); + n= 10; info = test_decomp2inv (BLIS_LOWER, n); printf("float n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); + n= 10; info = test_decomp2inv(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim); + n=200; info = test_decomp2inv (BLIS_LOWER, n); printf("double n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); + n=200; info = test_decomp2inv (BLIS_LOWER, n); printf("float n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); + n=200; info = test_decomp2inv(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim); + + n= 10; info = test_decomp2inv (BLIS_UPPER, n); printf("double n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); + n= 10; info = test_decomp2inv (BLIS_UPPER, n); printf("float n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim); + n= 10; info = test_decomp2inv(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim); + n=200; info = test_decomp2inv (BLIS_UPPER, n); printf("double n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); + n=200; info = test_decomp2inv (BLIS_UPPER, n); printf("float n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim); + n=200; info = test_decomp2inv(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim); + + return 0; +} + diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc new file mode 100644 index 00000000..79bc7170 --- /dev/null +++ b/src/ltl2inv/trmmt.tcc @@ -0,0 +1,35 @@ +/** + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "colmaj.hh" +#include "blalink.hh" +#include "ilaenv_lauum.hh" + + +template +inline void trmmt(uplo_t uploab, diag_t diaga, int n, T alpha, colmaj &A, colmaj &B) +{ + int npanel = ilaenv_lauum(uploab, n); + + if (uploab == BLIS_UPPER) { + // A upper => A' lower -> write to UPPER part of B. + for (int j = 0; j < n; j += npanel) { + int nloc = std::min(npanel, n - j); + + trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, diaga, + j+nloc, nloc, alpha, &A(0, 0), A.ld, &B(0, j), B.ld); + } + } else { + // A lower => A' upper -> write to LOWER part of B. + for (int j = 0; j < n; j += npanel) { + int nloc = std::min(npanel, n - j); + + trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, diaga, + n-j, nloc, alpha, &A(j, j), A.ld, &B(j, j), B.ld); + } + } +} + diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt index f80471ca..44ae92cf 100644 --- a/src/mVMC/CMakeLists.txt +++ b/src/mVMC/CMakeLists.txt @@ -27,9 +27,13 @@ target_link_libraries(vmcdry.out StdFace m) add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt}) target_link_libraries(vmc.out StdFace) target_link_libraries(vmc.out pfapack) +target_link_libraries(vmc.out ltl2inv) if(PFAFFIAN_BLOCKED) - target_link_libraries(vmc.out pfupdates blis pthread) + target_link_libraries(vmc.out pfupdates) endif(PFAFFIAN_BLOCKED) +if(Require_BLIS) + target_link_libraries(vmc.out blis pthread) +endif(Require_BLIS) target_link_libraries(vmc.out ${LAPACK_LIBRARIES} m) if(USE_SCALAPACK) diff --git a/src/mVMC/include/blas_externs.h b/src/mVMC/include/blas_externs.h index c1df0682..7679cac4 100644 --- a/src/mVMC/include/blas_externs.h +++ b/src/mVMC/include/blas_externs.h @@ -46,15 +46,11 @@ along with this program. If not, see http://www.gnu.org/licenses/. #define M_ZGETRI zgetri_ #define M_ZPOSV zposv_ -// Skew-symmetric LAPACK-level routines: -// Vendor switch: PFAPACK77 or Pfaffine -#ifdef _pfaffine -#define M_DSKPFA m_dskpfa_ -#define M_ZSKPFA m_zskpfa_ -#else +// Pfapack #define M_DSKPFA dskpfa_ #define M_ZSKPFA zskpfa_ -#endif +#define M_DSKTRF dsktrf_ +#define M_ZSKTRF zsktrf_ // pBLAS #define M_PDGEMV pdgemv_ @@ -116,11 +112,21 @@ extern int M_DSKPFA(const char *uplo, const char *mthd, const int *n, double *work, const int *lwork, int *info); extern int M_ZSKPFA(const char *uplo, const char *mthd, const int *n, double complex *a, const int *lda, double complex *pfaff, int *iwork, - double complex *work, const int *lwork, -#ifndef _pfaffine - double *rwork, -#endif - int *info); + double complex *work, const int *lwork, double *rwork, int *info); +extern int M_DSKTRF(const char *uplo, const char *mode, const int *n, + double *a, const int *lda, int *ipiv, + double *work, const int *lwork, int *info); +extern int M_ZSKTRF(const char *uplo, const char *mode, const int *n, + double complex *a, const int *lda, int *ipiv, + double complex *work, const int *lwork, int *info); +extern void ltl2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa); +extern void ltl2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM); +extern void ltl2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa); +extern void ltl2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM); +extern void utu2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa); +extern void utu2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM); +extern void utu2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa); +extern void utu2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM); // pBLAS extern void M_PDGEMV(const char *trans, const int *m, const int *n, diff --git a/src/mVMC/locgrn.c b/src/mVMC/locgrn.c index 0f8837c3..b1fdd7c5 100644 --- a/src/mVMC/locgrn.c +++ b/src/mVMC/locgrn.c @@ -392,12 +392,7 @@ double complex calculateNewPfMN_child(const int qpidx, const int n, const int *m /* calculate Pf M */ //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); //M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //TBC -#ifdef _pfaffine - info = 1; // Skip inverse. - M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; return sgn * pfaff * PfM[qpidx]; diff --git a/src/mVMC/locgrn_real.c b/src/mVMC/locgrn_real.c index 4991a54e..82cc4037 100644 --- a/src/mVMC/locgrn_real.c +++ b/src/mVMC/locgrn_real.c @@ -397,9 +397,6 @@ double calculateNewPfMN_real_child(const int qpidx, const int n, const int *msa, } /* calculate Pf M */ - // TODO: Maybe block updating is faster than recalculation of Pfaffian. - // Maybe I'll also give support for this in Pfaffine. - info = 1; // Skip inverse. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; diff --git a/src/mVMC/makefile_src b/src/mVMC/makefile_src index 52ac8076..bf9eae6f 100644 --- a/src/mVMC/makefile_src +++ b/src/mVMC/makefile_src @@ -1,8 +1,8 @@ include ../make.sys PFAPACK = ../pfapack/fortran/libpfapack.a -PFAFFINE = ../pfaffine/src/libpfaffine.a PFUPDATES = ../pfupdates/pfupdates.o +LTL2INV = ../ltl2inv/ltl2inv.a SFMT = ../sfmt/SFMT.o STDFACE = ../StdFace/src/libStdFace.a OPTION = -D_mpi_use @@ -19,7 +19,7 @@ OBJS = \ splitloop.o \ vmcmain.o \ $(PFAPACK) $(SFMT) $(STDFACE) \ - $(PFUPDATES) $(PFAFFINE) + $(PFUPDATES) $(LTL2INV) SOURCES = \ average.c \ @@ -107,7 +107,7 @@ HEADERS = \ all : $(MAKE) -C ../pfapack/fortran libpfapack.a - $(MAKE) -C ../pfaffine/src libpfaffine.a + $(MAKE) -C ../ltl2inv -f makefile_ltl2inv $(MAKE) -C ../pfupdates -f makefile_pfupdates $(MAKE) -C ../sfmt -f makefile_sfmt $(MAKE) -C ../StdFace/src -f makefile_StdFace libStdFace.a @@ -130,7 +130,7 @@ clean : rm -f *.o vmc.out vmcdry.out $(MAKE) -C ../sfmt -f makefile_sfmt clean $(MAKE) -C ../pfapack/fortran -f makefile clean - $(MAKE) -C ../pfaffine -f makefile clean + $(MAKE) -C ../ltl2inv -f makefile_ltl2inv clean $(MAKE) -C ../pfupdates -f makefile_pfupdates clean $(MAKE) -C ../StdFace/src -f makefile_StdFace clean $(MAKE) -C ../ComplexUHF -f makefile_uhf clean diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c index a86330ca..c91ba200 100644 --- a/src/mVMC/matrix.c +++ b/src/mVMC/matrix.c @@ -32,8 +32,6 @@ along with this program. If not, see http://www.gnu.org/licenses/. #include "./include/global.h" #include "workspace.c" -//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, -// double *bufM, int *iwork, double *work, int lwork); #define D_PfLimit 1.0e-100 int getLWork() { @@ -69,11 +67,7 @@ int getLWork_fcmp() { lwork=-1; M_ZGETRI(&n, &a, &lda, &iwork, &optSize1, &lwork, &info); lwork=-1; -#ifdef _pfaffine - M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork/*, &rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork, &rwork, &info); -#endif lwork = (creal(optSize1)>creal(optSize2)) ? (int)creal(optSize1) : (int)creal(optSize2); return lwork; @@ -135,7 +129,6 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,nsq,one,lda,info=0; //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful? double complex pfaff,minus_one; @@ -147,60 +140,40 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta const double complex *sltE_i; double complex *invM = InvM + qpidx*Nsize*Nsize; - double complex *invM_i; - - double complex *bufM_i, *bufM_i2; + double complex *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM(T) -> -InvM */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -254,7 +227,6 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,lda,one,nsq,info=0; //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful? double pfaff,minus_one; @@ -266,61 +238,41 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int const double *sltE_i; double *invM = InvM_real + qpidx*Nsize*Nsize; - double *invM_i; - - double *bufM_i, *bufM_i2; + double *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM(T) -> -InvM */ M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -384,7 +336,6 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE int msi,msj; int rsi,rsj; - char uplo='U', mthd='P'; int m,n,nsq,lda,one,info=0; double complex pfaff,minus_one; @@ -395,61 +346,42 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE const double complex *sltE_i; double complex *invM = InvM + qpidx*Nsize*Nsize; - double complex *invM_i; - - double complex *bufM_i, *bufM_i2; + double complex *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi -InvM according antisymmetric properties. */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -553,10 +485,6 @@ int calculateMAll_BF_fcmp_child( } } -#ifdef _pfaffine - info=0; /* Fused Pfaffian/inverse computation. */ - M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else /* Pfaffian/inverse computed separately. */ /* Copy bufM to invM before using bufM to compute Pfaffian. */ for(msi=0;msi InvM(T) -> -InvM */ M_ZSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -634,8 +554,6 @@ int CalculateMAll_real(const int *eleIdx, const int qpStart, const int qpEnd) { return info; } -//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, -// double *bufM, int *iwork, double *work, int lwork) { int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx, double *bufM, int *iwork, double *work, int lwork, double* PfM_real, double *InvM_real) { #pragma procedure serial @@ -654,60 +572,43 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE const double *sltE_i; double *invM = InvM_real + qpidx*Nsize*Nsize; - double *invM_i; - - double *bufM_i, *bufM_i2; + double *invM_i, *invM_i2; m=n=lda=Nsize; nsq=n*n; one=1; minus_one=-1.0; - /* store bufM */ - /* Note that bufM is column-major and skew-symmetric. */ - /* bufM[msj][msi] = -sltE[rsi][rsj] */ + /* store invM */ + /* Note that invM is column-major and skew-symmetric. */ + /* invM[msj][msi] = -sltE[rsi][rsj] */ #pragma loop noalias for(msi=0;msi InvM' = -InvM + // TODO: Include this in ltl2inv M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } @@ -793,14 +694,10 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int } } -#ifdef _pfaffine - info=0; /* Fused Pfaffian/inverse computation. */ -#else /* Pfaffian/inverse computed separately. */ /* Copy bufM to invM before using bufM to compute Pfaffian. */ for(msi=0;msi InvM' = -InvM M_DSCAL(&nsq, &minus_one, invM, &one); -#endif return info; } diff --git a/src/mVMC/pfupdate.c b/src/mVMC/pfupdate.c index 42b82b35..5dc67cec 100644 --- a/src/mVMC/pfupdate.c +++ b/src/mVMC/pfupdate.c @@ -352,12 +352,7 @@ double complex calculateNewPfMBFN4_child(const int qpidx, const int n, const int /* Calculate Pf M */ //TODO: CHECK rwork is real <-- [R-Xu] Not needed anymore? -#ifdef _pfaffine - info = 1; // Skipping inverse. - M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; @@ -593,12 +588,7 @@ double complex updateMAll_BF_fcmp_child( } /* Calculate Pf M */ -#ifdef _pfaffine - info = 1; // Skip Pfaffine inverse. - M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info); -#else M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); -#endif //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info); //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); diff --git a/src/mVMC/pfupdate_real.c b/src/mVMC/pfupdate_real.c index 2b25909d..690b7c86 100644 --- a/src/mVMC/pfupdate_real.c +++ b/src/mVMC/pfupdate_real.c @@ -361,8 +361,6 @@ double calculateNewPfMBFN4_real_child(const int qpidx, const int n, const int *m } /* calculate Pf M */ - //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); - info = 1; // Skip Pfaffine's inverse. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; @@ -547,7 +545,7 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa, // NOTE: This routines is not called in current version (i.e. lcoal coverage 0), // hence cannot be tested. I'm keeping GETRF and GETRI here. // TODO: If anyone's interested in utilizing this routine, she or he might want to - // utilize inverse feature of Pfaffine skpfa. + // utilize DSKTRF+LTL2INV m=nn=lda=n2; //M_ZGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */ M_DGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */ @@ -601,8 +599,6 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa, /* calculate Pf M */ //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info); - // As stated above, I'm now only setting Pfaffine to skip optinal inverse. - info = 1; // This is parameter for skipping inversion. M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info); sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0; diff --git a/src/pfaffine b/src/pfaffine deleted file mode 160000 index ebbb37ee..00000000 --- a/src/pfaffine +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ebbb37ee4ac9b08b0907e81c5b593df5c975514a diff --git a/src/pfapack b/src/pfapack index b655435f..ebaa11b7 160000 --- a/src/pfapack +++ b/src/pfapack @@ -1 +1 @@ -Subproject commit b655435f2351c1f18bad0bfb7107822f505332ec +Subproject commit ebaa11b722dbb5f1ddc62da607592d2e301c0c4a diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt index 5f313232..1a06ef74 100644 --- a/src/pfupdates/CMakeLists.txt +++ b/src/pfupdates/CMakeLists.txt @@ -5,14 +5,12 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project") message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.") endif(${CMAKE_PROJECT_NAME} STREQUAL "Project") -add_definitions(-DBLAS_EXTERNAL) -add_definitions(-DF77_COMPLEX_RET_INTEL) -include_directories(../pfaffine/src) +if(${ARCHITECTURE} STREQUAL "x86_64") + add_definitions(-DF77_COMPLEX_RET_INTEL) +endif() +include_directories(../common) +include_directories(../ltl2inv) -add_library(pfupdates STATIC - pf_interface.cc - ../pfaffine/src/skpfa.cc - ../pfaffine/src/sktdf.cc - ../pfaffine/src/sktdi.cc) +add_library(pfupdates STATIC pf_interface.cc) target_compile_definitions(pfupdates PRIVATE -D_CC_IMPL) diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates index ee946fc0..bd8502e3 100644 --- a/src/pfupdates/makefile_pfupdates +++ b/src/pfupdates/makefile_pfupdates @@ -1,16 +1,13 @@ include ../make.sys -ifeq ($(BLIS_ROOT),) - BLIS_ROOT = ../pfaffine/src/dep -endif - OBJ = pfupdates.o SRC = pf_interface.cc -HDR = pf_interface.h orbital_mat.tcc skmv.tcc updated_tdi.tcc +HDR = pf_interface.h orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \ + ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh $(OBJ): $(SRC) $(HDR) $(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \ - -I../pfaffine/src \ + -I../common -I../ltl2inv \ -I$(BLIS_ROOT)/include \ -I$(BLIS_ROOT)/include/blis diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc index cb8a6117..3e7ccf76 100644 --- a/src/pfupdates/orbital_mat.tcc +++ b/src/pfupdates/orbital_mat.tcc @@ -7,20 +7,10 @@ */ #pragma once #include "blis.h" -#include "colmaj.tcc" +#include "colmaj.hh" #include -#ifdef UseBoost -#include -#include -#include -template -using matrix_t = boost::numeric::ublas::matrix; -template -using vector_t = boost::numeric::ublas::vector; -#else template using matrix_t = colmaj; -#endif template struct orbital_mat @@ -31,16 +21,7 @@ struct orbital_mat orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX) : uplo(uplo_), nsite(nsite_), - #ifdef UseBoost - X(nsite, nsite) { - colmaj X_tmp(X_, ldX); - for (dim_t j = 0; j < nsite; ++j) - for (dim_t i = 0; i < nsite; ++i) - X(i, j) = X_tmp(i, j); - } - #else X(X_, ldX) { } - #endif orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t &X_) : uplo(uplo_), nsite(nsite_), X(X_) { } diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc index a19596a4..a95efe01 100644 --- a/src/pfupdates/pf_interface.cc +++ b/src/pfupdates/pf_interface.cc @@ -51,9 +51,9 @@ objv(iqp, ctype)->initialize(); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -68,9 +68,9 @@ GENIMPL( ccdcmplx, z ) delete objv(iqp, ctype); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -84,9 +84,9 @@ GENIMPL( ccdcmplx, z ) for (int iqp = 0; iqp < num_qp; ++iqp) \ pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -102,9 +102,9 @@ GENIMPL( ccdcmplx, z ) for (int iqp = 0; iqp < num_qp; ++iqp) \ objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -134,9 +134,9 @@ GENIMPL( ccdcmplx, z ) objv(iqp, ctype)->push_update(osk, msl, cal_pfa!=0); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL @@ -150,9 +150,9 @@ GENIMPL( ccdcmplx, z ) objv(iqp, ctype)->pop_update(cal_pfa!=0); \ } \ } -GENIMPL( float, s ) +// GENIMPL( float, s ) GENIMPL( double, d ) -GENIMPL( ccscmplx, c ) +// GENIMPL( ccscmplx, c ) GENIMPL( ccdcmplx, z ) #undef GENIMPL diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h index b9f4cbf2..8593a12c 100644 --- a/src/pfupdates/pf_interface.h +++ b/src/pfupdates/pf_interface.h @@ -40,9 +40,9 @@ extern "C" { void *objv[], \ void *orbv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -52,9 +52,9 @@ GENDEF( ccdcmplx, z ) void *objv[], \ void *orbv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -64,9 +64,9 @@ GENDEF( ccdcmplx, z ) ctype pfav[], \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -78,9 +78,9 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -94,9 +94,9 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF @@ -106,9 +106,9 @@ GENDEF( ccdcmplx, z ) int64_t cal_pfa, \ void *objv[] ); -GENDEF( float, s ) +// GENDEF( float, s ) GENDEF( double, d ) -GENDEF( ccscmplx, c ) +// GENDEF( ccscmplx, c ) GENDEF( ccdcmplx, z ) #undef GENDEF diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc index 5f484c53..2abc2c6d 100644 --- a/src/pfupdates/skmv.tcc +++ b/src/pfupdates/skmv.tcc @@ -6,7 +6,7 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ #pragma once -#include "colmaj.tcc" +#include "colmaj.hh" #include "blalink.hh" #include diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc new file mode 100644 index 00000000..0b8cdcb6 --- /dev/null +++ b/src/pfupdates/skr2k.tcc @@ -0,0 +1,35 @@ +/** + * \file skr2k.tcc + * Dispatcher and minimal implementation for SKR2k. + * Minimal implementation is used when M is extremely small. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "blalink.hh" +#include "blalink_gemmt.hh" +#include "colmaj.hh" +#include + + +template +inline void skr2k(uplo_t uploc, + trans_t transab, + dim_t m, dim_t k, + T alpha, + T *_A, inc_t ldA, + T *_B, inc_t ldB, + T beta, + T *_C, inc_t ldC) +{ + T one = 1.0; + if (transab == BLIS_NO_TRANSPOSE) { + gemmt(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC); + gemmt(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC); + } else { + gemmt(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC); + gemmt(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC); + } +} + diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc new file mode 100644 index 00000000..0fd5a4cf --- /dev/null +++ b/src/pfupdates/skslc.tcc @@ -0,0 +1,33 @@ +/** + * \file skslc.tcc + * Get a slice from antisymmetric matrix. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "colmaj.hh" + +template +inline signed skslc(uplo_t uploA, + unsigned n, + unsigned i, + T *x, + T *_A, unsigned ldA) +{ + colmaj A(_A, ldA); + x[i] = 0.0; + switch (uploA) { + case BLIS_UPPER: + for (unsigned j = 0; j < i; ++j) + x[j] = A(j, i); + for (unsigned j = i+1; j < n; ++j) + x[j] = -A(i, j); + return 0; + + default: + std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl; + return err_info(Pfaffine_NOT_IMPLEMNTED, 0); + } +} + diff --git a/src/pfupdates/testupdate.cc b/src/pfupdates/testupdate.cc new file mode 100644 index 00000000..8f1c237b --- /dev/null +++ b/src/pfupdates/testupdate.cc @@ -0,0 +1,70 @@ +/** + * \copyright Copyright (c) Dept. Phys., Univ. Tokyo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ +#include "updated_tdi.tcc" +#include +#include +#include + +struct info_t { + double err; + long long tim; +}; + +template +info_t test_tdi_update(dim_t nsite, std::vector cfg, std::vector from, std::vector to) +{ + dim_t nelec = cfg.size(); + dim_t mmax = from.size(); + T err = 0; + long long tim; + + T *orb__ = new T[nsite * nsite]; + colmaj orb_(orb__, nsite); + orbital_mat orb(BLIS_UPPER, nsite, orb_); + orb.randomize(0.6, 1234); + + T *mat_ = new T[nelec * nelec]; + updated_tdi tdi(orb, cfg, mat_, nelec, mmax); + + for (dim_t i = 0; i < mmax; ++i) { + if (from.at(i) < 0) + tdi.pop_update(true); + else + tdi.push_update_safe(to.at(i), from[i], true); + + std::vector cfg2(tdi.elem_cfg); + // Apply hopping. + for (dim_t j = 0; j < tdi.from_idx.size(); ++j) + cfg2.at(tdi.from_idx.at(j)) = tdi.to_site.at(j); + + T *mat2_ = new T[nelec * nelec]; + T pfa2 = updated_tdi(orb, cfg2, mat2_, nelec, 1).Pfa; + + double err_ = std::abs(pfa2 - tdi.Pfa * tdi.PfaRatio); + printf("Pfa = %e; Pfa diff = %e\n", tdi.Pfa, err_); + err += err_; + + delete[] mat2_; + } + + delete[] mat_; + delete[] orb__; + + return info_t{ err, tim }; +} + +int main(void) +{ + test_tdi_update(200, + std::vector{ 8, 63, 20, 53, 57, 14, 7, 22, 32, 67, 56, 66, 46, 16, 18, 26, 70, 28, 51, 64, 17, 68, 49, 4, 48, 41, 43, 21, 19, 45, 58, 11, 1, 24, 2, 69, 12, 23, 30, 10, 65, 3, 31, 55, 52, 37, 42, 34, 25, 60, 59, 50, 40, 9, 44, 54, 6, 36, 13, 29, 27, 38, 61, 33, 47, 5, 15, 39, 62, 35 }, + std::vector{ 0, 39, 49, 10, 60, 3 }, + std::vector{ 99, 101, 120, 199, 133, 177 }); + + return 0; +} + diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc index 63bc035f..40a338af 100644 --- a/src/pfupdates/updated_tdi.tcc +++ b/src/pfupdates/updated_tdi.tcc @@ -8,12 +8,12 @@ #pragma once #include "orbital_mat.tcc" #include "blalink.hh" -#include "skpfa.hh" -#include "sktdi.hh" +#include "blalink_gemmt.hh" +#include "pfaffian.tcc" +#include "invert.tcc" #include "skr2k.tcc" #include "skslc.tcc" #include "skmv.tcc" -#include "optpanel.hh" #include #include @@ -26,8 +26,6 @@ template struct updated_tdi { // Performance tuning constants. const bool single_hop_alpha; ///< Whether to simplify k=1 situations. - const dim_t npanel_big; - const dim_t npanel_sub; // Matrix M. matrix_t M; @@ -42,7 +40,7 @@ template struct updated_tdi { matrix_t W; matrix_t UMU, UMV, VMV; matrix_t Cp; ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ] - matrix_t Gc; ///< Gaussian vectors when tri-diagonalizing C. + matrix_t Gc; ///< Used to be Gaussian vectors. Now just scratchpads. signed *const cPov; ///< Pivot when tri-diagonalizing C. T Pfa; T PfaRatio; //< Updated Pfaffian / Base Pfaffian. @@ -55,11 +53,7 @@ template struct updated_tdi { void initialize() { using namespace std; auto &cfg = elem_cfg; - #ifdef UseBoost - matrix_t G(nelec, nelec); - #else matrix_t G(new T[nelec * nelec], nelec); - #endif from_idx.clear(); to_site.clear(); from_idx.reserve(mmax * sizeof(dim_t)); @@ -81,30 +75,23 @@ template struct updated_tdi { } // Allocate scratchpad. - signed *iPivFull = new signed[nelec + 1]; - dim_t lwork = nelec * npanel_big; - T *pfwork = new T[lwork]; + T *vT = new T[nelec-1]; + signed *iPiv = new signed[nelec]; - #ifdef UseBoost - signed info = skpfa(uplo, nelec, &M(0, 0), M.size1(), &G(0, 0), G.size1(), iPivFull, - true, &Pfa, pfwork, lwork); - #else - signed info = skpfa(uplo, nelec, &M(0, 0), M.ld, &G(0, 0), G.ld, iPivFull, - true, &Pfa, pfwork, lwork); - #endif + // Perform LTL factorization regardless of `uplo`. + signed info = sktrf(uplo, nelec, &M(0, 0), M.ld, iPiv, &G(0, 0), G.ld * nelec); + Pfa = ltl2pfa(uplo, nelec, &M(0, 0), M.ld, iPiv); + ltl2inv(uplo, nelec, &M(0, 0), M.ld, iPiv, vT, &G(0, 0), G.ld); #ifdef _DEBUG cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl; #endif - delete[] iPivFull; - delete[] pfwork; - #ifndef UseBoost + delete[] vT; + delete[] iPiv; delete[](&G(0, 0)); - #endif } ~updated_tdi() { - #ifndef UseBoost delete[](&U(0, 0)); delete[](&Q(0, 0)); @@ -115,7 +102,6 @@ template struct updated_tdi { delete[](&UMU(0, 0)); delete[](&UMV(0, 0)); delete[](&VMV(0, 0)); - #endif delete[] cPov; } @@ -123,30 +109,14 @@ template struct updated_tdi { updated_tdi(orbital_mat &Xij_, std::vector &cfg, T *M_, inc_t ldM, dim_t mmax_) : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - #ifdef UseBoost - M(nelec, nelec), - U(nelec, mmax * 2), Q(nelec, mmax * 2), - P(nelec, mmax), W(mmax, mmax), - UMU(mmax, mmax), UMV(mmax, mmax), - VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax), - Gc(2 * mmax, 2 * mmax), - #else - M(M_, ldM), + single_hop_alpha(true), M(M_, ldM), U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - #endif cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg), from_idx(0), to_site(0), uplo(BLIS_UPPER) { - #ifdef UseBoost - colmaj M_tmp(M_, ldM); - for (dim_t j = 0; j < nelec; ++j) - for (dim_t i = 0; i < nelec; ++i) - M(i, j) = M_tmp(i, j); - #endif initialize(); } @@ -154,8 +124,7 @@ template struct updated_tdi { updated_tdi(orbital_mat &Xij_, std::vector &cfg, matrix_t &M_, dim_t mmax_) : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - M(M_), + single_hop_alpha(true), M(M_), U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), @@ -169,31 +138,14 @@ template struct updated_tdi { // Unsafe construction without initializaion. updated_tdi(orbital_mat &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0), - single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4), - #ifdef UseBoost - M(nelec, nelec), - U(nelec, mmax * 2), Q(nelec, mmax * 2), - P(nelec, mmax), W(mmax, mmax), - UMU(mmax, mmax), UMV(mmax, mmax), - VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax), - Gc(2 * mmax, 2 * mmax), - #else - M(M_, ldM), + single_hop_alpha(true), M(M_, ldM), U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec), P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax), UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax), VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax), Gc(new T[2 * mmax * 2 * mmax], 2 * mmax), - #endif cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0), - from_idx(0), to_site(0), uplo(BLIS_UPPER) { - #ifdef UseBoost - colmaj M_tmp(M_, ldM); - for (dim_t j = 0; j < nelec; ++j) - for (dim_t i = 0; i < nelec; ++i) - M(i, j) = M_tmp(i, j); - #endif - } + from_idx(0), to_site(0), uplo(BLIS_UPPER) { } T get_Pfa() { return Pfa * PfaRatio; } @@ -241,22 +193,12 @@ template struct updated_tdi { for (; nq_updated < k_cal; ++nq_updated) { // Update single column. Use SKMV. - #ifdef UseBoost - skmv(uplo, n, T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), &Q(0, nq_updated)); - #else skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated)); - #endif } /* else { // Update multiple columns. Use SKMM. - #ifdef UseBoost - skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated, - T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), U.size1(), - T(0.0), &Q(0, nq_updated), Q.size1()); - #else skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld, T(0.0), &Q(0, nq_updated), Q.ld); - #endif } nq_updated = k_cal; */ @@ -279,6 +221,8 @@ template struct updated_tdi { // Always assume l < o to calculate one less column of Q. UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1); break; + default: + break; } return true; } @@ -372,9 +316,6 @@ template struct updated_tdi { if (compute_pfa) { // NOTE: Update k to be new size. k += 1; - // Allocate scratchpad. - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; // Calculate unupdated columns of U. require_Q(false); @@ -386,25 +327,17 @@ template struct updated_tdi { // If it's the first update Pfafian can be directly read out. if (k == 1) { PfaRatio = -UMV(0, 0) + T(1.0); - delete[] pfwork; return; } // Compute pfaffian. - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif + signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k); #ifdef _DEBUG cerr << "SKPFA: info=" << info << endl; #endif // Pfaffian of C = [ W -I; I 0 ]. PfaRatio *= pow(-1.0, k * (k + 1) / 2); - delete[] pfwork; } else // Set to 0.0 to denote dirty. PfaRatio = 0.0; @@ -475,7 +408,9 @@ template struct updated_tdi { nq_updated = k; // Compute new (previous, in fact) Pfaffian. - if (compute_pfa) { + if (!k) + PfaRatio = 1.0; + else if (compute_pfa) { // All compute_pfa requires first K-1 of Q. require_Q(false); @@ -483,19 +418,10 @@ template struct updated_tdi { require_UMU(); // Reassemble C and scratchpads. assemble_C_BMB(); - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; - - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif + + signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k); PfaRatio *= pow(-1.0, k * (k + 1) / 2); - delete[] pfwork; } else // Set to 0.0 to denote dirty. PfaRatio = 0.0; @@ -510,38 +436,31 @@ template struct updated_tdi { return; // Allocate scratchpad. - dim_t lwork = 2 * k * npanel_sub; - T *pfwork = new T[lwork]; - + T *vT = new T[2 * k - 1]; + // Update whole Q. require_Q(true); - // If M is dirty, redo the tridiagonal factorization. - if (PfaRatio == T(0.0)) { - require_UMU(); - assemble_C_BMB(); - #ifdef UseBoost - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - false, &PfaRatio, pfwork, lwork); - #else - signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - false, &PfaRatio, pfwork, lwork); - #endif - PfaRatio *= pow(-1.0, k * (k + 1) / 2); - } - if (k == 1) { + // M is dirty. + if (PfaRatio == T(0.0)) { + require_UMU(); + assemble_C_BMB(); + PfaRatio = -Cp(0, 1); + } // Trivial inverse. Cp(0, 1) = T(-1.0) / Cp(0, 1); Cp(1, 0) = T(-1.0) / Cp(1, 0); - } else - #ifdef UseBoost - signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov, - pfwork, lwork); - #else - signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov, - pfwork, lwork); - #endif + } else { + // Redo the tridiagonal factorization. + require_UMU(); + assemble_C_BMB(); + signed info = sktrf(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, &Gc(0, 0), Gc.ld * 2 * k); + // PfaRatio is dirty. + if (PfaRatio == T(0.0)) + PfaRatio = ltl2pfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov) * T(pow(-1.0, k * (k + 1) / 2)); + ltl2inv(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, vT, &Gc(0, 0), Gc.ld); + } inv_update(k, Cp); // Apply hopping. @@ -553,36 +472,24 @@ template struct updated_tdi { Pfa *= PfaRatio; PfaRatio = 1.0; - delete[] pfwork; + delete[] vT; } void inv_update(dim_t k, matrix_t &C) { - #ifdef UseBoost - matrix_t &ABC = U; ///< Use U as inv(A)*B*upper(C) buffer. - matrix_t &AB = Q; - #else matrix_t ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer. matrix_t AB(&Q(0, 0), Q.ld); - #endif if (k == 1 && single_hop_alpha) { // k == 1 requires no copying - #ifdef UseBoost - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.size1(), - &P(0, 0), P.size1(), T(1.0), &M(0, 0), M.size1()); - #else skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld, &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld); - #endif // See below for reason of calling this procedule. // update_uplo(uplo); return; } // Close empty space between Q and P. - #ifndef UseBoost if (k != mmax) - #endif for (dim_t j = 0; j < k; ++j) memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T)); @@ -594,22 +501,12 @@ template struct updated_tdi { // 0 0 0 + // 0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3] memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T)); - #ifdef UseBoost - trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), - &C(0, 1), C.size1(), &ABC(0, 1), ABC.size1()); - #else trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &C(0, 1), C.ld, &ABC(0, 1), ABC.ld); - #endif // Update: write to M. - #ifdef UseBoost - skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.size1(), - &AB(0, 1), AB.size1(), T(1.0), &M(0, 0), M.size1()); - #else skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld, &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld); - #endif #else gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, nelec, 2 * k, 2 * k, diff --git a/tool/makefile_tool b/tool/makefile_tool index 1d2a177b..e89c7a9a 100644 --- a/tool/makefile_tool +++ b/tool/makefile_tool @@ -4,10 +4,10 @@ include ../src/make.sys .SUFFIXES : .o .F90 .SUFFIXES : .o .c -all:greenr2k +all: greenr2k -greenr2k:greenr2k.o key2lower.o - $(F90) greenr2k.o key2lower.o $(LIBS) -o $@ +greenr2k: greenr2k.o + $(F90) greenr2k.o $(LIBS) -o $@ .F90.o: $(F90) -c $< $(FFLAGS)