diff --git a/.gitmodules b/.gitmodules
index 99e901f0..44e5b4ac 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,3 @@
-[submodule "src/pfaffine"]
-	path = src/pfaffine
-	url = https://github.com/xrq-phys/Pfaffine
-	branch = blis
 [submodule "src/blis"]
 	path = src/blis
 	url = https://github.com/xrq-phys/blis
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70851aa8..3257d79d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,9 @@
-cmake_minimum_required(VERSION 2.8.0 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
 project(mVMC NONE)
 
 option(USE_SCALAPACK "Use Scalapack" OFF)
 option(PFAFFIAN_BLOCKED "Use blocked-update Pfaffian to speed up." OFF)
+option(USE_GEMMT "Use GEMMT. Recommended regardless blocked-Pfaffian-update." ON)
 
 add_definitions(-D_mVMC)
 if(CONFIG)
@@ -17,11 +18,10 @@ message(STATUS "Build type: " ${CMAKE_BUILD_TYPE})
 option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 
 option(GIT_SUBMODULE_UPDATE "execute `git submodule update` automatically" ON)
-enable_language(C Fortran)
-if(PFAFFIAN_BLOCKED)
-  enable_language(CXX)
-  set(CMAKE_CXX_STANDARD 11)
-endif(PFAFFIAN_BLOCKED)
+
+# First, enables C language only.
+# External packages only use their C API.
+enable_language(C)
 
 set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 set(CMAKE_SKIP_BUILD_RPATH FALSE)
@@ -29,6 +29,7 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 set(CMAKE_MACOSX_RPATH 1)
 
+# TODO: Is this really needed?
 if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_DEBUG}")
 else()
@@ -37,16 +38,19 @@ else()
 endif()
 
 if(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  # TODO: Really needs separation?
   if("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "15.0.0.20140528")
      set(OMP_FLAG_Intel "-openmp")
   else()
     set(OMP_FLAG_Intel "-qopenmp")
   endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OMP_FLAG_Intel}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OMP_FLAG_Intel}")
 else()
   find_package(OpenMP)
   if(OPENMP_FOUND)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
   endif(OPENMP_FOUND)	
 endif()
 
@@ -57,6 +61,9 @@ if(MPI_C_FOUND)
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}")
 endif(MPI_C_FOUND)
 
+# BLAS/LAPACK/Pfapack77 needs Fortran interface.
+enable_language(Fortran)
+
 find_package(LAPACK)
 if(USE_SCALAPACK MATCHES OFF)
   if(LAPACK_FOUND)
@@ -64,6 +71,34 @@ if(USE_SCALAPACK MATCHES OFF)
   endif(LAPACK_FOUND)
 endif()
 
+if(PFAFFIAN_BLOCKED
+    OR USE_GEMMT)
+  set(Require_BLIS ON)
+endif()
+
+if(BLA_VENDOR MATCHES "Intel10*"
+    OR BLAS_LIBRARIES MATCHES "/*mkl*")
+  # Don't require BLIS when MKL is used.
+  add_definitions(-DMKL)
+  add_definitions(-DBLAS_EXTERNAL)
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+  set(Require_BLIS OFF)
+elseif(BLA_VENDOR MATCHES "FLA*"
+    OR BLAS_LIBRARIES MATCHES "/*blis*")
+  # Skip extra BLIS if it's already the BLAS vendor.
+  list(GET BLAS_LIBRARIES 0 BLIS_FIRST_LIB)
+  get_filename_component(BLIS_LIB_DIR ${BLIS_FIRST_LIB} DIRECTORY)
+  include_directories(${BLIS_LIB_DIR}/../include)
+  include_directories(${BLIS_LIB_DIR}/../include/blis)
+  set(Require_BLIS OFF)
+else()
+  # BLAS vendor preference:
+  # External > BLIS > Reference
+  if(DEFINED BLA_VENDOR)
+    add_definitions(-DBLAS_EXTERNAL)
+  endif()
+endif()
+
 # Build and enable tests
 # testing setup
 # enable_testing() must be called in the top-level CMakeLists.txt before any add_subdirectory() is called.
@@ -97,12 +132,23 @@ if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
 endif()
 add_subdirectory("${STDFACE_DIR}")
 
+# C++ support for pfupdates & ltl2inv.
+enable_language(CXX)
+set(CMAKE_CXX_STANDARD 11)
+
+if(Require_BLIS)
+  include("download_blis_artifact.cmake")
+else(Require_BLIS)
+  # Use bundled blis.h
+  include_directories(src/common/deps)
+endif(Require_BLIS)
+
 if(PFAFFIAN_BLOCKED)
   add_definitions(-D_pf_block_update)
-  include("download_blis_artifact.cmake")
-  # Must set BLIS artifact BEFORE adding pfupdates target.
   add_subdirectory(src/pfupdates)
-  add_dependencies(pfupdates blis_include)
+  if(Require_BLIS)
+    add_dependencies(pfupdates blis_include)
+  endif(Require_BLIS)
 endif(PFAFFIAN_BLOCKED)
 
 if (Document)
@@ -111,5 +157,9 @@ endif(Document)
 
 add_subdirectory(src/ComplexUHF)
 add_subdirectory(src/pfapack/fortran)
+add_subdirectory(src/ltl2inv)
+  if(Require_BLIS)
+    add_dependencies(ltl2inv blis_include)
+  endif(Require_BLIS)
 add_subdirectory(src/mVMC)
 add_subdirectory(tool)
diff --git a/config/aocc.cmake b/config/aocc.cmake
index a53a783a..258d9772 100644
--- a/config/aocc.cmake
+++ b/config/aocc.cmake
@@ -9,6 +9,9 @@ set(CMAKE_C_FLAGS_RELEASE "-Wno-unknown-pragmas -O3 -DNDEBUG -DHAVE_SSE2" CACHE
 set(CMAKE_Fortran_COMPILER "flang" CACHE STRING "" FORCE)
 set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -DHAVE_SSE2" CACHE STRING "" FORCE)
 
+# OpenMP, libatomic
+set(CMAKE_EXE_LINKER_FLAGS "-fopenmp -latomic") 
+
 # for AOCL
 set(BLA_VENDOR "FLAME" CACHE STRING "" FORCE)
 
diff --git a/config/apple.cmake b/config/apple.cmake
index 9b197129..16ff1130 100644
--- a/config/apple.cmake
+++ b/config/apple.cmake
@@ -6,6 +6,7 @@ set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE)
 set(CMAKE_C_FLAGS_DEBUG "-g -O0 -Wall  -Wformat -Werror=format-security")
 set(CMAKE_C_FLAGS_RELEASE "-O3 -Wno-unknown-pragmas -Wno-logical-not-parentheses")
 set(CMAKE_Fortran_COMPILER "gfortran" CACHE STRING "" FORCE)
+add_definitions(-DF77_COMPLEX_RET_INTEL)
 
 # OpenMP with libomp
 set(CMAKE_EXE_LINKER_FLAGS "-L/usr/local/lib -lomp") 
diff --git a/mVMCconfig.sh b/mVMCconfig.sh
index f731ccc7..0b7e1d62 100755
--- a/mVMCconfig.sh
+++ b/mVMCconfig.sh
@@ -40,7 +40,7 @@ F90 = frt
 FFLAGS = -DNDEBUG -DFUJITSU -Kfast
 CFLAGS = -DNDEBUG -Ofast -fopenmp
 CXXFLAGS = -DNDEBUG -Ofast -fopenmp
-CFLAGS += -D_lapack -D_pf_block_update # -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS += -DBLAS_EXTERNAL
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -74,7 +74,7 @@ F90 = ifort
 FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -100,7 +100,7 @@ F90 = ifort
 FFLAGS = -O3 -implicitnone
 CFLAGS = -O3 -qopenmp -Wno-unknown-pragmas
 CXXFLAGS = -O3 -std=gnu++14 -fpic -qopenmp
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -124,7 +124,7 @@ F90 = flang
 FFLAGS = -O3
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS +=
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -148,7 +148,7 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3 -fPIC
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS +=
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -172,7 +172,7 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS += -DMKL -DBLAS_EXTERNAL -DF77_COMPLEX_RET_INTEL
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -196,7 +196,7 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS +=
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -220,7 +220,7 @@ F90 = gfortran
 FFLAGS = -O3 -fimplicit-none
 CFLAGS = -O3 -fopenmp
 CXXFLAGS = -O3
-CFLAGS += -D_lapack -D_pf_block_update -D_pfaffine
+CFLAGS += -D_lapack -D_pf_block_update
 CXXFLAGS += # -DBLAS_EXTERNAL # -DF77_COMPLEX_RET_INTEL # For Apple.
 
 BLIS_ROOT = ${PWD}/src/blis-install
@@ -242,7 +242,6 @@ EOF
     cat src/make.sys
     cat src/make-ext.sys
     ln -s $PWD/src/make.sys src/StdFace/make.sys;
-    ln -s $PWD/src/make.sys src/pfaffine/make.inc;
     ln -s $PWD/src/make.sys src/pfapack/fortran/make.inc;
 
     cat > makefile <<EOF
diff --git a/src/common/blalink.hh b/src/common/blalink.hh
new file mode 100644
index 00000000..2750c34c
--- /dev/null
+++ b/src/common/blalink.hh
@@ -0,0 +1,518 @@
+/**
+ * \file blalink.hh
+ * C++ to C type wrapper for BLIS type-apis.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <complex>
+#include <cassert>
+typedef std::complex<float>  ccscmplx;
+typedef std::complex<double> ccdcmplx;
+// BLIS definitions.
+#include "blis.h"
+// BLAS definitions
+#include "blalink_fort.h"
+
+// error info
+typedef enum {
+    Pfaffine_SIGN_ERR = 1,
+    Pfaffine_OUT_OF_BOUND,
+    Pfaffine_BAD_SCRATCHPAD,
+    Pfaffine_BAD_REPRESENTATION,
+    Pfaffine_INTEGER_OVERFLOW,
+    Pfaffine_DOUBLE_NAN_DETECTED,
+    Pfaffine_NOT_IMPLEMNTED,
+    Pfaffine_NUM_ERROR_TYPE
+} skpfa_error_t;
+inline signed err_info(signed type, signed pos)
+{ return type * Pfaffine_NUM_ERROR_TYPE + pos; }
+
+// gemm
+template <typename T>
+inline void gemm(trans_t transa, trans_t transb,
+                 dim_t m, dim_t n, dim_t k,
+                 T alpha,
+                 T *a, inc_t lda,
+                 T *b, inc_t ldb,
+                 T beta,
+                 T *c, inc_t ldc);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemm<cctype>(trans_t transa, trans_t transb, \
+                                         dim_t m, dim_t n, dim_t k, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *b, inc_t ldb, \
+                                         cctype beta, \
+                                         cctype *c, inc_t ldc) \
+    { \
+        bli_##cchar##gemm(transa, transb, \
+                          m, n, k, \
+                          (ctype *)&alpha, \
+                          (ctype *)a, 1, lda, \
+                          (ctype *)b, 1, ldb, \
+                          (ctype *)&beta, \
+                          (ctype *)c, 1, ldc); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemm<cctype>(trans_t transa, trans_t transb, \
+                                         dim_t m, dim_t n, dim_t k, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *b, inc_t ldb, \
+                                         cctype beta, \
+                                         cctype *c, inc_t ldc) \
+    { \
+        char ta = trans2char(transa), \
+             tb = trans2char(transb); \
+        cchar##gemm_(&ta, &tb, &m, &n, &k, \
+                     (ctype *)&alpha, \
+                     (ctype *)a, &lda, \
+                     (ctype *)b, &ldb, \
+                     (ctype *)&beta, \
+                     (ctype *)c, &ldc); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+
+// ger
+template <typename T>
+inline void ger(dim_t m, dim_t n,
+                T alpha,
+                T *x, inc_t incx,
+                T *y, inc_t incy,
+                T *a, inc_t lda);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
+    template <> inline void ger<cctype>(dim_t m, dim_t n, \
+                                        cctype alpha, \
+                                        cctype *x, inc_t incx, \
+                                        cctype *y, inc_t incy, \
+                                        cctype *a, inc_t lda) \
+    { \
+        bli_##cchar##ger(BLIS_NO_CONJUGATE, \
+                         BLIS_NO_CONJUGATE, \
+                         m, n, \
+                         (ctype *)&alpha, \
+                         (ctype *)x, incx, \
+                         (ctype *)y, incy, \
+                         (ctype *)a, 1, lda); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
+    template <> inline void ger<cctype>(dim_t m, dim_t n, \
+                                        cctype alpha, \
+                                        cctype *x, inc_t incx, \
+                                        cctype *y, inc_t incy, \
+                                        cctype *a, inc_t lda) \
+    { \
+        cchar##cfunc##_(&m, &n, \
+                        (ctype *)&alpha, \
+                        (ctype *)x, &incx, \
+                        (ctype *)y, &incy, \
+                        (ctype *)a, &lda); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s, ger  )
+BLALINK_MAC( double,   double,   d, ger  )
+BLALINK_MAC( ccscmplx, scomplex, c, geru )
+BLALINK_MAC( ccdcmplx, dcomplex, z, geru )
+#undef BLALINK_MAC
+
+
+// gemv
+template <typename T>
+inline void gemv(trans_t trans,
+                 dim_t m, dim_t n,
+                 T alpha,
+                 T *a, inc_t lda,
+                 T *x, inc_t incx,
+                 T beta,
+                 T *y, inc_t incy);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemv<cctype>(trans_t trans, \
+                                         dim_t m, dim_t n, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *x, inc_t incx, \
+                                         cctype beta, \
+                                         cctype *y, inc_t incy) \
+    { \
+        bli_##cchar##gemv(trans, \
+                          BLIS_NO_CONJUGATE, \
+                          m, n, \
+                          (ctype *)&alpha, \
+                          (ctype *)a, 1, lda, \
+                          (ctype *)x, incx, \
+                          (ctype *)&beta, \
+                          (ctype *)y, incy); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemv<cctype>(trans_t trans, \
+                                         dim_t m, dim_t n, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *x, inc_t incx, \
+                                         cctype beta, \
+                                         cctype *y, inc_t incy) \
+    { \
+        char t = trans2char(trans); \
+        cchar##gemv_(&t, &m, &n, \
+                     (ctype *)&alpha, \
+                     (ctype *)a, &lda, \
+                     (ctype *)x, &incx, \
+                     (ctype *)&beta, \
+                     (ctype *)y, &incy); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+
+// trmm
+template <typename T>
+inline void trmm(side_t sidea, 
+                 uplo_t uploa, 
+                 trans_t transa, 
+                 diag_t diaga,
+                 dim_t m, dim_t n,
+                 T alpha, 
+                 T *a, inc_t lda, 
+                 T *b, inc_t ldb);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void trmm<cctype>(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \
+                                         dim_t m, dim_t n, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *b, inc_t ldb) \
+    { \
+        bli_##cchar##trmm(sidea, uploa, transa, diaga, \
+                          m, n, \
+                          (ctype *)&alpha, \
+                          (ctype *)a, 1, lda, \
+                          (ctype *)b, 1, ldb); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void trmm<cctype>(side_t sidea, uplo_t uploa, trans_t transa, diag_t diaga, \
+                                         dim_t m, dim_t n, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *b, inc_t ldb) \
+    { \
+        char ul = uplo2char(uploa), \
+             si = side2char(sidea), \
+             tr = trans2char(transa), \
+             dg = diag2char(diaga); \
+        cchar##trmm_(&si, &ul, &tr, &dg, \
+                     &m, &n, \
+                     (ctype *)&alpha, \
+                     (ctype *)a, &lda, \
+                     (ctype *)b, &ldb); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+
+// trmv
+template <typename T>
+inline void trmv(uplo_t uploa,
+                 trans_t transa,
+                 dim_t m,
+                 T alpha,
+                 T *a, inc_t lda,
+                 T *x, inc_t incx);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void trmv<cctype>(uplo_t uploa, trans_t transa, dim_t m, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *x, inc_t incx) \
+    { \
+        bli_##cchar##trmv(uploa, transa, \
+                          BLIS_NONUNIT_DIAG, m, \
+                          (ctype *)&alpha, \
+                          (ctype *)a, 1, lda, \
+                          (ctype *)x, incx); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void trmv<cctype>(uplo_t uploa, trans_t transa, dim_t m, \
+                                         cctype alpha, \
+                                         cctype *a, inc_t lda, \
+                                         cctype *x, inc_t incx) \
+    { \
+        char ul = uplo2char(uploa), \
+             tr = trans2char(transa), \
+             dg = 'N'; \
+        /*
+         * TRMV has alpha in its templated interface, which is absent in BLAS.
+         */ \
+        assert(alpha == cctype(1.0)); \
+        cchar##trmv_(&ul, &tr, &dg, &m, \
+                     (ctype *)a, &lda, \
+                     (ctype *)x, &incx); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+
+// swap
+template <typename T>
+inline void swap(dim_t n,
+                 T *x, inc_t incx,
+                 T *y, inc_t incy);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void swap<cctype>(dim_t n, \
+                                         cctype *x, inc_t incx, \
+                                         cctype *y, inc_t incy) \
+    { \
+        bli_##cchar##swapv(n, \
+                           (ctype *)x, incx, \
+                           (ctype *)y, incy); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void swap<cctype>(dim_t n, \
+                                         cctype *x, inc_t incx, \
+                                         cctype *y, inc_t incy) \
+    { \
+        cchar##swap_(&n, \
+                     (ctype *)x, &incx, \
+                     (ctype *)y, &incy); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+
+// axpy
+template <typename T>
+inline void axpy(dim_t n,
+                 T alpha,
+                 T *x, inc_t incx,
+                 T *y, inc_t incy);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void axpy<cctype>(dim_t n, \
+                                         cctype alpha, \
+                                         cctype *x, inc_t incx, \
+                                         cctype *y, inc_t incy) \
+    { \
+        bli_##cchar##axpyv(BLIS_NO_CONJUGATE, n, \
+                           (ctype *)&alpha, \
+                           (ctype *)x, incx, \
+                           (ctype *)y, incy); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void axpy<cctype>(dim_t n, \
+                                         cctype alpha, \
+                                         cctype *x, inc_t incx, \
+                                         cctype *y, inc_t incy) \
+    { \
+        cchar##axpy_(&n, \
+                     (ctype *)&alpha, \
+                     (ctype *)x, &incx, \
+                     (ctype *)y, &incy); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+// dot
+template <typename T>
+inline T dot(dim_t n,
+             T *sx, inc_t incx,
+             T *sy, inc_t incy);
+#ifndef BLAS_EXTERNAL
+#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
+    template <> inline cctype dot<cctype>(dim_t n, \
+                                          cctype *sx, inc_t incx, \
+                                          cctype *sy, inc_t incy) \
+    { \
+        cctype rho; \
+        bli_##cchar##dotv(BLIS_NO_CONJUGATE, \
+                          BLIS_NO_CONJUGATE, \
+                          n, \
+                          (ctype *)sx, incx, \
+                          (ctype *)sy, incy, \
+                          (ctype *)&rho); \
+        return rho; \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
+    template <> inline cctype dot<cctype>(dim_t n, \
+                                          cctype *sx, inc_t incx, \
+                                          cctype *sy, inc_t incy) \
+    { \
+        ctype rho = cchar##cfunc##_(&n, \
+                                    (ctype *)sx, &incx, \
+                                    (ctype *)sy, &incy); \
+        return *((cctype *)&rho); \
+    }
+#endif
+BLALINK_MAC( float,    float,    s, dot  )
+BLALINK_MAC( double,   double,   d, dot  )
+#if defined(BLAS_EXTERNAL) && defined(F77_COMPLEX_RET_INTEL)
+// Intel style complex return.
+#undef BLALINK_MAC
+#define BLALINK_MAC(cctype, ctype, cchar, cfunc) \
+    template <> inline cctype dot<cctype>(dim_t n, \
+                                          cctype *sx, inc_t incx, \
+                                          cctype *sy, inc_t incy) \
+    { \
+        cctype rho; \
+        cchar##cfunc##_((ctype *)&rho, &n, \
+                        (ctype *)sx, &incx, \
+                        (ctype *)sy, &incy); \
+        return rho; \
+    }
+#endif
+BLALINK_MAC( ccscmplx, scomplex, c, dotu )
+BLALINK_MAC( ccdcmplx, dcomplex, z, dotu )
+#undef BLALINK_MAC
+
+
+
+// [LAPACK] lacpy
+template <typename T>
+inline void lacpy(uplo_t uploa,
+                  dim_t m, dim_t n,
+                  T *a, inc_t lda,
+                  T *b, inc_t ldb);
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void lacpy<cctype>(uplo_t uploa, \
+                                          dim_t m, dim_t n, \
+                                          cctype *a, inc_t lda, \
+                                          cctype *b, inc_t ldb) \
+    { \
+        char ul = uplo2char(uploa); \
+        cchar##lacpy_(&ul, &m, &n, a, &lda, b, &ldb); \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+// [LAPACK] trtri
+template <typename T>
+inline int trtri(uplo_t uploa,
+                 diag_t diaga,
+                 dim_t n,
+                 T *a, inc_t lda);
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline int trtri(uplo_t uploa, \
+                                 diag_t diaga, \
+                                 dim_t n, \
+                                 cctype *a, inc_t lda) \
+    { \
+        char ul = uplo2char(uploa); \
+        char dg = diag2char(diaga); \
+        int info; \
+        cchar##trtri_(&ul, &dg, &n, a, &lda, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+// [PFAPACK] sktrf
+template <typename T>
+inline int sktrf(uplo_t uploa,
+                 dim_t n,
+                 T *a, inc_t lda,
+                 int *ipiv,
+                 T *work, dim_t lwork);
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline int sktrf(uplo_t uploa, \
+                                 dim_t n, \
+                                 cctype *a, inc_t lda, \
+                                 int *ipiv, \
+                                 cctype *work, dim_t lwork) \
+    { \
+        char ul = uplo2char(uploa); \
+        char mode = 'N'; \
+        int info; \
+        cchar##sktrf_(&ul, &mode, &n, a, &lda, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
+// [PFAPACK] skpfa
+template <typename T>
+inline int skpfa(uplo_t uploa,
+                 dim_t n,
+                 T *a, inc_t lda,
+                 T *pfa, int *ipiv,
+                 T *work, dim_t lwork);
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline int skpfa(uplo_t uploa, \
+                                 dim_t n, \
+                                 cctype *a, inc_t lda, \
+                                 cctype *pfa, int *ipiv, \
+                                 cctype *work, dim_t lwork) \
+    { \
+        char ul = uplo2char(uploa); \
+        char mthd = 'P'; \
+        int info; \
+        cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, &info); \
+        return info; \
+    }
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+#undef BLALINK_MAC
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline int skpfa(uplo_t uploa, \
+                                 dim_t n, \
+                                 cctype *a, inc_t lda, \
+                                 cctype *pfa, int *ipiv, \
+                                 cctype *work, dim_t lwork) \
+    { \
+        char ul = uplo2char(uploa); \
+        char mthd = 'P'; \
+        int info; \
+        cchar##skpfa_(&ul, &mthd, &n, a, &lda, pfa, ipiv, work, &lwork, 0, &info); \
+        return info; \
+    }
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
diff --git a/src/common/blalink_fort.h b/src/common/blalink_fort.h
new file mode 100644
index 00000000..eb0b3a39
--- /dev/null
+++ b/src/common/blalink_fort.h
@@ -0,0 +1,184 @@
+/**
+ * \file blalink_fort.hh
+ * Wrapper for external BLAS.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include <stdlib.h>
+#include "blis.h"
+
+
+// converter from BLIS enum to BLAS char.
+BLIS_INLINE char trans2char(trans_t t)
+{
+    switch (t) {
+    case BLIS_NO_TRANSPOSE:
+        return 'N';
+    case BLIS_TRANSPOSE:
+        return 'T';
+    case BLIS_CONJ_NO_TRANSPOSE:
+        return 'C';
+    default:
+        abort();
+    }
+}
+BLIS_INLINE char uplo2char(uplo_t t)
+{
+    switch (t) {
+    case BLIS_UPPER:
+        return 'U';
+    case BLIS_LOWER:
+        return 'L';
+    default:
+        abort();
+    }
+}
+BLIS_INLINE char side2char(side_t t)
+{
+    switch (t) {
+    case BLIS_LEFT:
+        return 'L';
+    case BLIS_RIGHT:
+        return 'R';
+    default:
+        abort();
+    }
+}
+BLIS_INLINE char diag2char(diag_t t)
+{
+    switch (t) {
+    case BLIS_UNIT_DIAG:
+        return 'U';
+    case BLIS_NONUNIT_DIAG:
+        return 'N';
+    }
+}
+
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+// BLAS {
+#ifdef BLAS_EXTERNAL
+
+// gemm
+void sgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, float *alpha,
+            float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc);
+void dgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, double *alpha,
+            double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc);
+void cgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha,
+            void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
+void zgemm_(char *transa, char *transb, dim_t *m, dim_t *n, dim_t *k, void *alpha,
+            void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
+
+
+// ger
+void sger_(dim_t *m, dim_t *n, float *alpha,
+           float *x, dim_t *incx, float *y, dim_t *incy, float *a, dim_t *lda);
+void dger_(dim_t *m, dim_t *n, double *alpha,
+           double *x, dim_t *incx, double *y, dim_t *incy, double *a, dim_t *lda);
+void cgeru_(dim_t *m, dim_t *n, void *alpha,
+            void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda);
+void zgeru_(dim_t *m, dim_t *n, void *alpha,
+            void *x, dim_t *incx, void *y, dim_t *incy, void *a, dim_t *lda);
+
+
+// gemv
+void sgemv_(char *trans, dim_t *m, dim_t *n, float *alpha, float *a, dim_t *lda,
+            float *x, dim_t *incx, float *beta, float *y, dim_t *incy);
+void dgemv_(char *trans, dim_t *m, dim_t *n, double *alpha, double *a, dim_t *lda,
+            double *x, dim_t *incx, double *beta, double *y, dim_t *incy);
+void cgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda,
+            void *x, dim_t *incx, void *beta, void *y, dim_t *incy);
+void zgemv_(char *trans, dim_t *m, dim_t *n, void *alpha, void *a, dim_t *lda,
+            void *x, dim_t *incx, void *beta, void *y, dim_t *incy);
+
+
+// trmm
+void strmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
+            float *alpha, float *a, inc_t *lda, float *b, inc_t *ldb);	
+void dtrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
+            double *alpha, double *a, inc_t *lda, double *b, inc_t *ldb);	
+void ctrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
+            void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb);	
+void ztrmm_(char *side, char *uplo, char *transa, char *diag, dim_t *m, dim_t *n,
+            void *alpha, void *a, inc_t *lda, void *b, inc_t *ldb);	
+
+
+// trmv
+void strmv_(char *uplo, char *transa, char *diag, dim_t *n,
+            float *a, inc_t *lda, float *b, inc_t *ldb);	
+void dtrmv_(char *uplo, char *transa, char *diag, dim_t *n,
+            double *a, inc_t *lda, double *b, inc_t *ldb);	
+void ctrmv_(char *uplo, char *transa, char *diag, dim_t *n,
+            void *a, inc_t *lda, void *b, inc_t *ldb);	
+void ztrmv_(char *uplo, char *transa, char *diag, dim_t *n,
+            void *a, inc_t *lda, void *b, inc_t *ldb);	
+
+// swap
+void sswap_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy);
+void dswap_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy);
+void cswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+void zswap_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+
+// axpy
+void saxpy_(dim_t *n, float *alpha, float *sx, dim_t *incx, float *sy, dim_t *incy);
+void daxpy_(dim_t *n, double *alpha, double *sx, dim_t *incx, double *sy, dim_t *incy);
+void caxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy);
+void zaxpy_(dim_t *n, void *alpha, void *sx, dim_t *incx, void *sy, dim_t *incy);
+
+// dot
+float sdot_(dim_t *n, float *sx, dim_t *incx, float *sy, dim_t *incy);
+double ddot_(dim_t *n, double *sx, dim_t *incx, double *sy, dim_t *incy);
+scomplex cdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+dcomplex zdotc_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+#ifndef F77_COMPLEX_RET_INTEL
+scomplex cdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+dcomplex zdotu_(dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+#else
+void cdotu_(scomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+void zdotu_(dcomplex *rho, dim_t *n, void *sx, dim_t *incx, void *sy, dim_t *incy);
+#endif
+
+// }
+#endif
+
+// LAPACK {
+
+// lacpy
+void slacpy_(char *uplo, dim_t *m, dim_t *n, float *a, inc_t *lda, float *b, inc_t *ldb);
+void dlacpy_(char *uplo, dim_t *m, dim_t *n, double *a, inc_t *lda, double *b, inc_t *ldb);
+void clacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb);
+void zlacpy_(char *uplo, dim_t *m, dim_t *n, void *a, inc_t *lda, void *b, inc_t *ldb);
+
+// trtri
+void strtri_(char *uplo, char *diag, dim_t *n, float *a, inc_t *lda, int *info);
+void dtrtri_(char *uplo, char *diag, dim_t *n, double *a, inc_t *lda, int *info);
+void ctrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info);
+void ztrtri_(char *uplo, char *diag, dim_t *n, void *a, inc_t *lda, int *info);
+
+// }
+
+// PFAPACK77 {
+
+void ssktrf_(char *uplo, char *mode, dim_t *n, float *a, dim_t *lda, int *ipiv, float *work, dim_t *lwork, int *info);
+void dsktrf_(char *uplo, char *mode, dim_t *n, double *a, dim_t *lda, int *ipiv, double *work, dim_t *lwork, int *info);
+void csktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info);
+void zsktrf_(char *uplo, char *mode, dim_t *n, void *a, dim_t *lda, int *ipiv, void *work, dim_t *lwork, int *info);
+
+void sskpfa_(char *uplo, char *mthd, dim_t *n, float *a, dim_t *lda, float *pfa, int *ipiv, float *work, dim_t *lwork, int *info);
+void dskpfa_(char *uplo, char *mthd, dim_t *n, double *a, dim_t *lda, double *pfa, int *ipiv, double *work, dim_t *lwork, int *info);
+void cskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info);
+void zskpfa_(char *uplo, char *mthd, dim_t *n, void *a, dim_t *lda, void *pfa, int *ipiv, void *work, dim_t *lwork, void *rwork, int *info);
+
+// }
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/src/common/colmaj.hh b/src/common/colmaj.hh
new file mode 100644
index 00000000..d9a03c1e
--- /dev/null
+++ b/src/common/colmaj.hh
@@ -0,0 +1,26 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+
+template <typename T>
+struct colmaj
+{
+  T *dat;
+  int ld;
+
+  colmaj() = delete;
+  colmaj(T *dat_, int ld_)
+  : dat(dat_), ld(ld_) { }
+
+  T &operator()(int i, int j)
+  {
+    return dat[ i + j * ld ];
+  }
+  T &operator()()
+  {
+    return dat[ 0 ];
+  }
+};
diff --git a/src/common/deps/blis.h b/src/common/deps/blis.h
new file mode 100644
index 00000000..0627afe2
--- /dev/null
+++ b/src/common/deps/blis.h
@@ -0,0 +1,552 @@
+/*
+ * blis.h
+   This header is cut out from BLIS v0.8.0,
+    containing necessary type definitions & macros.
+   User can use this file to avert BLIS dependency
+    if their BLAS library vendors a GEMMT implementation.
+   Here begins 3-clause BSD-style license of BLIS:
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_H
+#define BLIS_H
+
+
+// Allow C++ users to include this header file in their source code. However,
+// we make the extern "C" conditional on whether we're using a C++ compiler,
+// since regular C compilers don't understand the extern "C" construct.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS
+// YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES.
+
+#include <stdio.h> // skipped
+#include <stdlib.h> // skipped
+#include <math.h> // skipped
+#include <string.h> // skipped
+#include <stdarg.h> // skipped
+#include <float.h> // skipped
+#include <errno.h> // skipped
+#include <ctype.h> // skipped
+#include <assert.h> // skipped
+
+// -- STATIC INLINE FUNCTIONS --------------------------------------------------
+
+// C and C++ have different semantics for defining "inline" functions. In C,
+// the keyword phrase "static inline" accomplishes this, though the "inline"
+// is optional. In C++, the "inline" keyword is required and obviates "static"
+// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
+// be #included by a source file that is compiled with C++.
+#ifdef __cplusplus
+  #define BLIS_INLINE inline
+#else
+  //#define BLIS_INLINE static inline
+  #define BLIS_INLINE static
+#endif
+
+// -- Common BLIS definitions --
+
+// begin bli_type_defs.h
+
+
+#ifndef BLIS_TYPE_DEFS_H
+#define BLIS_TYPE_DEFS_H
+
+
+//
+// -- BLIS basic types ---------------------------------------------------------
+//
+
+#ifdef __cplusplus
+  // For C++, include stdint.h.
+#include <stdint.h> // skipped
+#elif __STDC_VERSION__ >= 199901L
+  // For C99 (or later), include stdint.h.
+#include <stdint.h> // skipped
+#include <stdbool.h> // skipped
+#else
+  // When stdint.h is not available, manually typedef the types we will use.
+  #ifdef _WIN32
+    typedef          __int32  int32_t;
+    typedef unsigned __int32 uint32_t;
+    typedef          __int64  int64_t;
+    typedef unsigned __int64 uint64_t;
+  #else
+    #error "Attempting to compile on pre-C99 system without stdint.h."
+  #endif
+#endif
+
+// -- General-purpose integers --
+
+// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits.
+// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
+// systems where the BLIS integer size would have been automatically selected
+// to be 32 bits, since explicit selection of 32 bits is prohibited at
+// configure-time (and explicit or automatic selection of 64 bits is fine
+// and would have had the same result).
+#if BLIS_BLAS_INT_SIZE == 64
+  #undef  BLIS_INT_TYPE_SIZE
+  #define BLIS_INT_TYPE_SIZE 64
+#endif
+
+// Define integer types depending on what size integer was requested.
+#if   BLIS_INT_TYPE_SIZE == 32
+typedef           int32_t  gint_t;
+typedef          uint32_t guint_t;
+#elif BLIS_INT_TYPE_SIZE == 64
+typedef           int64_t  gint_t;
+typedef          uint64_t guint_t;
+#else
+typedef   signed long int  gint_t;
+typedef unsigned long int guint_t;
+#endif
+
+// -- Boolean type --
+
+// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
+//typedef bool bool_t;
+
+// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
+// define these macros in terms of true and false, respectively, which are
+// defined by C99 in stdbool.h.
+#ifndef TRUE
+  #define TRUE  true
+#endif
+
+#ifndef FALSE
+  #define FALSE false
+#endif
+
+// -- Special-purpose integers --
+
+// This cpp guard provides a temporary hack to allow libflame
+// interoperability with BLIS.
+#ifndef _DEFINED_DIM_T
+#define _DEFINED_DIM_T
+typedef   gint_t dim_t;      // dimension type
+#endif
+typedef   gint_t inc_t;      // increment/stride type
+typedef   gint_t doff_t;     // diagonal offset type
+typedef  guint_t siz_t;      // byte size type
+typedef uint32_t objbits_t;  // object information bit field
+
+// -- Real types --
+
+// Define the number of floating-point types supported, and the size of the
+// largest type.
+#define BLIS_NUM_FP_TYPES   4
+#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)
+
+// There are some places where we need to use sizeof() inside of a C
+// preprocessor #if conditional, and so here we define the various sizes
+// for those purposes.
+#define BLIS_SIZEOF_S      4  // sizeof(float)
+#define BLIS_SIZEOF_D      8  // sizeof(double)
+#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
+#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)
+
+// -- Complex types --
+
+#ifdef BLIS_ENABLE_C99_COMPLEX
+
+	#if __STDC_VERSION__ >= 199901L
+#include <complex.h> // skipped
+
+		// Typedef official complex types to BLIS complex type names.
+		typedef  float complex scomplex;
+		typedef double complex dcomplex;
+	#else
+		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
+	#endif
+
+#else // ifndef BLIS_ENABLE_C99_COMPLEX
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_SCOMPLEX
+	#define _DEFINED_SCOMPLEX
+	typedef struct
+	{
+		float  real;
+		float  imag;
+	} scomplex;
+	#endif
+
+	// This cpp guard provides a temporary hack to allow libflame
+	// interoperability with BLIS.
+	#ifndef _DEFINED_DCOMPLEX
+	#define _DEFINED_DCOMPLEX
+	typedef struct
+	{
+		double real;
+		double imag;
+	} dcomplex;
+	#endif
+
+#endif // BLIS_ENABLE_C99_COMPLEX
+
+// -- Atom type --
+
+// Note: atom types are used to hold "bufferless" scalar object values. Note
+// that it needs to be as large as the largest possible scalar value we might
+// want to hold. Thus, for now, it is a dcomplex.
+typedef dcomplex atom_t;
+
+// -- Fortran-77 types --
+
+// Note: These types are typically only used by BLAS compatibility layer, but
+// we must define them even when the compatibility layer isn't being built
+// because they also occur in bli_slamch() and bli_dlamch().
+
+// Define f77_int depending on what size of integer was requested.
+#if   BLIS_BLAS_INT_TYPE_SIZE == 32
+typedef int32_t   f77_int;
+#elif BLIS_BLAS_INT_TYPE_SIZE == 64
+typedef int64_t   f77_int;
+#else
+typedef long int  f77_int;
+#endif
+
+typedef char      f77_char;
+typedef float     f77_float;
+typedef double    f77_double;
+typedef scomplex  f77_scomplex;
+typedef dcomplex  f77_dcomplex;
+
+// -- Void function pointer types --
+
+// Note: This type should be used in any situation where the address of a
+// *function* will be conveyed or stored prior to it being typecast back
+// to the correct function type. It does not need to be used when conveying
+// or storing the address of *data* (such as an array of float or double).
+
+//typedef void (*void_fp)( void );
+typedef void* void_fp;
+
+
+//
+// -- BLIS info bit field offsets ----------------------------------------------
+//
+
+
+
+// info
+#define BLIS_DATATYPE_SHIFT                0
+#define   BLIS_DOMAIN_SHIFT                0
+#define   BLIS_PRECISION_SHIFT             1
+#define BLIS_CONJTRANS_SHIFT               3
+#define   BLIS_TRANS_SHIFT                 3
+#define   BLIS_CONJ_SHIFT                  4
+#define BLIS_UPLO_SHIFT                    5
+#define   BLIS_UPPER_SHIFT                 5
+#define   BLIS_DIAG_SHIFT                  6
+#define   BLIS_LOWER_SHIFT                 7
+#define BLIS_UNIT_DIAG_SHIFT               8
+#define BLIS_INVERT_DIAG_SHIFT             9
+#define BLIS_TARGET_DT_SHIFT               10
+#define   BLIS_TARGET_DOMAIN_SHIFT         10
+#define   BLIS_TARGET_PREC_SHIFT           11
+#define BLIS_EXEC_DT_SHIFT                 13
+#define   BLIS_EXEC_DOMAIN_SHIFT           13
+#define   BLIS_EXEC_PREC_SHIFT             14
+#define BLIS_PACK_SCHEMA_SHIFT             16
+#define   BLIS_PACK_RC_SHIFT               16
+#define   BLIS_PACK_PANEL_SHIFT            17
+#define   BLIS_PACK_FORMAT_SHIFT           18
+#define   BLIS_PACK_SHIFT                  22
+#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
+#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
+#define BLIS_PACK_BUFFER_SHIFT             25
+#define BLIS_STRUC_SHIFT                   27
+#define BLIS_COMP_DT_SHIFT                 29
+#define   BLIS_COMP_DOMAIN_SHIFT           29
+#define   BLIS_COMP_PREC_SHIFT             30
+
+// info2
+#define BLIS_SCALAR_DT_SHIFT                0
+#define   BLIS_SCALAR_DOMAIN_SHIFT          0
+#define   BLIS_SCALAR_PREC_SHIFT            1
+
+//
+// -- BLIS info bit field masks ------------------------------------------------
+//
+
+// info
+#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
+#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
+#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
+#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
+#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
+#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
+#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
+#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
+#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
+#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
+#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
+#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
+#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
+#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
+#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
+#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
+#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
+#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
+#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
+#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
+#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
+#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
+#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
+#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
+#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
+#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
+#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
+#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )
+
+// info2
+#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
+#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
+#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )
+
+
+//
+// -- BLIS enumerated type value definitions -----------------------------------
+//
+
+#define BLIS_BITVAL_REAL                      0x0
+#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
+#define BLIS_BITVAL_SINGLE_PREC               0x0
+#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
+#define   BLIS_BITVAL_FLOAT_TYPE              0x0
+#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT  
+#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
+#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
+#define   BLIS_BITVAL_INT_TYPE                0x04
+#define   BLIS_BITVAL_CONST_TYPE              0x05
+#define BLIS_BITVAL_NO_TRANS                  0x0
+#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
+#define BLIS_BITVAL_NO_CONJ                   0x0
+#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
+#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
+#define BLIS_BITVAL_ZEROS                     0x0 
+#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
+#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS  
+#define BLIS_BITVAL_NONUNIT_DIAG              0x0
+#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
+#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
+#define BLIS_BITVAL_NOT_PACKED                0x0
+#define   BLIS_BITVAL_4MI                   ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_3MI                   ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_4MS                   ( 0x3  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_3MS                   ( 0x4  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_RO                    ( 0x5  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_IO                    ( 0x6  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_RPI                   ( 0x7  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_1E                    ( 0x8  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_1R                    ( 0x9  << BLIS_PACK_FORMAT_SHIFT )
+#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
+#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
+#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_4MI ( BLIS_PACK_BIT | BLIS_BITVAL_4MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_3MI ( BLIS_PACK_BIT | BLIS_BITVAL_3MI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_4MS ( BLIS_PACK_BIT | BLIS_BITVAL_4MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_3MS ( BLIS_PACK_BIT | BLIS_BITVAL_3MS | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_RO  ( BLIS_PACK_BIT | BLIS_BITVAL_RO  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_RO  ( BLIS_PACK_BIT | BLIS_BITVAL_RO  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_IO  ( BLIS_PACK_BIT | BLIS_BITVAL_IO  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_IO  ( BLIS_PACK_BIT | BLIS_BITVAL_IO  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_RPI ( BLIS_PACK_BIT | BLIS_BITVAL_RPI | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
+#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
+#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
+#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
+#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
+#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
+#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
+#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
+#define BLIS_BITVAL_GENERAL                   0x0
+#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
+#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )
+
+
+//
+// -- BLIS enumerated type definitions -----------------------------------------
+//
+
+// -- Operational parameter types --
+
+typedef enum
+{
+	BLIS_NO_TRANSPOSE      = 0x0,
+	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
+	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
+	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
+} trans_t;
+
+typedef enum
+{
+	BLIS_NO_CONJUGATE      = 0x0,
+	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
+} conj_t;
+
+typedef enum
+{
+	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
+	BLIS_LOWER             = BLIS_BITVAL_LOWER,
+	BLIS_UPPER             = BLIS_BITVAL_UPPER,
+	BLIS_DENSE             = BLIS_BITVAL_DENSE
+} uplo_t;
+
+typedef enum
+{
+	BLIS_LEFT              = 0x0,
+	BLIS_RIGHT
+} side_t;
+
+typedef enum
+{
+	BLIS_NONUNIT_DIAG      = 0x0,
+	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
+} diag_t;
+
+typedef enum
+{
+	BLIS_NO_INVERT_DIAG    = 0x0,
+	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
+} invdiag_t;
+
+typedef enum
+{
+	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
+	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
+	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
+	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
+} struc_t;
+
+
+// -- Data type --
+
+typedef enum
+{
+	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
+	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
+	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
+	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
+	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
+	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
+	BLIS_DT_LO             = BLIS_FLOAT,
+	BLIS_DT_HI             = BLIS_DCOMPLEX
+} num_t;
+
+typedef enum
+{
+	BLIS_REAL              = BLIS_BITVAL_REAL,
+	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
+} dom_t;
+
+typedef enum
+{
+	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
+	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
+} prec_t;
+
+#endif
+// end bli_type_defs.h
+
+// begin bli_param_map.h
+BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side )
+{
+	if      ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT;
+	else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT;
+	else
+	{
+		*blis_side = BLIS_LEFT;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo )
+{
+	if      ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER;
+	else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER;
+	else
+	{
+		*blis_uplo = BLIS_LOWER;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans )
+{
+	if      ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE;
+	else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE;
+	else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE;
+	else
+	{
+		*blis_trans = BLIS_NO_TRANSPOSE;
+	}
+}
+
+BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag )
+{
+	if      ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG;
+	else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG;
+	else
+	{
+		*blis_diag = BLIS_NONUNIT_DIAG;
+	}
+}
+// end bli_param_map.h
+
+// End extern "C" construct block.
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/ltl2inv/CMakeLists.txt b/src/ltl2inv/CMakeLists.txt
new file mode 100644
index 00000000..8aa7141c
--- /dev/null
+++ b/src/ltl2inv/CMakeLists.txt
@@ -0,0 +1,16 @@
+# include guard
+cmake_minimum_required(VERSION 2.8.0 )
+
+if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
+  message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
+endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
+
+if("${ARCHITECTURE}" STREQUAL "x86_64")
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+endif()
+include_directories(../common)
+
+# TODO: Move blalink_gemmt.c to other subprojects?
+add_library(ltl2inv STATIC ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc)
+target_compile_definitions(ltl2inv PRIVATE -D_CC_IMPL)
+
diff --git a/src/ltl2inv/blalink_gemmt.c b/src/ltl2inv/blalink_gemmt.c
new file mode 100644
index 00000000..fbaee562
--- /dev/null
+++ b/src/ltl2inv/blalink_gemmt.c
@@ -0,0 +1,42 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "blis.h"
+#include "blalink_fort.h"
+
+#if !( defined(BLAS_EXTERNAL) && defined(MKL) )
+// Redefine GEMMT Fortran from BLIS (which is built without BLAS API).
+// Required by Pfapack77.
+#define BLAGEN_MAC(cctype, ctype, cchar) \
+    void cchar##gemmt_(const char *uplo_, \
+                       const char *transa_, \
+                       const char *transb_, \
+                       const int *m, const int *k, \
+                       const ctype *alpha, \
+                       const ctype *a, const int *lda, \
+                       const ctype *b, const int *ldb, \
+                       const ctype *beta, ctype *c, const int *ldc) \
+    { \
+        uplo_t uploc; \
+        trans_t transa; \
+        trans_t transb; \
+        bli_param_map_netlib_to_blis_uplo(*uplo_, &uploc) ; \
+        bli_param_map_netlib_to_blis_trans(*transa_, &transa) ; \
+        bli_param_map_netlib_to_blis_trans(*transb_, &transb) ; \
+        bli_##cchar##gemmt(uploc, \
+                           transa, transb, \
+                           *m, *k, \
+                           alpha, \
+                           a, 1, *lda, \
+                           b, 1, *ldb, \
+                           beta, \
+                           c, 1, *ldc); \
+    }
+BLAGEN_MAC( float,    float,    s )
+BLAGEN_MAC( double,   double,   d )
+BLAGEN_MAC( ccscmplx, scomplex, c )
+BLAGEN_MAC( ccdcmplx, dcomplex, z )
+#undef BLAGEN_MAC
+#endif
diff --git a/src/ltl2inv/blalink_gemmt.hh b/src/ltl2inv/blalink_gemmt.hh
new file mode 100644
index 00000000..52bb2c45
--- /dev/null
+++ b/src/ltl2inv/blalink_gemmt.hh
@@ -0,0 +1,80 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "blalink.hh"
+
+
+// gemmt is not part of BLAS standard.
+// It's currently know as exposed by BLIS and MKL.
+template <typename T>
+inline void gemmt(uplo_t uploc,
+                  trans_t transa, trans_t transb,
+                  dim_t m, dim_t k,
+                  T alpha,
+                  T *a, inc_t lda,
+                  T *b, inc_t ldb,
+                  T beta,
+                  T *c, inc_t ldc);
+#if !( defined(BLAS_EXTERNAL) && defined(MKL) )
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemmt<cctype>(uplo_t uploc, \
+                                          trans_t transa, trans_t transb, \
+                                          dim_t m, dim_t k, \
+                                          cctype alpha, \
+                                          cctype *a, inc_t lda, \
+                                          cctype *b, inc_t ldb, \
+                                          cctype beta, \
+                                          cctype *c, inc_t ldc) \
+    { \
+        bli_##cchar##gemmt(uploc, \
+                           transa, transb, \
+                           m, k, \
+                           (ctype *)&alpha, \
+                           (ctype *)a, 1, lda, \
+                           (ctype *)b, 1, ldb, \
+                           (ctype *)&beta, \
+                           (ctype *)c, 1, ldc); \
+    }
+#else
+#define BLALINK_MAC(cctype, ctype, cchar) \
+    template <> inline void gemmt<cctype>(uplo_t uploc, \
+                                          trans_t transa, trans_t transb, \
+                                          dim_t m, dim_t k, \
+                                          cctype alpha, \
+                                          cctype *a, inc_t lda, \
+                                          cctype *b, inc_t ldb, \
+                                          cctype beta, \
+                                          cctype *c, inc_t ldc) \
+    { \
+        char ul = uplo2char(uploc); \
+        char ta = trans2char(transa); \
+        char tb = trans2char(transb); \
+        cchar##gemmt_(&ul, \
+                      &ta, &tb, \
+                      &m, &k, \
+                      (ctype *)&alpha, \
+                      (ctype *)a, &lda, \
+                      (ctype *)b, &ldb, \
+                      (ctype *)&beta, \
+                      (ctype *)c, &ldc); \
+    }
+extern "C" {
+void sgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, float *alpha,
+             float *a, dim_t *lda, float *b, dim_t *ldb, float *beta, float *c, dim_t *ldc);
+void dgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, double *alpha,
+             double *a, dim_t *lda, double *b, dim_t *ldb, double *beta, double *c, dim_t *ldc);
+void cgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha,
+             void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
+void zgemmt_(char *uplo, char *transa, char *transb, dim_t *m, dim_t *k, void *alpha,
+             void *a, dim_t *lda, void *b, dim_t *ldb, void *beta, void *c, dim_t *ldc);
+}
+#endif
+BLALINK_MAC( float,    float,    s )
+BLALINK_MAC( double,   double,   d )
+BLALINK_MAC( ccscmplx, scomplex, c )
+BLALINK_MAC( ccdcmplx, dcomplex, z )
+#undef BLALINK_MAC
+
diff --git a/src/ltl2inv/ilaenv.h b/src/ltl2inv/ilaenv.h
new file mode 100644
index 00000000..add0a1ed
--- /dev/null
+++ b/src/ltl2inv/ilaenv.h
@@ -0,0 +1,15 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ilaenv_(const int *ispec, const char *name, const char *opts, const int *n1, const int *n2, const int *n3, const int *n4);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/ltl2inv/ilaenv_lauum.cc b/src/ltl2inv/ilaenv_lauum.cc
new file mode 100644
index 00000000..f150384f
--- /dev/null
+++ b/src/ltl2inv/ilaenv_lauum.cc
@@ -0,0 +1,23 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "blalink.hh"
+#include "ilaenv.h"
+#include "ilaenv_lauum.hh"
+
+#define EXPANDMAC(cctype, name) \
+template <> int ilaenv_lauum<cctype>(uplo_t uplo, int n) \
+{ \
+    char uplo_ = uplo2char(uplo); \
+    int ispec = 1; \
+    int dummy = 0; \
+    return ilaenv_(&ispec, #name, &uplo_, &n, &dummy, &dummy, &dummy); \
+}
+EXPANDMAC( float,    SLAUUM )
+EXPANDMAC( double,   DLAUUM )
+EXPANDMAC( ccscmplx, CLAUUM )
+EXPANDMAC( ccdcmplx, ZLAUUM )
+#undef EXPANDMAC
+
diff --git a/src/ltl2inv/ilaenv_lauum.hh b/src/ltl2inv/ilaenv_lauum.hh
new file mode 100644
index 00000000..68d89b57
--- /dev/null
+++ b/src/ltl2inv/ilaenv_lauum.hh
@@ -0,0 +1,11 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "blalink.hh"
+
+template <typename T>
+int ilaenv_lauum(uplo_t uplo, int n);
+
diff --git a/src/ltl2inv/invert.tcc b/src/ltl2inv/invert.tcc
new file mode 100644
index 00000000..0b11edf1
--- /dev/null
+++ b/src/ltl2inv/invert.tcc
@@ -0,0 +1,132 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "colmaj.hh"
+#include "blalink.hh"
+#include "trmmt.tcc"
+
+template <typename T>
+void sktdsmx(int n, T *vT, T *B_, int ldB, T *C_, int ldC)
+{
+  colmaj<T> B(B_, ldB);
+  colmaj<T> C(C_, ldC);
+
+  for (int j = 0; j < n; ++j)
+    C(1, j) = B(0, j) / -vT[0];
+  for (int i = 2; i < n; i += 2)
+    for (int j = 0; j < n; ++j)
+      C(i+1, j) = (B(i, j) - C(i-1, j) * vT[i-1]) / -vT[i];
+
+  for (int j = 0; j < n; ++j)
+    C(n-2, j) = B(n-1, j) / vT[n-2];
+  for (int i = n-3; i >= 0; i -= 2)
+    for (int j = 0; j < n; ++j)
+      C(i-1, j) = (B(i, j) + C(i+1, j) * vT[i]) / vT[i-1];
+}
+
+template <typename T>
+void ltl2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+{
+  colmaj<T> A(A_, ldA);
+  colmaj<T> M(M_, ldM);
+
+  int npanel_trmm = ilaenv_lauum<T>(BLIS_LOWER, n);
+  int full = npanel_trmm <= 1 || npanel_trmm >= n;
+
+  // Set M.
+  for (int j = 0; j < n; ++j)
+    for (int i = 0; i < n; ++i)
+      M(i, j) = T(!(i - j));
+
+  trtri(BLIS_LOWER, BLIS_UNIT_DIAG, n-1, &A(1, 0), ldA);
+  lacpy(BLIS_LOWER, n-2, n-2, &A(2, 0), ldA, &M(2, 1), ldM);
+
+  for (int i = 0; i < n-1; ++i)
+    vT[i] = A(i+1, i);
+  sktdsmx<T>(n, vT, &M(0, 0), ldM, &A(0, 0), ldA);
+
+  if (!full) {
+    trmmt(BLIS_LOWER, BLIS_UNIT_DIAG, n, T(1.0), M, A);
+    for (int j = 0; j < n; ++j) {
+      A(j, j) = 0.0;
+      for (int i = 0; i < j; ++i)
+        A(i, j) = -A(j, i);
+    }
+  }
+
+  // In-place permute columns.
+  for (int j = n-1; j >= 0; --j)
+    if (iPiv[j]-1 != j)
+      swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1);
+
+  if (full)
+    trmm(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG,
+         n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA);
+
+  // In-place permute rows.
+  for (int i = n-1; i >= 0; --i)
+    if (iPiv[i]-1 != i)
+      swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA);
+}
+
+template <typename T>
+void utu2inv(int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+{
+  colmaj<T> A(A_, ldA);
+  colmaj<T> M(M_, ldM);
+
+  int npanel_trmm = ilaenv_lauum<T>(BLIS_UPPER, n);
+  int full = npanel_trmm <= 1 || npanel_trmm >= n;
+
+  // Set M.
+  for (int j = 0; j < n; ++j)
+    for (int i = 0; i < n; ++i)
+      M(i, j) = T(!(i - j));
+
+  trtri(BLIS_UPPER, BLIS_UNIT_DIAG, n-1, &A(0, 1), ldA);
+  lacpy(BLIS_UPPER, n-2, n-2, &A(0, 2), ldA, &M(0, 1), ldM);
+
+  for (int i = 0; i < n-1; ++i)
+    vT[i] = -A(i, i+1);
+  sktdsmx<T>(n, vT, &M(0, 0), ldM, &A(0, 0), ldA);
+
+  if (!full) {
+    trmmt(BLIS_UPPER, BLIS_UNIT_DIAG, n, T(1.0), M, A);
+      for (int j = 0; j < n; ++j) {
+        A(j, j) = 0.0;
+        for (int i = j + 1; i < n; ++i)
+          A(i, j) = -A(j, i);
+      }
+  }
+
+  // In-place permute columns.
+  for (int j = 0; j < n; ++j)
+    if (iPiv[j]-1 != j)
+      swap(n, &A(0, j), 1, &A(0, iPiv[j]-1), 1);
+
+  if (full)
+    trmm(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, BLIS_UNIT_DIAG,
+         n, n, T(1.0), &M(0, 0), ldM, &A(0, 0), ldA);
+
+  // In-place permute rows.
+  for (int i = 0; i < n; ++i)
+    if (iPiv[i]-1 != i)
+      swap(n, &A(i, 0), ldA, &A(iPiv[i]-1, 0), ldA);
+}
+
+template <typename T>
+void ltl2inv(uplo_t uplo, int n, T *A_, int ldA, int *iPiv, T *vT, T *M_, int ldM)
+{
+  switch (uplo) {
+  case BLIS_LOWER:
+    ltl2inv(n, A_, ldA, iPiv, vT, M_, ldM); break;
+  case BLIS_UPPER:
+    utu2inv(n, A_, ldA, iPiv, vT, M_, ldM); break;
+  default:
+    break;
+  }
+}
+
diff --git a/src/ltl2inv/ltl2inv.cc b/src/ltl2inv/ltl2inv.cc
new file mode 100644
index 00000000..19f10256
--- /dev/null
+++ b/src/ltl2inv/ltl2inv.cc
@@ -0,0 +1,33 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "pfaffian.tcc"
+#include "invert.tcc"
+
+
+
+extern "C" {
+
+void ltl2inv_s(int n, float    *A_, int ldA, int *iPiv, float    *vT, float    *M_, int ldM) { ltl2inv<float   >(n, A_, ldA, iPiv, vT, M_, ldM); }
+void ltl2inv_d(int n, double   *A_, int ldA, int *iPiv, double   *vT, double   *M_, int ldM) { ltl2inv<double  >(n, A_, ldA, iPiv, vT, M_, ldM); }
+void ltl2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { ltl2inv<ccscmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
+void ltl2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { ltl2inv<ccdcmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
+void utu2inv_s(int n, float    *A_, int ldA, int *iPiv, float    *vT, float    *M_, int ldM) { utu2inv<float   >(n, A_, ldA, iPiv, vT, M_, ldM); }
+void utu2inv_d(int n, double   *A_, int ldA, int *iPiv, double   *vT, double   *M_, int ldM) { utu2inv<double  >(n, A_, ldA, iPiv, vT, M_, ldM); }
+void utu2inv_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *vT, ccscmplx *M_, int ldM) { utu2inv<ccscmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
+void utu2inv_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *vT, ccdcmplx *M_, int ldM) { utu2inv<ccdcmplx>(n, A_, ldA, iPiv, vT, M_, ldM); }
+
+
+void ltl2pfa_s(int n, float    *A_, int ldA, int *iPiv, float    *Pfa) { *Pfa = ltl2pfa<float   >(n, A_, ldA, iPiv); }
+void ltl2pfa_d(int n, double   *A_, int ldA, int *iPiv, double   *Pfa) { *Pfa = ltl2pfa<double  >(n, A_, ldA, iPiv); }
+void ltl2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = ltl2pfa<ccscmplx>(n, A_, ldA, iPiv); }
+void ltl2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = ltl2pfa<ccdcmplx>(n, A_, ldA, iPiv); }
+void utu2pfa_s(int n, float    *A_, int ldA, int *iPiv, float    *Pfa) { *Pfa = utu2pfa<float   >(n, A_, ldA, iPiv); }
+void utu2pfa_d(int n, double   *A_, int ldA, int *iPiv, double   *Pfa) { *Pfa = utu2pfa<double  >(n, A_, ldA, iPiv); }
+void utu2pfa_c(int n, ccscmplx *A_, int ldA, int *iPiv, ccscmplx *Pfa) { *Pfa = utu2pfa<ccscmplx>(n, A_, ldA, iPiv); }
+void utu2pfa_z(int n, ccdcmplx *A_, int ldA, int *iPiv, ccdcmplx *Pfa) { *Pfa = utu2pfa<ccdcmplx>(n, A_, ldA, iPiv); }
+
+}
+
diff --git a/src/ltl2inv/makefile_ltl2inv b/src/ltl2inv/makefile_ltl2inv
new file mode 100644
index 00000000..8c76afb1
--- /dev/null
+++ b/src/ltl2inv/makefile_ltl2inv
@@ -0,0 +1,29 @@
+include ../make.sys
+
+OBJ = ltl2inv.o blalink_gemmt.o ilaenv_lauum.o
+SRC = ltl2inv.cc blalink_gemmt.c ilaenv_lauum.cc
+HDR = invert.tcc pfaffian.tcc \
+	  ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh
+
+ltl2inv.a: $(OBJ)
+	$(AR) rvu $@ $(OBJ)
+
+clean:
+	rm -f $(OBJ)
+
+SUFFIXES: .o .c
+
+.c.o: $(HDR)
+	$(CC) $(OPTION) $(CFLAGS) -c $< -o $@ \
+		-Wno-incompatible-pointer-types-discards-qualifiers \
+		-I../common \
+		-I$(BLIS_ROOT)/include \
+		-I$(BLIS_ROOT)/include/blis
+
+SUFFIXES: .o .cc
+
+.cc.o: $(HDR)
+	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
+		-I../common \
+		-I$(BLIS_ROOT)/include \
+		-I$(BLIS_ROOT)/include/blis
diff --git a/src/ltl2inv/pfaffian.tcc b/src/ltl2inv/pfaffian.tcc
new file mode 100644
index 00000000..ed028524
--- /dev/null
+++ b/src/ltl2inv/pfaffian.tcc
@@ -0,0 +1,52 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "colmaj.hh"
+#include "blalink.hh"
+
+template <typename T>
+T ltl2pfa(int n, T *A_, int ldA, int *iPiv)
+{
+  T pfa = 1.0;
+  colmaj<T> A(A_, ldA);
+
+  for (int i = 0; i < n; i += 2) {
+    pfa *= -A(i+1, i);
+
+    if (iPiv[i  ]-1 != i  ) pfa = -pfa;
+    if (iPiv[i+1]-1 != i+1) pfa = -pfa;
+  }
+  return pfa;
+}
+
+template <typename T>
+T utu2pfa(int n, T *A_, int ldA, int *iPiv)
+{
+  T pfa = 1.0;
+  colmaj<T> A(A_, ldA);
+
+  for (int i = 0; i < n; i += 2) {
+    pfa *= A(i, i+1);
+
+    if (iPiv[i  ]-1 != i  ) pfa = -pfa;
+    if (iPiv[i+1]-1 != i+1) pfa = -pfa;
+  }
+  return pfa;
+}
+
+template <typename T>
+T ltl2pfa(uplo_t uplo, int n, T *A_, int ldA, int *iPiv)
+{
+  switch (uplo) {
+  case BLIS_LOWER:
+    return ltl2pfa(n, A_, ldA, iPiv);
+  case BLIS_UPPER:
+    return utu2pfa(n, A_, ldA, iPiv);
+  default:
+    return 0.0;
+  }
+}
+
diff --git a/src/ltl2inv/testinv.cc b/src/ltl2inv/testinv.cc
new file mode 100644
index 00000000..c7cbc358
--- /dev/null
+++ b/src/ltl2inv/testinv.cc
@@ -0,0 +1,125 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <chrono>
+
+#include "colmaj.hh"
+#include "invert.tcc"
+
+struct info_t {
+    double err;
+    long long tim;
+};
+
+template <typename T>
+info_t test_decomp2inv(uplo_t uplo, int n)
+{
+    using namespace std::chrono;
+
+    colmaj<T> A(new T[n * n], n);
+    colmaj<T> M(new T[n * n], n);
+    colmaj<T> X(new T[n * n], n);
+    int *ipiv = new int[n];
+    T *vT = new T[n-1];
+
+    assert(&A(0, 0));
+    assert(&M(0, 0));
+    assert(&X(0, 0));
+    assert(ipiv);
+    assert(vT);
+
+    for (int j = 0; j < n; ++j) {
+        A(j, j) = 0.0;
+        X(j, j) = 0.0;
+
+        for (int i = j+1; i < n; ++i) {
+            A(i, j) = rand();
+            A(i, j) /= RAND_MAX;
+
+            A(j, i) = -A(i, j);
+            switch (uplo) {
+            case BLIS_LOWER:
+                X(i, j) = A(i, j); break;
+            case BLIS_UPPER:
+                X(j, i) = A(j, i); break;
+            default:
+                break;
+            }
+        }
+    }
+
+    auto start = high_resolution_clock::now();
+
+    sktrf<T>(uplo, n, &X(0, 0), n, ipiv, &M(0, 0), n*n);
+    switch (uplo) {
+    case BLIS_LOWER:
+        ltl2inv<T>(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break;
+    case BLIS_UPPER:
+        utu2inv<T>(n, &X(0, 0), n, ipiv, vT, &M(0, 0), n); break;
+    default:
+        break;
+    }
+
+    auto tim = duration_cast<nanoseconds>(high_resolution_clock::now() - start).count();
+
+    gemm<T>(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
+            n, n, n,
+            1.0,
+            &A(0, 0), n,
+            &X(0, 0), n,
+            0.0,
+            &M(0, 0), n);
+
+#ifdef VERBOSE
+    printf("iPiv = [ ");
+    for (int i = 0; i < n; ++i)
+        printf("%d ", ipiv[i]);
+    printf("]\n");
+    printf("vT = [ ");
+    for (int i = 0; i < n-1; ++i)
+        printf("%f ", vT[i]);
+    printf("]\n");
+
+    printf("A*inv(A) =\n");
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j)
+            printf("%10.6f ", M(i, j));
+        printf("\n");
+    }
+#endif
+
+    double err = 0.0;
+    for (int j = 0; j < n; ++j)
+        for (int i = 0; i < n; ++i)
+            err += std::abs(M(i, j) - double(!(i - j)));
+    err /= n; // sqrt(n*n);
+
+    delete[] &A(0, 0);
+    delete[] &M(0, 0);
+    delete[] &X(0, 0);
+    delete[] ipiv;
+    delete[] vT;
+
+    return info_t{ err, tim };
+}
+
+int main(void)
+{
+    info_t info; int n;
+    n= 10; info = test_decomp2inv<double>  (BLIS_LOWER, n); printf("double   n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
+    n= 10; info = test_decomp2inv<float>   (BLIS_LOWER, n); printf("float    n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
+    n= 10; info = test_decomp2inv<ccdcmplx>(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim);
+    n=200; info = test_decomp2inv<double>  (BLIS_LOWER, n); printf("double   n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
+    n=200; info = test_decomp2inv<float>   (BLIS_LOWER, n); printf("float    n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
+    n=200; info = test_decomp2inv<ccdcmplx>(BLIS_LOWER, n); printf("dcomplex n=%5d lower err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim);
+
+    n= 10; info = test_decomp2inv<double>  (BLIS_UPPER, n); printf("double   n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
+    n= 10; info = test_decomp2inv<float>   (BLIS_UPPER, n); printf("float    n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(10 ,3)/info.tim);
+    n= 10; info = test_decomp2inv<ccdcmplx>(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(10 ,3)/info.tim);
+    n=200; info = test_decomp2inv<double>  (BLIS_UPPER, n); printf("double   n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
+    n=200; info = test_decomp2inv<float>   (BLIS_UPPER, n); printf("float    n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 1.33*std::pow(200,3)/info.tim);
+    n=200; info = test_decomp2inv<ccdcmplx>(BLIS_UPPER, n); printf("dcomplex n=%5d upper err=%e at %6lfgflops.\n", n, info.err, 5.33*std::pow(200,3)/info.tim);
+
+    return 0;
+}
+
diff --git a/src/ltl2inv/trmmt.tcc b/src/ltl2inv/trmmt.tcc
new file mode 100644
index 00000000..79bc7170
--- /dev/null
+++ b/src/ltl2inv/trmmt.tcc
@@ -0,0 +1,35 @@
+/**
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "colmaj.hh"
+#include "blalink.hh"
+#include "ilaenv_lauum.hh"
+
+
+template <typename T>
+inline void trmmt(uplo_t uploab, diag_t diaga, int n, T alpha, colmaj<T> &A, colmaj<T> &B)
+{
+  int npanel = ilaenv_lauum<T>(uploab, n);
+
+  if (uploab == BLIS_UPPER) {
+    // A upper => A' lower -> write to UPPER part of B.
+    for (int j = 0; j < n; j += npanel) {
+      int nloc = std::min<int>(npanel, n - j);
+
+      trmm<T>(BLIS_LEFT, BLIS_UPPER, BLIS_TRANSPOSE, diaga,
+              j+nloc, nloc, alpha, &A(0, 0), A.ld, &B(0, j), B.ld);
+    }
+  } else {
+    // A lower => A' upper -> write to LOWER part of B.
+    for (int j = 0; j < n; j += npanel) {
+      int nloc = std::min<int>(npanel, n - j);
+
+      trmm<T>(BLIS_LEFT, BLIS_LOWER, BLIS_TRANSPOSE, diaga,
+              n-j, nloc, alpha, &A(j, j), A.ld, &B(j, j), B.ld);
+    }
+  }
+}
+
diff --git a/src/mVMC/CMakeLists.txt b/src/mVMC/CMakeLists.txt
index f80471ca..44ae92cf 100644
--- a/src/mVMC/CMakeLists.txt
+++ b/src/mVMC/CMakeLists.txt
@@ -27,9 +27,13 @@ target_link_libraries(vmcdry.out StdFace m)
 add_executable(vmc.out ${SOURCES_vmcmain} ${SOURCES_sfmt})
 target_link_libraries(vmc.out StdFace)
 target_link_libraries(vmc.out pfapack)
+target_link_libraries(vmc.out ltl2inv)
 if(PFAFFIAN_BLOCKED)
-  target_link_libraries(vmc.out pfupdates blis pthread)
+  target_link_libraries(vmc.out pfupdates)
 endif(PFAFFIAN_BLOCKED)
+if(Require_BLIS)
+  target_link_libraries(vmc.out blis pthread)
+endif(Require_BLIS)
 target_link_libraries(vmc.out ${LAPACK_LIBRARIES} m)
 
 if(USE_SCALAPACK)
diff --git a/src/mVMC/include/blas_externs.h b/src/mVMC/include/blas_externs.h
index c1df0682..7679cac4 100644
--- a/src/mVMC/include/blas_externs.h
+++ b/src/mVMC/include/blas_externs.h
@@ -46,15 +46,11 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #define M_ZGETRI zgetri_
 #define M_ZPOSV  zposv_
 
-// Skew-symmetric LAPACK-level routines:
-// Vendor switch: PFAPACK77 or Pfaffine
-#ifdef _pfaffine
-#define M_DSKPFA m_dskpfa_
-#define M_ZSKPFA m_zskpfa_
-#else
+// Pfapack
 #define M_DSKPFA dskpfa_
 #define M_ZSKPFA zskpfa_
-#endif
+#define M_DSKTRF dsktrf_
+#define M_ZSKTRF zsktrf_
 
 // pBLAS
 #define M_PDGEMV  pdgemv_
@@ -116,11 +112,21 @@ extern int M_DSKPFA(const char *uplo, const char *mthd, const int *n,
                     double *work, const int *lwork, int *info);
 extern int M_ZSKPFA(const char *uplo, const char *mthd, const int *n,
                     double complex *a, const int *lda, double complex *pfaff, int *iwork,
-                    double complex *work, const int *lwork,
-#ifndef _pfaffine
-                    double *rwork,
-#endif
-                    int *info);
+                    double complex *work, const int *lwork, double *rwork, int *info);
+extern int M_DSKTRF(const char *uplo, const char *mode, const int *n,
+                    double *a, const int *lda, int *ipiv,
+                    double *work, const int *lwork, int *info);
+extern int M_ZSKTRF(const char *uplo, const char *mode, const int *n,
+                    double complex *a, const int *lda, int *ipiv,
+                    double complex *work, const int *lwork, int *info);
+extern void ltl2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa);
+extern void ltl2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM);
+extern void ltl2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa);
+extern void ltl2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM);
+extern void utu2pfa_d(int n, double *A_, int ldA, int *iPiv, double *Pfa);
+extern void utu2inv_d(int n, double *A_, int ldA, int *iPiv, double *vT, double *M_, int ldM);
+extern void utu2pfa_z(int n, double complex *A_, int ldA, int *iPiv, double complex *Pfa);
+extern void utu2inv_z(int n, double complex *A_, int ldA, int *iPiv, double complex *vT, double complex *M_, int ldM);
 
 // pBLAS
 extern void M_PDGEMV(const char *trans, const int *m, const int *n,
diff --git a/src/mVMC/locgrn.c b/src/mVMC/locgrn.c
index 0f8837c3..b1fdd7c5 100644
--- a/src/mVMC/locgrn.c
+++ b/src/mVMC/locgrn.c
@@ -392,12 +392,7 @@ double complex calculateNewPfMN_child(const int qpidx, const int n, const int *m
   /* calculate Pf M */
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
   //M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info); //TBC
-#ifdef _pfaffine
-  info = 1; // Skip inverse.
-  M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &n, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
 
   return sgn * pfaff * PfM[qpidx];
diff --git a/src/mVMC/locgrn_real.c b/src/mVMC/locgrn_real.c
index 4991a54e..82cc4037 100644
--- a/src/mVMC/locgrn_real.c
+++ b/src/mVMC/locgrn_real.c
@@ -397,9 +397,6 @@ double calculateNewPfMN_real_child(const int qpidx, const int n, const int *msa,
   }
 
   /* calculate Pf M */
-  // TODO: Maybe block updating is faster than recalculation of Pfaffian.
-  //       Maybe I'll also give support for this in Pfaffine.
-  info = 1; // Skip inverse.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
diff --git a/src/mVMC/makefile_src b/src/mVMC/makefile_src
index 52ac8076..bf9eae6f 100644
--- a/src/mVMC/makefile_src
+++ b/src/mVMC/makefile_src
@@ -1,8 +1,8 @@
 include ../make.sys
 
 PFAPACK = ../pfapack/fortran/libpfapack.a
-PFAFFINE = ../pfaffine/src/libpfaffine.a
 PFUPDATES = ../pfupdates/pfupdates.o
+LTL2INV = ../ltl2inv/ltl2inv.a
 SFMT = ../sfmt/SFMT.o
 STDFACE = ../StdFace/src/libStdFace.a
 OPTION = -D_mpi_use
@@ -19,7 +19,7 @@ OBJS = \
 	splitloop.o \
 	vmcmain.o \
 	$(PFAPACK) $(SFMT) $(STDFACE) \
-	$(PFUPDATES) $(PFAFFINE)
+	$(PFUPDATES) $(LTL2INV)
 
 SOURCES = \
 average.c \
@@ -107,7 +107,7 @@ HEADERS = \
 
 all : 
 	$(MAKE) -C ../pfapack/fortran                libpfapack.a
-	$(MAKE) -C ../pfaffine/src                   libpfaffine.a
+	$(MAKE) -C ../ltl2inv     -f makefile_ltl2inv
 	$(MAKE) -C ../pfupdates   -f makefile_pfupdates
 	$(MAKE) -C ../sfmt        -f makefile_sfmt
 	$(MAKE) -C ../StdFace/src -f makefile_StdFace libStdFace.a
@@ -130,7 +130,7 @@ clean :
 	rm -f *.o vmc.out vmcdry.out
 	$(MAKE) -C ../sfmt            -f makefile_sfmt      clean
 	$(MAKE) -C ../pfapack/fortran -f makefile           clean
-	$(MAKE) -C ../pfaffine        -f makefile           clean
+	$(MAKE) -C ../ltl2inv         -f makefile_ltl2inv   clean
 	$(MAKE) -C ../pfupdates       -f makefile_pfupdates clean
 	$(MAKE) -C ../StdFace/src     -f makefile_StdFace   clean
 	$(MAKE) -C ../ComplexUHF      -f makefile_uhf       clean
diff --git a/src/mVMC/matrix.c b/src/mVMC/matrix.c
index a86330ca..c91ba200 100644
--- a/src/mVMC/matrix.c
+++ b/src/mVMC/matrix.c
@@ -32,8 +32,6 @@ along with this program. If not, see http://www.gnu.org/licenses/.
 #include "./include/global.h"
 #include "workspace.c"
 
-//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
-//                        double *bufM, int *iwork, double *work, int lwork);
 #define D_PfLimit 1.0e-100
 
 int getLWork() {
@@ -69,11 +67,7 @@ int getLWork_fcmp() {
   lwork=-1;
   M_ZGETRI(&n, &a, &lda, &iwork, &optSize1, &lwork, &info);
   lwork=-1;
-#ifdef _pfaffine
-  M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork/*, &rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &n, &a, &lda, &pfaff, &iwork, &optSize2, &lwork, &rwork, &info);
-#endif
 
   lwork = (creal(optSize1)>creal(optSize2)) ? (int)creal(optSize1) : (int)creal(optSize2);
   return lwork;
@@ -135,7 +129,6 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,nsq,one,lda,info=0;
   //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful?
   double complex pfaff,minus_one;
@@ -147,60 +140,40 @@ int calculateMAll_child_fsz(const int *eleIdx,const int *eleSpn, const int qpSta
   const double complex *sltE_i;
 
   double complex *invM = InvM + qpidx*Nsize*Nsize;
-  double complex *invM_i;
-
-  double complex *bufM_i, *bufM_i2;
+  double complex *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + eleSpn[msi]*Nsite;//fsz
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + eleSpn[msj]*Nsite;//fsz
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work/*, rwork*/, &lwork, &info);
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-
-  /* Calculate Pf M */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
-
+  M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  utu2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* For fused Pfaffian/inverse, inv(M) is already stored in bufM.
-   * Now to transpose (.* -1) it to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_ZGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  utu2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -254,7 +227,6 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,lda,one,nsq,info=0;
   //int nspn = 2*Ne+2*Nsite+2*Nsite+NProj; this is useful?
   double pfaff,minus_one;
@@ -266,61 +238,41 @@ int calculateMAll_child_fsz_real(const int *eleIdx,const int *eleSpn, const int
   const double *sltE_i;
 
   double *invM = InvM_real + qpidx*Nsize*Nsize;
-  double *invM_i;
-
-  double *bufM_i, *bufM_i2;
+  double *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + eleSpn[msi]*Nsite;//fsz
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + eleSpn[msj]*Nsite;//fsz
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-#endif
-
   /* Calculate Pf M */
-  M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
-
+  M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  utu2pfa_d(n, invM, lda, iwork, &pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  if(info!=0) return info;
-  M_DGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
-  if(info!=0) return info;
+  utu2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   /* InvM -> InvM(T) -> -InvM */
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -384,7 +336,6 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE
   int msi,msj;
   int rsi,rsj;
 
-  char uplo='U', mthd='P';
   int m,n,nsq,lda,one,info=0;
   double complex pfaff,minus_one;
 
@@ -395,61 +346,42 @@ int calculateMAll_child_fcmp(const int *eleIdx, const int qpStart, const int qpE
   const double complex *sltE_i;
 
   double complex *invM = InvM + qpidx*Nsize*Nsize;
-  double complex *invM_i;
-
-  double complex *bufM_i, *bufM_i2;
+  double complex *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + (msi/Ne)*Nsite;
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + (msj/Ne)*Nsite;
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work/*, rwork*/, &lwork, &info);
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-
   /* Calculate Pf M */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
-
+  M_ZSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  utu2pfa_z(n, invM, lda, iwork, &pfaff);
   if(!isfinite(creal(pfaff) + cimag(pfaff))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
-  M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_ZGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  utu2inv_z(n, invM, lda, iwork, work, bufM, lda);
 
   /* mVMC's handling InvM as row-major,
    * i.e. InvM needs a transpose, InvM -> -InvM according antisymmetric properties. */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -553,10 +485,6 @@ int calculateMAll_BF_fcmp_child(
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-  M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   /* Pfaffian/inverse computed separately. */
   /* Copy bufM to invM before using bufM to compute Pfaffian. */
   for(msi=0;msi<nsize*nsize;msi++)
@@ -564,18 +492,11 @@ int calculateMAll_BF_fcmp_child(
 
   /* Calculate Pf M */
   M_ZSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
 
   if(info!=0) return info;
   if(!(isfinite(creal(pfaff)) && isfinite(cimag(pfaff)))) return qpidx+1;
   PfM[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Calculate inverse. */
   M_ZGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
   if(info!=0) return info;
@@ -584,7 +505,6 @@ int calculateMAll_BF_fcmp_child(
 
   /* InvM -> InvM(T) -> -InvM */
   M_ZSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -634,8 +554,6 @@ int CalculateMAll_real(const int *eleIdx, const int qpStart, const int qpEnd) {
   return info;
 }
 
-//int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
-//                        double *bufM, int *iwork, double *work, int lwork) {
 int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpEnd, const int qpidx,
     double *bufM, int *iwork, double *work, int lwork, double* PfM_real, double *InvM_real) {
 #pragma procedure serial
@@ -654,60 +572,43 @@ int calculateMAll_child_real(const int *eleIdx, const int qpStart, const int qpE
   const double *sltE_i;
 
   double *invM = InvM_real + qpidx*Nsize*Nsize;
-  double *invM_i;
-
-  double *bufM_i, *bufM_i2;
+  double *invM_i, *invM_i2;
 
   m=n=lda=Nsize;
   nsq=n*n;
   one=1;
   minus_one=-1.0;
 
-  /* store bufM */
-  /* Note that bufM is column-major and skew-symmetric. */
-  /* bufM[msj][msi] = -sltE[rsi][rsj] */
+  /* store invM */
+  /* Note that invM is column-major and skew-symmetric. */
+  /* invM[msj][msi] = -sltE[rsi][rsj] */
 #pragma loop noalias
   for(msi=0;msi<nsize;msi++) {
     rsi = eleIdx[msi] + (msi/Ne)*Nsite;
-    bufM_i = bufM + msi*Nsize;
+    invM_i = invM + msi*Nsize;
     sltE_i = sltE + rsi*Nsite2;
 #pragma loop norecurrence
     for(msj=0;msj<nsize;msj++) {
       rsj = eleIdx[msj] + (msj/Ne)*Nsite;
-      bufM_i[msj] = -sltE_i[rsj];
+      invM_i[msj] = -sltE_i[rsj];
 
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
-  /* Pfaffian/inverse computed separately. */
-  /* Copy bufM to invM before using bufM to compute Pfaffian. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = bufM[msi];
-#endif
-
   /* calculate Pf M */
-  M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
-
+  M_DSKTRF("U", "N", &n, invM, &lda, iwork, bufM, &nsq, &info);
   if(info!=0) return info;
+
+  utu2pfa_d(n, invM, lda, iwork, &pfaff);
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Compute inverse */
-  M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
-  M_DGETRI(&n, invM, &lda, iwork, work, &lwork, &info);
+  utu2inv_d(n, invM, lda, iwork, work, bufM, lda);
 
   // InvM -> InvM' = -InvM
+  // TODO: Include this in ltl2inv
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
@@ -793,14 +694,10 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
     }
   }
 
-#ifdef _pfaffine
-  info=0; /* Fused Pfaffian/inverse computation. */
-#else
   /* Pfaffian/inverse computed separately. */
   /* Copy bufM to invM before using bufM to compute Pfaffian. */
   for(msi=0;msi<nsize*nsize;msi++)
     invM[msi] = bufM[msi];
-#endif
 
   /* Calculate Pf M */
   M_DSKPFA(&uplo, &mthd, &n, bufM, &lda, &pfaff, iwork, work, &lwork, &info);
@@ -809,12 +706,6 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
   if(!isfinite(pfaff)) return qpidx+1;
   PfM_real[qpidx] = pfaff;
 
-#ifdef _pfaffine
-  /* inv(M) already stored in bufM.
-   * Transpose (.* -1) to invM. */
-  for(msi=0;msi<nsize*nsize;msi++)
-    invM[msi] = -bufM[msi];
-#else
   /* Compute inverse. */
   M_DGETRF(&m, &n, invM, &lda, iwork, &info); /* ipiv = iwork */
   if(info!=0) return info;
@@ -823,7 +714,6 @@ int calculateMAll_BF_real_child(const int *eleIdx, const int qpStart, const int
 
   // InvM -> InvM' = -InvM
   M_DSCAL(&nsq, &minus_one, invM, &one);
-#endif
 
   return info;
 }
diff --git a/src/mVMC/pfupdate.c b/src/mVMC/pfupdate.c
index 42b82b35..5dc67cec 100644
--- a/src/mVMC/pfupdate.c
+++ b/src/mVMC/pfupdate.c
@@ -352,12 +352,7 @@ double complex calculateNewPfMBFN4_child(const int qpidx, const int n, const int
 
   /* Calculate Pf M */
   //TODO: CHECK rwork is real <-- [R-Xu] Not needed anymore?
-#ifdef _pfaffine
-  info = 1; // Skipping inverse.
-  M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
@@ -593,12 +588,7 @@ double complex updateMAll_BF_fcmp_child(
   }
 
   /* Calculate Pf M */
-#ifdef _pfaffine
-  info = 1; // Skip Pfaffine inverse.
-  M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork/*, rwork*/, &info);
-#else
   M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-#endif
   //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info);
   //M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
diff --git a/src/mVMC/pfupdate_real.c b/src/mVMC/pfupdate_real.c
index 2b25909d..690b7c86 100644
--- a/src/mVMC/pfupdate_real.c
+++ b/src/mVMC/pfupdate_real.c
@@ -361,8 +361,6 @@ double calculateNewPfMBFN4_real_child(const int qpidx, const int n, const int *m
   }
 
   /* calculate Pf M */
-  //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
-  info = 1; // Skip Pfaffine's inverse.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
@@ -547,7 +545,7 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa,
   // NOTE: This routines is not called in current version (i.e. lcoal coverage 0),
   //       hence cannot be tested. I'm keeping GETRF and GETRI here.
   // TODO: If anyone's interested in utilizing this routine, she or he might want to
-  //       utilize inverse feature of Pfaffine skpfa.
+  //       utilize DSKTRF+LTL2INV
   m=nn=lda=n2;
   //M_ZGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */
   M_DGETRF(&m, &nn, invMat, &lda, iwork2, &info); /* ipiv = iwork */
@@ -601,8 +599,6 @@ double updateMAll_BF_real_child(const int qpidx, const int n, const int *msa,
   /* calculate Pf M */
   //M_ZSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, rwork, &info);
   //M_DSKPFA(&uplo, &mthd, &nn, invM, &lda, &pfaff, iwork, work, &lwork, &info);
-  // As stated above, I'm now only setting Pfaffine to skip optinal inverse.
-  info = 1; // This is parameter for skipping inversion.
   M_DSKPFA(&uplo, &mthd, &nn, mat, &lda, &pfaff, iwork, work, &lwork, &info);
 
   sgn = ( (n*(n-1)/2)%2==0 ) ? 1.0 : -1.0;
diff --git a/src/pfaffine b/src/pfaffine
deleted file mode 160000
index ebbb37ee..00000000
--- a/src/pfaffine
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ebbb37ee4ac9b08b0907e81c5b593df5c975514a
diff --git a/src/pfapack b/src/pfapack
index b655435f..ebaa11b7 160000
--- a/src/pfapack
+++ b/src/pfapack
@@ -1 +1 @@
-Subproject commit b655435f2351c1f18bad0bfb7107822f505332ec
+Subproject commit ebaa11b722dbb5f1ddc62da607592d2e301c0c4a
diff --git a/src/pfupdates/CMakeLists.txt b/src/pfupdates/CMakeLists.txt
index 5f313232..1a06ef74 100644
--- a/src/pfupdates/CMakeLists.txt
+++ b/src/pfupdates/CMakeLists.txt
@@ -5,14 +5,12 @@ if(${CMAKE_PROJECT_NAME} STREQUAL "Project")
   message(FATAL_ERROR "cmake should be executed not for 'src' subdirectory, but for the top directory of mVMC.")
 endif(${CMAKE_PROJECT_NAME} STREQUAL "Project")
 
-add_definitions(-DBLAS_EXTERNAL)
-add_definitions(-DF77_COMPLEX_RET_INTEL)
-include_directories(../pfaffine/src)
+if(${ARCHITECTURE} STREQUAL "x86_64")
+  add_definitions(-DF77_COMPLEX_RET_INTEL)
+endif()
+include_directories(../common)
+include_directories(../ltl2inv)
 
-add_library(pfupdates STATIC
-  pf_interface.cc
-  ../pfaffine/src/skpfa.cc
-  ../pfaffine/src/sktdf.cc
-  ../pfaffine/src/sktdi.cc)
+add_library(pfupdates STATIC pf_interface.cc)
 target_compile_definitions(pfupdates PRIVATE -D_CC_IMPL)
 
diff --git a/src/pfupdates/makefile_pfupdates b/src/pfupdates/makefile_pfupdates
index ee946fc0..bd8502e3 100644
--- a/src/pfupdates/makefile_pfupdates
+++ b/src/pfupdates/makefile_pfupdates
@@ -1,16 +1,13 @@
 include ../make.sys
 
-ifeq ($(BLIS_ROOT),)
-	BLIS_ROOT = ../pfaffine/src/dep
-endif
-
 OBJ = pfupdates.o
 SRC = pf_interface.cc
-HDR = pf_interface.h orbital_mat.tcc skmv.tcc updated_tdi.tcc
+HDR = pf_interface.h orbital_mat.tcc skmv.tcc skr2k.tcc skslc.tcc updated_tdi.tcc \
+	  ../common/blalink.hh ../common/blalink_fort.h ../common/colmaj.hh
 
 $(OBJ): $(SRC) $(HDR) 
 	$(CXX) $(CXXFLAGS) -D_CC_IMPL -c $< -o $@ \
-		-I../pfaffine/src \
+		-I../common -I../ltl2inv \
 		-I$(BLIS_ROOT)/include \
 		-I$(BLIS_ROOT)/include/blis
 
diff --git a/src/pfupdates/orbital_mat.tcc b/src/pfupdates/orbital_mat.tcc
index cb8a6117..3e7ccf76 100644
--- a/src/pfupdates/orbital_mat.tcc
+++ b/src/pfupdates/orbital_mat.tcc
@@ -7,20 +7,10 @@
  */
 #pragma once
 #include "blis.h"
-#include "colmaj.tcc"
+#include "colmaj.hh"
 #include <random>
-#ifdef UseBoost
-#include <boost/numeric/ublas/matrix.hpp>
-#include <boost/numeric/ublas/vector.hpp>
-#include <boost/numeric/ublas/io.hpp>
-template <typename T>
-using matrix_t = boost::numeric::ublas::matrix<T, boost::numeric::ublas::column_major>;
-template <typename T>
-using vector_t = boost::numeric::ublas::vector<T>;
-#else
 template <typename T>
 using matrix_t = colmaj<T>;
-#endif
 
 template <typename T>
 struct orbital_mat
@@ -31,16 +21,7 @@ struct orbital_mat
 
   orbital_mat(uplo_t uplo_, dim_t nsite_, T *X_, inc_t ldX)
       : uplo(uplo_), nsite(nsite_),
-  #ifdef UseBoost
-        X(nsite, nsite) {
-    colmaj<T> X_tmp(X_, ldX);
-    for (dim_t j = 0; j < nsite; ++j)
-      for (dim_t i = 0; i < nsite; ++i)
-        X(i, j) = X_tmp(i, j);
-  }
-  #else
         X(X_, ldX) { }
-  #endif
 
   orbital_mat(uplo_t uplo_, dim_t nsite_, matrix_t<T> &X_)
   : uplo(uplo_), nsite(nsite_), X(X_) { }
diff --git a/src/pfupdates/pf_interface.cc b/src/pfupdates/pf_interface.cc
index a19596a4..a95efe01 100644
--- a/src/pfupdates/pf_interface.cc
+++ b/src/pfupdates/pf_interface.cc
@@ -51,9 +51,9 @@
     objv(iqp, ctype)->initialize(); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -68,9 +68,9 @@ GENIMPL( ccdcmplx, z )
     delete objv(iqp, ctype); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -84,9 +84,9 @@ GENIMPL( ccdcmplx, z )
   for (int iqp = 0; iqp < num_qp; ++iqp) \
     pfav[iqp] = -objv(iqp, ctype)->get_Pfa(); \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -102,9 +102,9 @@ GENIMPL( ccdcmplx, z )
   for (int iqp = 0; iqp < num_qp; ++iqp) \
     objv(iqp, ctype)->push_update_safe(osi, msj, cal_pfa!=0); \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -134,9 +134,9 @@ GENIMPL( ccdcmplx, z )
     objv(iqp, ctype)->push_update(osk, msl, cal_pfa!=0); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
@@ -150,9 +150,9 @@ GENIMPL( ccdcmplx, z )
     objv(iqp, ctype)->pop_update(cal_pfa!=0); \
   } \
 }
-GENIMPL( float,    s )
+// GENIMPL( float,    s )
 GENIMPL( double,   d )
-GENIMPL( ccscmplx, c )
+// GENIMPL( ccscmplx, c )
 GENIMPL( ccdcmplx, z )
 #undef GENIMPL
 
diff --git a/src/pfupdates/pf_interface.h b/src/pfupdates/pf_interface.h
index b9f4cbf2..8593a12c 100644
--- a/src/pfupdates/pf_interface.h
+++ b/src/pfupdates/pf_interface.h
@@ -40,9 +40,9 @@ extern "C" {
       void     *objv[], \
       void     *orbv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -52,9 +52,9 @@ GENDEF( ccdcmplx, z )
       void     *objv[], \
       void     *orbv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -64,9 +64,9 @@ GENDEF( ccdcmplx, z )
       ctype     pfav[], \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -78,9 +78,9 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -94,9 +94,9 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
@@ -106,9 +106,9 @@ GENDEF( ccdcmplx, z )
       int64_t   cal_pfa, \
       void     *objv[] );
 
-GENDEF( float,    s )
+// GENDEF( float,    s )
 GENDEF( double,   d )
-GENDEF( ccscmplx, c )
+// GENDEF( ccscmplx, c )
 GENDEF( ccdcmplx, z )
 #undef GENDEF
 
diff --git a/src/pfupdates/skmv.tcc b/src/pfupdates/skmv.tcc
index 5f484c53..2abc2c6d 100644
--- a/src/pfupdates/skmv.tcc
+++ b/src/pfupdates/skmv.tcc
@@ -6,7 +6,7 @@
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
  */
 #pragma once
-#include "colmaj.tcc"
+#include "colmaj.hh"
 #include "blalink.hh"
 #include <vector>
 
diff --git a/src/pfupdates/skr2k.tcc b/src/pfupdates/skr2k.tcc
new file mode 100644
index 00000000..0b8cdcb6
--- /dev/null
+++ b/src/pfupdates/skr2k.tcc
@@ -0,0 +1,35 @@
+/**
+ * \file skr2k.tcc
+ * Dispatcher and minimal implementation for SKR2k.
+ * Minimal implementation is used when M is extremely small.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "blalink.hh"
+#include "blalink_gemmt.hh"
+#include "colmaj.hh"
+#include <iostream>
+
+
+template <typename T>
+inline void skr2k(uplo_t uploc,
+                  trans_t transab,
+                  dim_t m, dim_t k,
+                  T alpha,
+                  T *_A, inc_t ldA,
+                  T *_B, inc_t ldB,
+                  T beta,
+                  T *_C, inc_t ldC)
+{
+    T one = 1.0;
+    if (transab == BLIS_NO_TRANSPOSE) {
+        gemmt<T>(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC);
+        gemmt<T>(uploc, BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC);
+    } else {
+        gemmt<T>(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k, alpha, _A, ldA, _B, ldB, beta, _C, ldC);
+        gemmt<T>(uploc, BLIS_TRANSPOSE, BLIS_NO_TRANSPOSE, m, k,-alpha, _B, ldB, _A, ldA, one , _C, ldC);
+    }
+}
+
diff --git a/src/pfupdates/skslc.tcc b/src/pfupdates/skslc.tcc
new file mode 100644
index 00000000..0fd5a4cf
--- /dev/null
+++ b/src/pfupdates/skslc.tcc
@@ -0,0 +1,33 @@
+/**
+ * \file skslc.tcc
+ * Get a slice from antisymmetric matrix.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "colmaj.hh"
+
+template<typename T>
+inline signed skslc(uplo_t uploA,
+                    unsigned n,
+                    unsigned i,
+                    T *x,
+                    T *_A, unsigned ldA)
+{
+    colmaj<T> A(_A, ldA);
+    x[i] = 0.0;
+    switch (uploA) {
+    case BLIS_UPPER:
+        for (unsigned j = 0; j < i; ++j)
+            x[j] =  A(j, i);
+        for (unsigned j = i+1; j < n; ++j)
+            x[j] = -A(i, j);
+        return 0;
+
+    default:
+        std::cerr << "SKSLC: Lower triangular storage not implemented. Sorry." << std::endl;
+        return err_info(Pfaffine_NOT_IMPLEMNTED, 0);
+    }
+}
+
diff --git a/src/pfupdates/testupdate.cc b/src/pfupdates/testupdate.cc
new file mode 100644
index 00000000..8f1c237b
--- /dev/null
+++ b/src/pfupdates/testupdate.cc
@@ -0,0 +1,70 @@
+/**
+ * \copyright Copyright (c) Dept. Phys., Univ. Tokyo
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at https://mozilla.org/MPL/2.0/.
+ */
+#include "updated_tdi.tcc"
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+struct info_t {
+    double err;
+    long long tim;
+};
+
+template <typename T>
+info_t test_tdi_update(dim_t nsite, std::vector<dim_t> cfg, std::vector<dim_t> from, std::vector<dim_t> to)
+{
+    dim_t nelec = cfg.size();
+    dim_t mmax = from.size();
+    T err = 0;
+    long long tim;
+
+    T *orb__ = new T[nsite * nsite];
+    colmaj<T> orb_(orb__, nsite);
+    orbital_mat<T> orb(BLIS_UPPER, nsite, orb_);
+    orb.randomize(0.6, 1234);
+
+    T *mat_ = new T[nelec * nelec];
+    updated_tdi<T> tdi(orb, cfg, mat_, nelec, mmax);
+
+    for (dim_t i = 0; i < mmax; ++i) {
+        if (from.at(i) < 0)
+            tdi.pop_update(true);
+        else
+            tdi.push_update_safe(to.at(i), from[i], true);
+
+        std::vector<dim_t> cfg2(tdi.elem_cfg);
+        // Apply hopping.
+        for (dim_t j = 0; j < tdi.from_idx.size(); ++j)
+          cfg2.at(tdi.from_idx.at(j)) = tdi.to_site.at(j);
+
+        T *mat2_ = new T[nelec * nelec];
+        T pfa2 = updated_tdi<T>(orb, cfg2, mat2_, nelec, 1).Pfa;
+
+        double err_ = std::abs(pfa2 - tdi.Pfa * tdi.PfaRatio);
+        printf("Pfa = %e; Pfa diff = %e\n", tdi.Pfa, err_);
+        err += err_;
+
+        delete[] mat2_;
+    }
+
+    delete[] mat_;
+    delete[] orb__;
+
+    return info_t{ err, tim };
+}
+
+int main(void)
+{
+    test_tdi_update<double>(200,
+        std::vector<dim_t>{ 8, 63, 20, 53, 57, 14, 7, 22, 32, 67, 56, 66, 46, 16, 18, 26, 70, 28, 51, 64, 17, 68, 49, 4, 48, 41, 43, 21, 19, 45, 58, 11, 1, 24, 2, 69, 12, 23, 30, 10, 65, 3, 31, 55, 52, 37, 42, 34, 25, 60, 59, 50, 40, 9, 44, 54, 6, 36, 13, 29, 27, 38, 61, 33, 47, 5, 15, 39, 62, 35 },
+        std::vector<dim_t>{  0,  39,  49,  10,  60,   3 },
+        std::vector<dim_t>{ 99, 101, 120, 199, 133, 177 });
+
+    return 0;
+}
+
diff --git a/src/pfupdates/updated_tdi.tcc b/src/pfupdates/updated_tdi.tcc
index 63bc035f..40a338af 100644
--- a/src/pfupdates/updated_tdi.tcc
+++ b/src/pfupdates/updated_tdi.tcc
@@ -8,12 +8,12 @@
 #pragma once
 #include "orbital_mat.tcc"
 #include "blalink.hh"
-#include "skpfa.hh"
-#include "sktdi.hh"
+#include "blalink_gemmt.hh"
+#include "pfaffian.tcc"
+#include "invert.tcc"
 #include "skr2k.tcc"
 #include "skslc.tcc"
 #include "skmv.tcc"
-#include "optpanel.hh"
 #include <iostream>
 #include <vector>
 
@@ -26,8 +26,6 @@ template <typename T> struct updated_tdi {
 
   // Performance tuning constants.
   const bool single_hop_alpha; ///< Whether to simplify k=1 situations.
-  const dim_t npanel_big;
-  const dim_t npanel_sub;
 
   // Matrix M.
   matrix_t<T> M;
@@ -42,7 +40,7 @@ template <typename T> struct updated_tdi {
   matrix_t<T> W;
   matrix_t<T> UMU, UMV, VMV;
   matrix_t<T> Cp;       ///< C+BMB = [ W -I; I 0 ] + [ UMU UMV; -UMV VMV ]
-  matrix_t<T> Gc;       ///< Gaussian vectors when tri-diagonalizing C.
+  matrix_t<T> Gc;       ///< Used to be Gaussian vectors. Now just scratchpads.
   signed *const cPov; ///< Pivot when tri-diagonalizing C.
   T Pfa;
   T PfaRatio; //< Updated Pfaffian / Base Pfaffian.
@@ -55,11 +53,7 @@ template <typename T> struct updated_tdi {
   void initialize() {
     using namespace std;
     auto &cfg = elem_cfg;
-    #ifdef UseBoost
-    matrix_t<T> G(nelec, nelec);
-    #else
     matrix_t<T> G(new T[nelec * nelec], nelec);
-    #endif
     from_idx.clear();
     to_site.clear();
     from_idx.reserve(mmax * sizeof(dim_t));
@@ -81,30 +75,23 @@ template <typename T> struct updated_tdi {
     }
 
     // Allocate scratchpad.
-    signed *iPivFull = new signed[nelec + 1];
-    dim_t lwork = nelec * npanel_big;
-    T *pfwork = new T[lwork];
+    T *vT = new T[nelec-1];
+    signed *iPiv = new signed[nelec];
 
-    #ifdef UseBoost
-    signed info = skpfa(uplo, nelec, &M(0, 0), M.size1(), &G(0, 0), G.size1(), iPivFull,
-                        true, &Pfa, pfwork, lwork);
-    #else
-    signed info = skpfa(uplo, nelec, &M(0, 0), M.ld, &G(0, 0), G.ld, iPivFull,
-                        true, &Pfa, pfwork, lwork);
-    #endif
+    // Perform LTL factorization regardless of `uplo`.
+    signed info = sktrf(uplo, nelec, &M(0, 0), M.ld, iPiv, &G(0, 0), G.ld * nelec);
+    Pfa = ltl2pfa(uplo, nelec, &M(0, 0), M.ld, iPiv);
+    ltl2inv(uplo, nelec, &M(0, 0), M.ld, iPiv, vT, &G(0, 0), G.ld);
 #ifdef _DEBUG
     cerr << "SKPFA+INV: n=" << nelec << " info=" << info << endl;
 #endif
 
-    delete[] iPivFull;
-    delete[] pfwork;
-    #ifndef UseBoost
+    delete[] vT;
+    delete[] iPiv;
     delete[](&G(0, 0));
-    #endif
   }
 
   ~updated_tdi() {
-    #ifndef UseBoost
     delete[](&U(0, 0));
     delete[](&Q(0, 0));
 
@@ -115,7 +102,6 @@ template <typename T> struct updated_tdi {
     delete[](&UMU(0, 0));
     delete[](&UMV(0, 0));
     delete[](&VMV(0, 0));
-    #endif
 
     delete[] cPov;
   }
@@ -123,30 +109,14 @@ template <typename T> struct updated_tdi {
   updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, T *M_, inc_t ldM,
               dim_t mmax_)
       : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-    #ifdef UseBoost
-        M(nelec, nelec),
-        U(nelec, mmax * 2), Q(nelec, mmax * 2),
-        P(nelec, mmax), W(mmax, mmax),
-        UMU(mmax, mmax), UMV(mmax, mmax),
-        VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax),
-        Gc(2 * mmax, 2 * mmax),
-    #else
-        M(M_, ldM),
+        single_hop_alpha(true), M(M_, ldM),
         U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
         P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
         UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
         VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
         Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-    #endif
         cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(cfg),
         from_idx(0), to_site(0), uplo(BLIS_UPPER) {
-    #ifdef UseBoost
-    colmaj<T> M_tmp(M_, ldM);
-    for (dim_t j = 0; j < nelec; ++j)
-      for (dim_t i = 0; i < nelec; ++i)
-        M(i, j) = M_tmp(i, j);
-    #endif
     initialize();
   }
 
@@ -154,8 +124,7 @@ template <typename T> struct updated_tdi {
   updated_tdi(orbital_mat<T> &Xij_, std::vector<dim_t> &cfg, matrix_t<T> &M_,
               dim_t mmax_)
       : Xij(Xij_), nelec(cfg.size()), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-        M(M_),
+        single_hop_alpha(true), M(M_),
         U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
         P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
         UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
@@ -169,31 +138,14 @@ template <typename T> struct updated_tdi {
   // Unsafe construction without initializaion.
   updated_tdi(orbital_mat<T> &Xij_, dim_t nelec_, T *M_, inc_t ldM, dim_t mmax_) 
       : Xij(Xij_), nelec(nelec_), mmax(mmax_), nq_updated(0),
-        single_hop_alpha(true), npanel_big(optpanel(nelec, 4)), npanel_sub(4),
-    #ifdef UseBoost
-        M(nelec, nelec),
-        U(nelec, mmax * 2), Q(nelec, mmax * 2),
-        P(nelec, mmax), W(mmax, mmax),
-        UMU(mmax, mmax), UMV(mmax, mmax),
-        VMV(mmax, mmax), Cp(2 * mmax, 2 * mmax),
-        Gc(2 * mmax, 2 * mmax),
-    #else
-        M(M_, ldM),
+        single_hop_alpha(true), M(M_, ldM),
         U(new T[nelec * mmax * 2], nelec), Q(new T[nelec * mmax * 2], nelec),
         P(&Q(0, mmax), Q.ld), W(new T[mmax * mmax], mmax),
         UMU(new T[mmax * mmax], mmax), UMV(new T[mmax * mmax], mmax),
         VMV(new T[mmax * mmax], mmax), Cp(new T[2 * mmax * 2 * mmax], 2 * mmax),
         Gc(new T[2 * mmax * 2 * mmax], 2 * mmax),
-    #endif
         cPov(new signed[2 * mmax + 1]), Pfa(0.0), PfaRatio(1.0), elem_cfg(nelec, 0),
-        from_idx(0), to_site(0), uplo(BLIS_UPPER) { 
-    #ifdef UseBoost
-    colmaj<T> M_tmp(M_, ldM);
-    for (dim_t j = 0; j < nelec; ++j)
-      for (dim_t i = 0; i < nelec; ++i)
-        M(i, j) = M_tmp(i, j);
-    #endif
-  }
+        from_idx(0), to_site(0), uplo(BLIS_UPPER) { }
 
   T get_Pfa() { return Pfa * PfaRatio; }
 
@@ -241,22 +193,12 @@ template <typename T> struct updated_tdi {
 
     for (; nq_updated < k_cal; ++nq_updated) {
       // Update single column. Use SKMV.
-      #ifdef UseBoost
-      skmv(uplo, n, T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), &Q(0, nq_updated));
-      #else
       skmv(uplo, n, T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), &Q(0, nq_updated));
-      #endif
     } /* else {
       // Update multiple columns. Use SKMM.
-      #ifdef UseBoost
-      skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated,
-           T(1.0), &M(0, 0), M.size1(), &U(0, nq_updated), U.size1(),
-           T(0.0), &Q(0, nq_updated), Q.size1());
-      #else
       skmm(BLIS_LEFT, uplo, BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, n, k_cal - nq_updated,
            T(1.0), &M(0, 0), M.ld, &U(0, nq_updated), U.ld,
            T(0.0), &Q(0, nq_updated), Q.ld);
-      #endif
     }
     nq_updated = k_cal;
     */
@@ -279,6 +221,8 @@ template <typename T> struct updated_tdi {
           // Always assume l < o to calculate one less column of Q.
           UMU(o, l) = dot(n, &U(0, o), 1, &Q(0, l), 1);
           break;
+        default:
+          break;
         }
     return true;
   }
@@ -372,9 +316,6 @@ template <typename T> struct updated_tdi {
     if (compute_pfa) {
       // NOTE: Update k to be new size.
       k += 1;
-      // Allocate scratchpad.
-      dim_t lwork = 2 * k * npanel_sub;
-      T *pfwork = new T[lwork];
 
       // Calculate unupdated columns of U.
       require_Q(false);
@@ -386,25 +327,17 @@ template <typename T> struct updated_tdi {
       // If it's the first update Pfafian can be directly read out.
       if (k == 1) {
         PfaRatio = -UMV(0, 0) + T(1.0);
-        delete[] pfwork;
         return;
       }
 
       // Compute pfaffian.
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
+      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k);
 #ifdef _DEBUG
       cerr << "SKPFA: info=" << info << endl;
 #endif
       // Pfaffian of C = [ W -I; I 0 ].
       PfaRatio *= pow(-1.0, k * (k + 1) / 2);
 
-      delete[] pfwork;
     } else
       // Set to 0.0 to denote dirty.
       PfaRatio = 0.0;
@@ -475,7 +408,9 @@ template <typename T> struct updated_tdi {
       nq_updated = k;
 
     // Compute new (previous, in fact) Pfaffian.
-    if (compute_pfa) {
+    if (!k)
+      PfaRatio = 1.0;
+    else if (compute_pfa) {
       // All compute_pfa requires first K-1 of Q.
       require_Q(false);
 
@@ -483,19 +418,10 @@ template <typename T> struct updated_tdi {
       require_UMU();
       // Reassemble C and scratchpads.
       assemble_C_BMB();
-      dim_t lwork = 2 * k * npanel_sub;
-      T *pfwork = new T[lwork];
-
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
+
+      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &PfaRatio, cPov, &Gc(0, 0), Gc.ld * 2 * k);
       PfaRatio *= pow(-1.0, k * (k + 1) / 2);
 
-      delete[] pfwork;
     } else
       // Set to 0.0 to denote dirty.
       PfaRatio = 0.0;
@@ -510,38 +436,31 @@ template <typename T> struct updated_tdi {
       return;
 
     // Allocate scratchpad.
-    dim_t lwork = 2 * k * npanel_sub;
-    T *pfwork = new T[lwork];
-    
+    T *vT = new T[2 * k - 1];
+
     // Update whole Q.
     require_Q(true);
 
-    // If M is dirty, redo the tridiagonal factorization.
-    if (PfaRatio == T(0.0)) {
-      require_UMU();
-      assemble_C_BMB();
-      #ifdef UseBoost
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #else
-      signed info = skpfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          false, &PfaRatio, pfwork, lwork);
-      #endif
-      PfaRatio *= pow(-1.0, k * (k + 1) / 2);
-    }
-
     if (k == 1) {
+      // M is dirty.
+      if (PfaRatio == T(0.0)) {
+        require_UMU();
+        assemble_C_BMB();
+        PfaRatio = -Cp(0, 1);
+      }
       // Trivial inverse.
       Cp(0, 1) = T(-1.0) / Cp(0, 1);
       Cp(1, 0) = T(-1.0) / Cp(1, 0);
-    } else
-      #ifdef UseBoost
-      signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.size1(), &Gc(0, 0), Gc.size1(), cPov,
-                          pfwork, lwork);
-      #else
-      signed info = sktdi(uplo, 2 * k, &Cp(0, 0), Cp.ld, &Gc(0, 0), Gc.ld, cPov,
-                          pfwork, lwork);
-      #endif
+    } else {
+      // Redo the tridiagonal factorization.
+      require_UMU();
+      assemble_C_BMB();
+      signed info = sktrf(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, &Gc(0, 0), Gc.ld * 2 * k);
+      // PfaRatio is dirty.
+      if (PfaRatio == T(0.0))
+        PfaRatio = ltl2pfa(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov) * T(pow(-1.0, k * (k + 1) / 2));
+      ltl2inv(uplo, 2 * k, &Cp(0, 0), Cp.ld, cPov, vT, &Gc(0, 0), Gc.ld);
+    }
     inv_update(k, Cp);
 
     // Apply hopping.
@@ -553,36 +472,24 @@ template <typename T> struct updated_tdi {
     Pfa *= PfaRatio;
     PfaRatio = 1.0;
 
-    delete[] pfwork;
+    delete[] vT;
   }
 
   void inv_update(dim_t k, matrix_t<T> &C) {
-    #ifdef UseBoost
-    matrix_t<T> &ABC = U; ///< Use U as inv(A)*B*upper(C) buffer.
-    matrix_t<T> &AB = Q;
-    #else
     matrix_t<T> ABC(&U(0, 0), U.ld); ///< Use U as inv(A)*B*upper(C) buffer.
     matrix_t<T> AB(&Q(0, 0), Q.ld);
-    #endif
 
     if (k == 1 && single_hop_alpha) {
       // k == 1 requires no copying
-      #ifdef UseBoost
-      skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.size1(),
-            &P(0, 0), P.size1(), T(1.0), &M(0, 0), M.size1());
-      #else
       skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 1, C(0, 1), &Q(0, 0), Q.ld,
             &P(0, 0), P.ld, T(1.0), &M(0, 0), M.ld);
-      #endif
       // See below for reason of calling this procedule.
       // update_uplo(uplo);
       return;
     }
 
     // Close empty space between Q and P.
-    #ifndef UseBoost
     if (k != mmax)
-    #endif
       for (dim_t j = 0; j < k; ++j)
         memcpy(&AB(0, k + j), &P(0, j), nelec * sizeof(T));
 
@@ -594,22 +501,12 @@ template <typename T> struct updated_tdi {
       //             0 0 0 +
       //             0 0 0 0 ] => AB[:, 0:2] -> ABC[:, 1:3]
       memcpy(&ABC(0, j + 1), &AB(0, j), nelec * sizeof(T));
-    #ifdef UseBoost
-    trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0),
-         &C(0, 1), C.size1(), &ABC(0, 1), ABC.size1());
-    #else
     trmm(BLIS_RIGHT, BLIS_UPPER, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0),
          &C(0, 1), C.ld, &ABC(0, 1), ABC.ld);
-    #endif
 
     // Update: write to M.
-    #ifdef UseBoost
-    skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.size1(),
-          &AB(0, 1), AB.size1(), T(1.0), &M(0, 0), M.size1());
-    #else
     skr2k(uplo, BLIS_NO_TRANSPOSE, nelec, 2 * k - 1, T(1.0), &ABC(0, 1), ABC.ld,
           &AB(0, 1), AB.ld, T(1.0), &M(0, 0), M.ld);
-    #endif
     #else
     gemm(BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE,
          nelec, 2 * k, 2 * k,
diff --git a/tool/makefile_tool b/tool/makefile_tool
index 1d2a177b..e89c7a9a 100644
--- a/tool/makefile_tool
+++ b/tool/makefile_tool
@@ -4,10 +4,10 @@ include ../src/make.sys
 .SUFFIXES : .o .F90
 .SUFFIXES : .o .c
 
-all:greenr2k
+all: greenr2k
 
-greenr2k:greenr2k.o key2lower.o
-	$(F90) greenr2k.o key2lower.o $(LIBS) -o $@
+greenr2k: greenr2k.o
+	$(F90) greenr2k.o $(LIBS) -o $@
 
 .F90.o:
 	$(F90) -c $< $(FFLAGS)