Merge pull request #232 from bd4/pr/sparse-solver2

Pr/sparse solver2
wdmapp · Jan 4, 2023 · 3e1be44 · 3e1be44
2 parents 3a94d03 + cff2152
commit 3e1be44
Show file tree

Hide file tree

Showing 18 changed files with 1,710 additions and 60 deletions.
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
@@ -199,15 +199,15 @@ jobs:
       env:
         CXX: g++-7
     - name: cmake thrust
-      run: cmake -S . -B build-cuda -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-cuda -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake thrust build
       run: cmake --build build-cuda -v
     - name: cmake thrust debug
-      run: cmake -S . -B build-cuda-debug -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-cuda-debug -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake thrust debug build
       run: cmake --build build-cuda-debug -v
     - name: cmake nothrust
-      run: cmake -S . -B build-cuda-nothrust -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-cuda-nothrust -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake nothrust build
       run: cmake --build build-cuda-nothrust -v
     - name: GNU make setup gtensor subdir
@@ -233,7 +233,7 @@ jobs:
     - name: install core packages
       run: apt-get update -y && apt-get install -y wget git cmake clang
     - name: install extra ROCm packages
-      run: apt-get install -y rocthrust rocprim rocfft rocblas hipfft rocsolver
+      run: apt-get install -y rocthrust rocprim rocfft rocblas hipfft rocsolver rocsparse
     - name: install googletest
       run: |
         mkdir -p ${{ env.GTEST_ROOT }}
@@ -245,15 +245,15 @@ jobs:
       env:
         CXX: clang++
     - name: cmake thrust
-      run: cmake -S . -B build-hip -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-hip -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake thrust build
       run: cmake --build build-hip -v
     - name: cmake thrust debug
-      run: cmake -S . -B build-hip-debug -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-hip-debug -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake thrust debug build
       run: cmake --build build-hip-debug -v
     - name: cmake nothrust
-      run: cmake -S . -B build-hip-nothrust -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
+      run: cmake -S . -B build-hip-nothrust -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
     - name: cmake nothrust build
       run: cmake --build build-hip-nothrust -v
     - name: GNU make setup gtensor subdir

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -185,6 +185,11 @@ if ("hip" IN_LIST GTENSOR_BUILD_DEVICES)
                         "${ROCM_PATH}/lib/librocsolver.so")
   target_include_directories(rocsolver INTERFACE "${ROCM_PATH}/include")
 
+  add_library(rocsparse INTERFACE IMPORTED)
+  target_link_libraries(rocsparse INTERFACE
+                        "${ROCM_PATH}/lib/librocsparse.so")
+  target_include_directories(rocsparse INTERFACE "${ROCM_PATH}/include")
+
   # NB: in ROCm, rocfft library / package was deprecated in favor of
   # hipfft library / package.
   add_library(hipfft INTERFACE IMPORTED)
@@ -449,6 +454,11 @@ if (GTENSOR_ENABLE_BLAS)
     add_library(gtsolver)
     target_gtensor_sources(gtsolver PRIVATE src/gtsolver.cxx)
     target_link_libraries(gtsolver gtblas)
+    if (${GTENSOR_DEVICE} STREQUAL "cuda")
+      target_link_libraries(gtsolver CUDA::cusparse)
+    elseif (${GTENSOR_DEVICE} STREQUAL "hip")
+      target_link_libraries(gtsolver rocsparse)
+    endif()
     list(APPEND GTENSOR_TARGETS gtsolver)
     add_library(gtensor::gtsolver ALIAS gtsolver)
   endif()

diff --git a/README.md b/README.md
@@ -455,3 +455,67 @@ native stream object is a `sycl::queue`.
 
 See also `tests/test_stream.cxx`. Note that this API is likely to change; in
 particular, the stream objects will become templated on space type.
+
+# Library Wrapper Extensions
+
+## gt-blas
+
+Provides wrappers around commonly used blas routines. Requires cuBLAS, rocblas,
+or oneMKL, depending on the GPU backend. Interface is mostly C style taking
+raw pointers, for easy interoperability with Fortran, with a few higher level
+gtensor specific helpers.
+
+```
+#include "gt-blas/blas.h"
+
+void blas()
+{
+  gt::blas::handle_t h;
+  gt::gtensor_device<double, 1> x = gt::arange<double>(1, 11);
+  gt::gtensor_device<double, 1> y = gt::arange<double>(1, 11);
+  gt::blas::axpy(h, 2.0, x, y);
+  std::cout << "a*x+y = " << y << std::endl;
+  /* a*x+y = { 3 6 9 12 15 18 21 24 27 30 } */
+}
+```
+
+A naive banded LU solver implementation is also provided, useful in cases where
+the matrices are banded and the native GPU batched LU solve has not been
+optimized yet. Parallelism for this implementaiton is on batch only.
+
+## gt-fft
+
+Provides high level C++ style interface around cuFFT, rocFFT, and oneMKL DFT.
+
+```
+#include "gt-fft/fft.h"
+
+void fft()
+{
+  // python: x = np.array([2., 3., -1., 4.])
+  gt::gtensor_device<double, 1> x = {2, 3, -1, 4};
+  auto y = gt::empty_device<gt::complex<double>>({3});
+
+  // python: y = np.fft.fft(x)
+  gt::fft::FFTPlanMany<gt::fft::Domain::REAL, double> plan({x.shape(0)}, 1);
+  plan(x, y);
+  std::cout << y << std::endl;
+  /* { (8,0) (3,1) (-6,0) } */
+}
+```
+
+## gt-solver
+
+Provides a high level C++ style interface around batched LU solve, in
+particular the case where a single set of matrices is used repeatedly to solve
+with different right hand side vectors. It maintains it's own contiguous copy
+of the factored matrices in device memory, and device buffers for staging input
+and output right hand side vectors. This allows the application to build the
+matrices on host and pass them off to the solver interface for all the device
+specific handling. On some platforms, there are performance issues with solving
+out of managed memory, and the internal device buffers can significantly
+improve performance on these platforms.
+
+This should be preferred over directly calling gt-blas routines like getrf and
+getrs, when the use case matches (single factor and many solves with different
+right hand sides).
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -38,3 +38,9 @@ if (GTENSOR_ENABLE_FFT)
   target_gtensor_sources(bench_fft PRIVATE bench_fft.cxx)
   target_link_libraries(bench_fft gtensor::gtensor gtensor::gtfft benchmark::benchmark)
 endif()
+
+if (GTENSOR_ENABLE_SOLVER)
+  add_executable(bench_solver)
+  target_gtensor_sources(bench_solver PRIVATE bench_solver.cxx)
+  target_link_libraries(bench_solver gtensor::gtensor gtensor::gtsolver benchmark::benchmark)
+endif()
diff --git a/benchmarks/bench_solver.cxx b/benchmarks/bench_solver.cxx
@@ -0,0 +1,135 @@
+#include <iostream>
+#include <numeric>
+#include <string>
+
+#include <benchmark/benchmark.h>
+
+#include <gtensor/gtensor.h>
+
+#include <gt-solver/solver.h>
+
+#include "gt-bm.h"
+
+using namespace gt::placeholders;
+
+// #define BENCH_SOLVER_DEBUG
+
+template <typename T>
+auto make_test_matrix(int n, int bw, int batch_size, bool needs_pivot)
+{
+  auto h_Adata = gt::zeros<T>({n, n, batch_size});
+  for (int b = 0; b < batch_size; b++) {
+    for (int i = 0; i < n; i++) {
+      h_Adata(i, i, b) = T(bw + 1.0);
+      // set upper / lower diags at bw diagonal
+      for (int d = 1; d <= bw; d++) {
+        if (i + d < n) {
+          h_Adata(i, i + d, b) = T(-1.0);
+          h_Adata(i + d, i, b) = T(-0.5);
+        }
+      }
+    }
+    if (needs_pivot) {
+      h_Adata(0, 0, b) = T(n / 64.0);
+    }
+  }
+  return h_Adata;
+}
+
+// ======================================================================
+// BM_solver
+//
+
+// args: int N, int BW, int NRHS, int NBATCH
+template <typename Solver>
+static void BM_solver(benchmark::State& state)
+{
+  using T = typename Solver::value_type;
+  using R = gt::complex_subtype_t<T>;
+
+  const int N = state.range(0);
+  const int BW = state.range(1);
+  const int NRHS = state.range(2);
+  const int NBATCH = state.range(3);
+
+  auto h_A = make_test_matrix<T>(N, BW, NBATCH, true);
+
+  gt::gtensor<T, 3> h_rhs(gt::shape(N, NRHS, NBATCH));
+  gt::gtensor<T, 3> h_result(h_rhs.shape());
+  gt::gtensor_device<T, 3> d_rhs(h_rhs.shape());
+  gt::gtensor_device<T, 3> d_result(h_rhs.shape());
+
+  gt::gtensor<T*, 1> h_Aptr(NBATCH);
+
+  h_Aptr(0) = gt::raw_pointer_cast(h_A.data());
+  for (int b = 1; b < NBATCH; b++) {
+    h_Aptr(b) = h_Aptr(0) + (N * N * b);
+    for (int i = 0; i < N; i++) {
+      for (int rhs = 0; rhs < NRHS; rhs++) {
+        h_rhs(i, rhs, b) = T(1.0 + R(rhs) / NRHS);
+      }
+    }
+  }
+
+  gt::copy(h_rhs, d_rhs);
+
+  gt::blas::handle_t h;
+
+  Solver s(h, N, NBATCH, NRHS, gt::raw_pointer_cast(h_Aptr.data()));
+
+  std::cout << typeid(Solver).name() << " devmem bytes "
+            << static_cast<double>(s.get_device_memory_usage()) / (1024 * 1024)
+            << " MB" << std::endl;
+
+  auto fn = [&]() {
+    s.solve(d_rhs.data().get(), d_result.data().get());
+    gt::synchronize();
+  };
+
+  // warm up, device compile, check
+  fn();
+
+  for (auto _ : state) {
+    fn();
+  }
+}
+
+// Solver, N, BW, NRHS, NBATCH
+BENCHMARK(BM_solver<gt::solver::solver_dense<double>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_solver<gt::solver::solver_invert<double>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_solver<gt::solver::solver_sparse<double>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({512, 5, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Args({210, 5, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_solver<gt::solver::solver_dense<gt::complex<double>>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_solver<gt::solver::solver_invert<gt::complex<double>>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+
+// oneMKL sparse API does not yet support complex
+#ifndef GTENSOR_DEVICE_SYCL
+BENCHMARK(BM_solver<gt::solver::solver_sparse<gt::complex<double>>>)
+  ->Args({512, 32, 1, 64})
+  ->Args({512, 5, 1, 64})
+  ->Args({210, 32, 1, 256})
+  ->Args({210, 5, 1, 256})
+  ->Unit(benchmark::kMillisecond);
+#endif
+
+BENCHMARK_MAIN();
diff --git a/include/gt-blas/backend/cuda.h b/include/gt-blas/backend/cuda.h
@@ -317,6 +317,8 @@ inline void getrf_npvt_batched(handle_t& h, int n, T** d_Aarray, int lda,
 CREATE_GETRF_NPVT_BATCHED(cublasZgetrfBatched, gt::complex<double>,
                           cuDoubleComplex)
 CREATE_GETRF_NPVT_BATCHED(cublasCgetrfBatched, gt::complex<float>, cuComplex)
+CREATE_GETRF_NPVT_BATCHED(cublasDgetrfBatched, double, double)
+CREATE_GETRF_NPVT_BATCHED(cublasSgetrfBatched, float, float)
 
 #undef CREATE_GETRF_NPVT_BATCHED
 

diff --git a/include/gt-blas/backend/hip.h b/include/gt-blas/backend/hip.h
@@ -319,6 +319,8 @@ CREATE_GETRF_NPVT_BATCHED(rocsolver_zgetrf_npvt_batched, gt::complex<double>,
                           rocblas_double_complex)
 CREATE_GETRF_NPVT_BATCHED(rocsolver_cgetrf_npvt_batched, gt::complex<float>,
                           rocblas_float_complex)
+CREATE_GETRF_NPVT_BATCHED(rocsolver_dgetrf_npvt_batched, double, double)
+CREATE_GETRF_NPVT_BATCHED(rocsolver_sgetrf_npvt_batched, float, float)
 
 #undef CREATE_GETRF_NPVT_BATCHED