Skip to content

Commit

Permalink
Merge pull request #232 from bd4/pr/sparse-solver2
Browse files Browse the repository at this point in the history
Pr/sparse solver2
  • Loading branch information
bd4 authored Jan 4, 2023
2 parents 3a94d03 + cff2152 commit 3e1be44
Show file tree
Hide file tree
Showing 18 changed files with 1,710 additions and 60 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ccpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -199,15 +199,15 @@ jobs:
env:
CXX: g++-7
- name: cmake thrust
run: cmake -S . -B build-cuda -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-cuda -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake thrust build
run: cmake --build build-cuda -v
- name: cmake thrust debug
run: cmake -S . -B build-cuda-debug -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-cuda-debug -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake thrust debug build
run: cmake --build build-cuda-debug -v
- name: cmake nothrust
run: cmake -S . -B build-cuda-nothrust -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-cuda-nothrust -DGTENSOR_DEVICE=cuda -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake nothrust build
run: cmake --build build-cuda-nothrust -v
- name: GNU make setup gtensor subdir
Expand All @@ -233,7 +233,7 @@ jobs:
- name: install core packages
run: apt-get update -y && apt-get install -y wget git cmake clang
- name: install extra ROCm packages
run: apt-get install -y rocthrust rocprim rocfft rocblas hipfft rocsolver
run: apt-get install -y rocthrust rocprim rocfft rocblas hipfft rocsolver rocsparse
- name: install googletest
run: |
mkdir -p ${{ env.GTEST_ROOT }}
Expand All @@ -245,15 +245,15 @@ jobs:
env:
CXX: clang++
- name: cmake thrust
run: cmake -S . -B build-hip -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-hip -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake thrust build
run: cmake --build build-hip -v
- name: cmake thrust debug
run: cmake -S . -B build-hip-debug -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-hip-debug -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=Debug -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=ON -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake thrust debug build
run: cmake --build build-hip-debug -v
- name: cmake nothrust
run: cmake -S . -B build-hip-nothrust -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON
run: cmake -S . -B build-hip-nothrust -DGTENSOR_DEVICE=hip -DCMAKE_BUILD_TYPE=RelWithDebInfo -DGTENSOR_BUILD_EXAMPLES=ON -DGTENSOR_USE_THRUST=OFF -DGTEST_ROOT=${{ env.GTEST_ROOT }} -DGTENSOR_ENABLE_CLIB=ON -DGTENSOR_ENABLE_BLAS=ON -DGTENSOR_ENABLE_FFT=ON -DGTENSOR_ENABLE_SOLVER=ON
- name: cmake nothrust build
run: cmake --build build-hip-nothrust -v
- name: GNU make setup gtensor subdir
Expand Down
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,11 @@ if ("hip" IN_LIST GTENSOR_BUILD_DEVICES)
"${ROCM_PATH}/lib/librocsolver.so")
target_include_directories(rocsolver INTERFACE "${ROCM_PATH}/include")

add_library(rocsparse INTERFACE IMPORTED)
target_link_libraries(rocsparse INTERFACE
"${ROCM_PATH}/lib/librocsparse.so")
target_include_directories(rocsparse INTERFACE "${ROCM_PATH}/include")

# NB: in ROCm, rocfft library / package was deprecated in favor of
# hipfft library / package.
add_library(hipfft INTERFACE IMPORTED)
Expand Down Expand Up @@ -449,6 +454,11 @@ if (GTENSOR_ENABLE_BLAS)
add_library(gtsolver)
target_gtensor_sources(gtsolver PRIVATE src/gtsolver.cxx)
target_link_libraries(gtsolver gtblas)
if (${GTENSOR_DEVICE} STREQUAL "cuda")
target_link_libraries(gtsolver CUDA::cusparse)
elseif (${GTENSOR_DEVICE} STREQUAL "hip")
target_link_libraries(gtsolver rocsparse)
endif()
list(APPEND GTENSOR_TARGETS gtsolver)
add_library(gtensor::gtsolver ALIAS gtsolver)
endif()
Expand Down
64 changes: 64 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,67 @@ native stream object is a `sycl::queue`.
See also `tests/test_stream.cxx`. Note that this API is likely to change; in
particular, the stream objects will become templated on space type.
# Library Wrapper Extensions
## gt-blas
Provides wrappers around commonly used blas routines. Requires cuBLAS, rocblas,
or oneMKL, depending on the GPU backend. Interface is mostly C style taking
raw pointers, for easy interoperability with Fortran, with a few higher level
gtensor specific helpers.
```
#include "gt-blas/blas.h"

void blas()
{
gt::blas::handle_t h;
gt::gtensor_device<double, 1> x = gt::arange<double>(1, 11);
gt::gtensor_device<double, 1> y = gt::arange<double>(1, 11);
gt::blas::axpy(h, 2.0, x, y);
std::cout << "a*x+y = " << y << std::endl;
/* a*x+y = { 3 6 9 12 15 18 21 24 27 30 } */
}
```
A naive banded LU solver implementation is also provided, useful in cases where
the matrices are banded and the native GPU batched LU solve has not been
optimized yet. Parallelism for this implementaiton is on batch only.
## gt-fft
Provides high level C++ style interface around cuFFT, rocFFT, and oneMKL DFT.
```
#include "gt-fft/fft.h"

void fft()
{
// python: x = np.array([2., 3., -1., 4.])
gt::gtensor_device<double, 1> x = {2, 3, -1, 4};
auto y = gt::empty_device<gt::complex<double>>({3});

// python: y = np.fft.fft(x)
gt::fft::FFTPlanMany<gt::fft::Domain::REAL, double> plan({x.shape(0)}, 1);
plan(x, y);
std::cout << y << std::endl;
/* { (8,0) (3,1) (-6,0) } */
}
```
## gt-solver
Provides a high level C++ style interface around batched LU solve, in
particular the case where a single set of matrices is used repeatedly to solve
with different right hand side vectors. It maintains it's own contiguous copy
of the factored matrices in device memory, and device buffers for staging input
and output right hand side vectors. This allows the application to build the
matrices on host and pass them off to the solver interface for all the device
specific handling. On some platforms, there are performance issues with solving
out of managed memory, and the internal device buffers can significantly
improve performance on these platforms.
This should be preferred over directly calling gt-blas routines like getrf and
getrs, when the use case matches (single factor and many solves with different
right hand sides).
6 changes: 6 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ if (GTENSOR_ENABLE_FFT)
target_gtensor_sources(bench_fft PRIVATE bench_fft.cxx)
target_link_libraries(bench_fft gtensor::gtensor gtensor::gtfft benchmark::benchmark)
endif()

if (GTENSOR_ENABLE_SOLVER)
add_executable(bench_solver)
target_gtensor_sources(bench_solver PRIVATE bench_solver.cxx)
target_link_libraries(bench_solver gtensor::gtensor gtensor::gtsolver benchmark::benchmark)
endif()
135 changes: 135 additions & 0 deletions benchmarks/bench_solver.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#include <iostream>
#include <numeric>
#include <string>

#include <benchmark/benchmark.h>

#include <gtensor/gtensor.h>

#include <gt-solver/solver.h>

#include "gt-bm.h"

using namespace gt::placeholders;

// #define BENCH_SOLVER_DEBUG

template <typename T>
auto make_test_matrix(int n, int bw, int batch_size, bool needs_pivot)
{
auto h_Adata = gt::zeros<T>({n, n, batch_size});
for (int b = 0; b < batch_size; b++) {
for (int i = 0; i < n; i++) {
h_Adata(i, i, b) = T(bw + 1.0);
// set upper / lower diags at bw diagonal
for (int d = 1; d <= bw; d++) {
if (i + d < n) {
h_Adata(i, i + d, b) = T(-1.0);
h_Adata(i + d, i, b) = T(-0.5);
}
}
}
if (needs_pivot) {
h_Adata(0, 0, b) = T(n / 64.0);
}
}
return h_Adata;
}

// ======================================================================
// BM_solver
//

// args: int N, int BW, int NRHS, int NBATCH
template <typename Solver>
static void BM_solver(benchmark::State& state)
{
using T = typename Solver::value_type;
using R = gt::complex_subtype_t<T>;

const int N = state.range(0);
const int BW = state.range(1);
const int NRHS = state.range(2);
const int NBATCH = state.range(3);

auto h_A = make_test_matrix<T>(N, BW, NBATCH, true);

gt::gtensor<T, 3> h_rhs(gt::shape(N, NRHS, NBATCH));
gt::gtensor<T, 3> h_result(h_rhs.shape());
gt::gtensor_device<T, 3> d_rhs(h_rhs.shape());
gt::gtensor_device<T, 3> d_result(h_rhs.shape());

gt::gtensor<T*, 1> h_Aptr(NBATCH);

h_Aptr(0) = gt::raw_pointer_cast(h_A.data());
for (int b = 1; b < NBATCH; b++) {
h_Aptr(b) = h_Aptr(0) + (N * N * b);
for (int i = 0; i < N; i++) {
for (int rhs = 0; rhs < NRHS; rhs++) {
h_rhs(i, rhs, b) = T(1.0 + R(rhs) / NRHS);
}
}
}

gt::copy(h_rhs, d_rhs);

gt::blas::handle_t h;

Solver s(h, N, NBATCH, NRHS, gt::raw_pointer_cast(h_Aptr.data()));

std::cout << typeid(Solver).name() << " devmem bytes "
<< static_cast<double>(s.get_device_memory_usage()) / (1024 * 1024)
<< " MB" << std::endl;

auto fn = [&]() {
s.solve(d_rhs.data().get(), d_result.data().get());
gt::synchronize();
};

// warm up, device compile, check
fn();

for (auto _ : state) {
fn();
}
}

// Solver, N, BW, NRHS, NBATCH
BENCHMARK(BM_solver<gt::solver::solver_dense<double>>)
->Args({512, 32, 1, 64})
->Args({210, 32, 1, 256})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_solver<gt::solver::solver_invert<double>>)
->Args({512, 32, 1, 64})
->Args({210, 32, 1, 256})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_solver<gt::solver::solver_sparse<double>>)
->Args({512, 32, 1, 64})
->Args({512, 5, 1, 64})
->Args({210, 32, 1, 256})
->Args({210, 5, 1, 256})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_solver<gt::solver::solver_dense<gt::complex<double>>>)
->Args({512, 32, 1, 64})
->Args({210, 32, 1, 256})
->Unit(benchmark::kMillisecond);

BENCHMARK(BM_solver<gt::solver::solver_invert<gt::complex<double>>>)
->Args({512, 32, 1, 64})
->Args({210, 32, 1, 256})
->Unit(benchmark::kMillisecond);

// oneMKL sparse API does not yet support complex
#ifndef GTENSOR_DEVICE_SYCL
BENCHMARK(BM_solver<gt::solver::solver_sparse<gt::complex<double>>>)
->Args({512, 32, 1, 64})
->Args({512, 5, 1, 64})
->Args({210, 32, 1, 256})
->Args({210, 5, 1, 256})
->Unit(benchmark::kMillisecond);
#endif

BENCHMARK_MAIN();
2 changes: 2 additions & 0 deletions include/gt-blas/backend/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,8 @@ inline void getrf_npvt_batched(handle_t& h, int n, T** d_Aarray, int lda,
CREATE_GETRF_NPVT_BATCHED(cublasZgetrfBatched, gt::complex<double>,
cuDoubleComplex)
CREATE_GETRF_NPVT_BATCHED(cublasCgetrfBatched, gt::complex<float>, cuComplex)
CREATE_GETRF_NPVT_BATCHED(cublasDgetrfBatched, double, double)
CREATE_GETRF_NPVT_BATCHED(cublasSgetrfBatched, float, float)

#undef CREATE_GETRF_NPVT_BATCHED

Expand Down
2 changes: 2 additions & 0 deletions include/gt-blas/backend/hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,8 @@ CREATE_GETRF_NPVT_BATCHED(rocsolver_zgetrf_npvt_batched, gt::complex<double>,
rocblas_double_complex)
CREATE_GETRF_NPVT_BATCHED(rocsolver_cgetrf_npvt_batched, gt::complex<float>,
rocblas_float_complex)
CREATE_GETRF_NPVT_BATCHED(rocsolver_dgetrf_npvt_batched, double, double)
CREATE_GETRF_NPVT_BATCHED(rocsolver_sgetrf_npvt_batched, float, float)

#undef CREATE_GETRF_NPVT_BATCHED

Expand Down
Loading

0 comments on commit 3e1be44

Please sign in to comment.