Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

【Sparse】add some kernels(csr*dense->csr, dense*dense->csr) of SparseTensor matmul #42935

Merged
merged 8 commits into from
Jun 15, 2022
5 changes: 5 additions & 0 deletions paddle/fluid/eager/grad_node_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

/**
* Implementation of GradNodeBase, Edge and GradTensorHolder.
Expand Down Expand Up @@ -114,6 +115,10 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
phi::SparseCooTensor* coo_tensor =
static_cast<phi::SparseCooTensor*>(fwd_out.impl().get());
dense_tensor = coo_tensor->mutable_non_zero_elements();
} else if (phi::SparseCsrTensor::classof(fwd_out.impl().get())) {
phi::SparseCsrTensor* csr_tensor =
static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
dense_tensor = csr_tensor->mutable_non_zero_elements();
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
Expand Down
13 changes: 11 additions & 2 deletions paddle/fluid/eager/grad_tensor_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,17 @@ void GradTensorHolder::CopyValueFromTensor(
// Create new tensor->impl and fill it with 1.0
if (t.defined()) {
// Fill 1.0, use full to support complex, one_like don't support it.
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
if (t.is_dense_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
} else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::sparse::full_like(t, 1, t.dtype());
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
"now."));
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class CusparseHandleHolder {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(handle_, stream));
#endif
Expand All @@ -41,7 +41,7 @@ class CusparseHandleHolder {

~CusparseHandleHolder() PADDLE_MAY_THROW {
#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle_));
#endif
#endif
Expand Down
4 changes: 0 additions & 4 deletions paddle/fluid/platform/dynload/cusparse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ namespace dynload {
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
Expand Down
39 changes: 17 additions & 22 deletions paddle/fluid/platform/dynload/cusparse.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,17 @@ namespace dynload {
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
Expand All @@ -59,11 +53,13 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
Expand All @@ -72,8 +68,7 @@ CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
Expand Down
4 changes: 0 additions & 4 deletions paddle/phi/backends/dynload/cusparse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ void *cusparse_dso_handle;
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
Expand Down
43 changes: 19 additions & 24 deletions paddle/phi/backends/dynload/cusparse.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,34 +30,28 @@ extern void *cusparse_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
using Func = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
return reinterpret_cast<Func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
Expand All @@ -71,11 +65,13 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
Expand All @@ -84,8 +80,7 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
Expand Down
13 changes: 11 additions & 2 deletions paddle/phi/backends/gpu/gpu_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,10 @@ struct GPUContext::Impl {

void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }

sparseHandle_t GetSparseHandle() const {
sparseHandle_t GetSparseHandle() {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) phi::InitSparseHandle(&sparse_handle_, stream_);
});
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
Expand Down Expand Up @@ -519,7 +522,12 @@ struct GPUContext::Impl {
}

inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
const std::function<void(sparseHandle_t)>& callback) {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) {
phi::InitSparseHandle(&sparse_handle_, stream_);
}
});
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
Expand Down Expand Up @@ -598,6 +606,7 @@ struct GPUContext::Impl {
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};

std::once_flag flag_sparse_;
std::once_flag flag_blas_;
std::once_flag flag_blaslt_;
std::once_flag flag_dnn_;
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/backends/gpu/gpu_resources.cc
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream));
#endif
Expand All @@ -259,7 +259,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {

void DestroySparseHandle(sparseHandle_t handle) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
if (handle != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle));
handle = nullptr;
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/core/sparse_csr_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ class SparseCsrTensor : public TensorBase,
/// \return The non zero elemetns in original dense tensor.
const DenseTensor& non_zero_elements() const { return non_zero_elements_; }

/// \brief Returns the total number of non zero elements in original dense
/// tensor.
int64_t nnz() const { return non_zero_elements_.numel(); }

/// \brief Return the number of elements contained in original dense tensor
/// \return The number of elements contained in original dense tensor
int64_t numel() const override { return product(dims_); }
Expand Down
96 changes: 96 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2018->2022

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thx

//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

namespace phi {
namespace funcs {
namespace sparse {

template <typename DeviceContext>
class SparseBlas {
public:
explicit SparseBlas(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}

// TODO(zhouwei25): implement "COO @ DENSE -> DENSE" of DSDMM
template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCooTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCsrTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::SparseCsrTensor* mat_c) const;

private:
const DeviceContext& dev_ctx_;
};

template <typename DeviceContext, typename T>
class SparseBlasT : private SparseBlas<DeviceContext> {
public:
using SparseBlas<DeviceContext>::SparseBlas;

template <typename... ARGS>
void DSDMM(ARGS... args) const {
Base()->template DSDMM<T>(args...);
}

template <typename... ARGS>
void SDDMM(ARGS... args) const {
Base()->template SDDMM<T>(args...);
}

private:
const SparseBlas<DeviceContext>* Base() const {
return static_cast<const SparseBlas<DeviceContext>*>(this);
}
};

template <typename DeviceContext, typename T>
inline SparseBlasT<DeviceContext, T> GetSparseBlas(
const DeviceContext& dev_ctx) {
return SparseBlasT<DeviceContext, T>(dev_ctx);
}

} // namespace sparse
} // namespace funcs
} // namespace phi

#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h"
#endif
Loading