Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CudaTensor CMake patch #56

Merged
merged 2 commits into from
Jul 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 25 additions & 28 deletions include/jet/CudaTensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@
#include <curand.h>
#include <cutensor.h>

#include <taskflow/cudaflow.hpp>
#include <taskflow/taskflow.hpp>

namespace {
using namespace Jet::CudaTensorHelpers;
}
Expand All @@ -42,7 +39,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
static CudaTensor<U, D> AddTensors(const CudaTensor<U, D> &A,
const CudaTensor<U, D> &B)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
static const CudaTensor<U, D> zero;

// The zero tensor is used in reductions where the shape of an
Expand Down Expand Up @@ -148,7 +145,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
void InitIndicesAndShape(const std::vector<std::string> &indices,
const std::vector<size_t> &shape)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
Clear_();
shape_ = shape;
indices_ = indices;
Expand All @@ -165,7 +162,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

CudaTensor() : data_{nullptr}
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
T h_dat({.x = 0.0, .y = 0.0});
JET_CUDA_IS_SUCCESS(
cudaMalloc(reinterpret_cast<void **>(&data_), sizeof(T)));
Expand All @@ -184,7 +181,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
const std::vector<size_t> &shape, const std::vector<T> data)
: CudaTensor(indices, shape)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, data.data(),
sizeof(T) * data.size(),
cudaMemcpyHostToDevice));
Expand All @@ -194,7 +191,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
const std::vector<size_t> &shape, const T *data)
: CudaTensor(indices, shape)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpy(
data_, data, sizeof(T) * Jet::Utilities::ShapeToSize(shape),
cudaMemcpyHostToDevice));
Expand All @@ -212,15 +209,15 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

~CudaTensor()
{
JET_CUDA_IS_SUCCESS(tf::cudaScopedDevice ctx(CUDA_DEVICE);
cudaFree(data_));
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaFree(data_));
}

template <class U = T, int D = CUDA_DEVICE>
static CudaTensor<U, D> ContractTensors(const CudaTensor<U, D> &a_tensor,
const CudaTensor<U, D> &b_tensor)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
using namespace Utilities;

auto &&left_indices =
Expand Down Expand Up @@ -298,13 +295,13 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

CudaTensor(CudaTensor &&other) : data_{nullptr}
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
Move_(std::move(other));
}

CudaTensor(const CudaTensor &other) : data_{nullptr}
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
InitIndicesAndShape(other.GetIndices(), other.GetShape());

JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, other.GetData(),
Expand All @@ -315,7 +312,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
template <class CPUData>
CudaTensor(const Tensor<CPUData> &other) : data_{nullptr}
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
static_assert(sizeof(CPUData) == sizeof(T),
"Size of CPU and GPU data types do not match.");

Expand All @@ -327,7 +324,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

template <class CPUData> CudaTensor &operator=(const Tensor<CPUData> &other)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
static_assert(sizeof(CPUData) == sizeof(T),
"Size of CPU and GPU data types do not match.");

Expand All @@ -340,7 +337,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

CudaTensor &operator=(const CudaTensor &other)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
if (this != &other) // not a self-assignment
{
InitIndicesAndShape(other.GetIndices(), other.GetShape());
Expand All @@ -360,37 +357,37 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

inline void CopyHostDataToGpu(T *host_tensor)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpy(
data_, host_tensor, sizeof(T) * GetSize(), cudaMemcpyHostToDevice));
}

inline void CopyGpuDataToHost(T *host_tensor) const
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpy(
host_tensor, data_, sizeof(T) * GetSize(), cudaMemcpyDeviceToHost));
}

inline void CopyGpuDataToGpu(T *host_tensor)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpy(host_tensor, data_,
sizeof(T) * GetSize(),
cudaMemcpyDeviceToDevice));
}

inline void AsyncCopyHostDataToGpu(T *host_tensor, cudaStream_t stream = 0)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(data_, host_tensor,
sizeof(T) * GetSize(),
cudaMemcpyHostToDevice, stream));
}

inline void AsyncCopyGpuDataToHost(T *host_tensor, cudaStream_t stream = 0)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(host_tensor, data_,
sizeof(T) * GetSize(),
cudaMemcpyDeviceToHost, stream));
Expand All @@ -403,7 +400,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

explicit operator Tensor<std::complex<scalar_type_t_precision>>() const
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
std::vector<std::complex<scalar_type_t_precision>> host_data(
GetSize(), {0.0, 0.0});

Expand All @@ -421,7 +418,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
*/
void FillRandom(size_t seed)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
static curandGenerator_t rng;
JET_CURAND_IS_SUCCESS(
curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
Expand All @@ -437,7 +434,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
*/
void FillRandom()
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
static curandGenerator_t rng;
JET_CURAND_IS_SUCCESS(
curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
Expand Down Expand Up @@ -478,7 +475,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {

~CudaContractionPlan()
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_CUDA_IS_SUCCESS(cudaFree(work));
}
};
Expand Down Expand Up @@ -675,7 +672,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
static CudaTensor<U, D> Reshape(const CudaTensor<U> &old_tensor,
const std::vector<size_t> &new_shape)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
JET_ABORT_IF_NOT(old_tensor.GetSize() ==
Jet::Utilities::ShapeToSize(new_shape),
"Size is inconsistent between tensors.");
Expand Down Expand Up @@ -706,7 +703,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
Transpose(const CudaTensor<U, D> &tensor,
const std::vector<std::string> &new_indices)
{
tf::cudaScopedDevice ctx(CUDA_DEVICE);
CudaScopedDevice ctx(CUDA_DEVICE);
using namespace Jet::Utilities;

if (tensor.GetIndices() == new_indices)
Expand Down Expand Up @@ -870,7 +867,7 @@ template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
const std::string &index_str,
size_t index_value)
{
tf::cudaScopedDevice ctx(D);
CudaScopedDevice ctx(D);
std::vector<std::string> new_indices = tens.GetIndices();
std::vector<std::string> old_indices = tens.GetIndices();

Expand Down
47 changes: 47 additions & 0 deletions include/jet/CudaTensorHelpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,5 +176,52 @@ ReverseVector(const std::vector<DataType> &input)
return std::vector<DataType>{input.rbegin(), input.rend()};
}

/** @class CudaScopedDevice

@brief RAII-styled device context switch. Code adapted from Taskflow.

CudaScopedDevice is neither movable nor copyable.
*/
class CudaScopedDevice {

  public:
    /**
    @brief Constructs a RAII-styled device switcher.

    @param device Device context to scope in the guard.
    */
    explicit CudaScopedDevice(int device);

    /**
    @brief Destructs the guard and switches back to the previous device
    context.
    */
    ~CudaScopedDevice();

  private:
    CudaScopedDevice() = delete;
    CudaScopedDevice(const CudaScopedDevice &) = delete;
    CudaScopedDevice(CudaScopedDevice &&) = delete;
    // Deleting only the constructors still leaves an implicitly-generated
    // (deprecated) copy-assignment operator; delete assignment explicitly.
    CudaScopedDevice &operator=(const CudaScopedDevice &) = delete;
    CudaScopedDevice &operator=(CudaScopedDevice &&) = delete;

    /// Device active before the switch, or -1 when no restore is needed.
    int prev_device_;
};

inline CudaScopedDevice::CudaScopedDevice(int device)
{
    JET_CUDA_IS_SUCCESS(cudaGetDevice(&prev_device_));
    if (prev_device_ == device) {
        // Already on the requested device: mark that no restore is required.
        prev_device_ = -1;
    }
    else {
        JET_CUDA_IS_SUCCESS(cudaSetDevice(device));
    }
}

inline CudaScopedDevice::~CudaScopedDevice()
{
    if (prev_device_ != -1) {
        // Error deliberately ignored: destructors must not throw.
        cudaSetDevice(prev_device_);
    }
}

} // namespace CudaTensorHelpers
} // namespace Jet