diff --git a/include/jet/CudaTensor.hpp b/include/jet/CudaTensor.hpp
index c3f9ec1e..c1931f67 100644
--- a/include/jet/CudaTensor.hpp
+++ b/include/jet/CudaTensor.hpp
@@ -19,9 +19,6 @@
 #include <curand.h>
 #include <cutensor.h>
 
-#include <taskflow/cudaflow.hpp>
-#include <taskflow/taskflow.hpp>
-
 namespace {
 using namespace Jet::CudaTensorHelpers;
 }
@@ -42,7 +39,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     static CudaTensor AddTensors(const CudaTensor &A, const CudaTensor &B)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         static const CudaTensor zero;
 
         // The zero tensor is used in reductions where the shape of an
@@ -148,7 +145,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     void InitIndicesAndShape(const std::vector<std::string> &indices,
                              const std::vector<size_t> &shape)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         Clear_();
         shape_ = shape;
         indices_ = indices;
@@ -165,7 +162,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     CudaTensor() : data_{nullptr}
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         T h_dat({.x = 0.0, .y = 0.0});
         JET_CUDA_IS_SUCCESS(
             cudaMalloc(reinterpret_cast<void **>(&data_), sizeof(T)));
@@ -184,7 +181,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
                const std::vector<size_t> &shape, const std::vector<T> data)
         : CudaTensor(indices, shape)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, data.data(),
                                        sizeof(T) * data.size(),
                                        cudaMemcpyHostToDevice));
@@ -194,7 +191,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
                const std::vector<size_t> &shape, const T *data)
         : CudaTensor(indices, shape)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, data, sizeof(T) * Jet::Utilities::ShapeToSize(shape),
             cudaMemcpyHostToDevice));
@@ -212,15 +209,15 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     ~CudaTensor()
     {
-        JET_CUDA_IS_SUCCESS(tf::cudaScopedDevice ctx(CUDA_DEVICE);
-                            cudaFree(data_));
+        CudaScopedDevice ctx(CUDA_DEVICE);
+        JET_CUDA_IS_SUCCESS(cudaFree(data_));
     }
 
     template <class U = T>
     static CudaTensor<U> ContractTensors(const CudaTensor<U> &a_tensor,
                                          const CudaTensor<U> &b_tensor)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Utilities;
 
         auto &&left_indices =
@@ -298,13 +295,13 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     CudaTensor(CudaTensor &&other) : data_{nullptr}
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         Move_(std::move(other));
     }
 
     CudaTensor(const CudaTensor &other) : data_{nullptr}
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         InitIndicesAndShape(other.GetIndices(), other.GetShape());
 
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, other.GetData(),
@@ -315,7 +312,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     template <class CPUData>
     CudaTensor(const Tensor<CPUData> &other) : data_{nullptr}
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
@@ -327,7 +324,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     template <class CPUData>
     CudaTensor &operator=(const Tensor<CPUData> &other)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
@@ -340,7 +337,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     CudaTensor &operator=(const CudaTensor &other)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         if (this != &other) // not a self-assignment
         {
             InitIndicesAndShape(other.GetIndices(), other.GetShape());
@@ -360,21 +357,21 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     inline void CopyHostDataToGpu(T *host_tensor)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, host_tensor, sizeof(T) * GetSize(), cudaMemcpyHostToDevice));
     }
 
     inline void CopyGpuDataToHost(T *host_tensor) const
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             host_tensor, data_, sizeof(T) * GetSize(), cudaMemcpyDeviceToHost));
     }
 
     inline void CopyGpuDataToGpu(T *host_tensor)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(host_tensor, data_,
                                        sizeof(T) * GetSize(),
                                        cudaMemcpyDeviceToDevice));
@@ -382,7 +379,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     inline void AsyncCopyHostDataToGpu(T *host_tensor, cudaStream_t stream = 0)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(data_, host_tensor,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyHostToDevice, stream));
@@ -390,7 +387,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     inline void AsyncCopyGpuDataToHost(T *host_tensor, cudaStream_t stream = 0)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(host_tensor, data_,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyDeviceToHost, stream));
@@ -403,7 +400,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     explicit operator Tensor<std::complex<scalar_type_t_precision>>() const
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         std::vector<std::complex<scalar_type_t_precision>> host_data(
             GetSize(), {0.0, 0.0});
 
@@ -421,7 +418,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
      */
     void FillRandom(size_t seed)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -437,7 +434,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
      */
     void FillRandom()
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -478,7 +475,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
 
     ~CudaContractionPlan()
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaFree(work));
     }
 };
@@ -675,7 +672,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     static CudaTensor Reshape(const CudaTensor &old_tensor,
                               const std::vector<size_t> &new_shape)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         JET_ABORT_IF_NOT(old_tensor.GetSize() ==
                              Jet::Utilities::ShapeToSize(new_shape),
                          "Size is inconsistent between tensors.");
@@ -706,7 +703,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
     Transpose(const CudaTensor &tensor,
               const std::vector<std::string> &new_indices)
     {
-        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        CudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Jet::Utilities;
 
         if (tensor.GetIndices() == new_indices)
@@ -870,7 +867,7 @@ template <class T, int CUDA_DEVICE> class CudaTensor {
                            const std::string &index_str, size_t index_value)
     {
-        tf::cudaScopedDevice ctx(D);
+        CudaScopedDevice ctx(D);
         std::vector<std::string> new_indices = tens.GetIndices();
         std::vector<std::string> old_indices = tens.GetIndices();
diff --git a/include/jet/CudaTensorHelpers.hpp b/include/jet/CudaTensorHelpers.hpp
index 355e6a72..5bb75d67 100644
--- a/include/jet/CudaTensorHelpers.hpp
+++ b/include/jet/CudaTensorHelpers.hpp
@@ -176,5 +176,52 @@ ReverseVector(const std::vector<T> &input)
     return std::vector<T>{input.rbegin(), input.rend()};
 }
 
+/** @class CudaScopedDevice
+
+@brief RAII-styled device context switch. Adapted from Taskflow's
+tf::cudaScopedDevice.
+
+%CudaScopedDevice is neither movable nor copyable.
+*/
+class CudaScopedDevice {
+
+  public:
+    /**
+    @brief constructs a RAII-styled device switcher
+
+    @param device device context to scope in the guard
+    */
+    explicit CudaScopedDevice(int device);
+
+    /**
+    @brief destructs the guard and switches back to the previous device context
+    */
+    ~CudaScopedDevice();
+
+  private:
+    CudaScopedDevice() = delete;
+    CudaScopedDevice(const CudaScopedDevice &) = delete;
+    CudaScopedDevice(CudaScopedDevice &&) = delete;
+
+    /// The previous device, or -1 if the guard did not switch devices.
+    int _p;
+};
+
+inline CudaScopedDevice::CudaScopedDevice(int device)
+{
+    JET_CUDA_IS_SUCCESS(cudaGetDevice(&_p));
+    if (_p == device) {
+        // Already on the requested device: nothing to restore on destruction.
+        _p = -1;
+    }
+    else {
+        JET_CUDA_IS_SUCCESS(cudaSetDevice(device));
+    }
+}
+
+inline CudaScopedDevice::~CudaScopedDevice()
+{
+    if (_p != -1) {
+        // Restore the previous device; the error code is deliberately ignored
+        // because a destructor must not throw.
+        cudaSetDevice(_p);
+    }
+}
+
 } // namespace CudaTensorHelpers
 } // namespace Jet
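For reference, a minimal sketch of how the new guard is meant to be used. This
example is not part of the patch: the main() harness, the two-GPU setup, the
1024-byte buffer, and the assumption that Jet's include/ directory is on the
include path are all illustrative.

    #include <cstdio>
    #include <cuda_runtime.h>

    #include "jet/CudaTensorHelpers.hpp"

    int main()
    {
        int before = -1;
        cudaGetDevice(&before); // typically device 0

        {
            // Make device 1 current for the lifetime of this scope only.
            Jet::CudaTensorHelpers::CudaScopedDevice ctx(1);

            void *buf = nullptr;
            cudaMalloc(&buf, 1024); // allocated on device 1
            cudaFree(buf);
        } // ~CudaScopedDevice() runs here and restores the previous device

        int after = -1;
        cudaGetDevice(&after); // equals `before` again
        std::printf("device before: %d, after: %d\n", before, after);
        return 0;
    }

This mirrors how the CudaTensor methods above use the guard: the device is
switched at function entry and restored on every return path, with the same
behaviour as the tf::cudaScopedDevice it replaces but without the Taskflow
dependency.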