Spin orbit torque implementation (#191)

* Abstract the Cuda RK4 solver I've re-implemented the Cuda RK4 solver as a base algorithm which calls functions to integrate. * Add LLG solver with spin orbit torques * Optimise CUDA dipole fft Using the symmetry of the dipole tensor r_ij = r_ji * Use SyncedMemory zero to avoid device transfers
stonerlab · Oct 31, 2023 · 2939425 · 2939425
1 parent 794f303
commit 2939425
Show file tree

Hide file tree

Showing 22 changed files with 591 additions and 369 deletions.
diff --git a/cmake/Sources.cmake b/cmake/Sources.cmake
@@ -127,6 +127,8 @@ set(JAMS_SOURCES_CUDA
         solvers/cuda_llg_heun.cu
         solvers/cuda_llg_rk4.cu
         solvers/cuda_ll_lorentzian_rk4.cu
+        solvers/cuda_rk4_base.cu
+        solvers/cuda_rk4_llg_sot.cu
         thermostats/thm_bose_einstein_cuda_srk4.cu
         thermostats/thm_bose_einstein_cuda_srk4_kernel.cuh
         thermostats/cuda_langevin_bose.cu

diff --git a/src/jams/containers/multiarray.h b/src/jams/containers/multiarray.h
@@ -287,7 +287,7 @@ namespace jams {
         }
 
         inline void zero() noexcept {
-          memset(data_.mutable_host_data(), 0, data_.memory());
+          data_.zero();
         }
 
         inline void fill(const value_type &value) {
@@ -485,7 +485,7 @@ namespace jams {
         }
 
         inline void zero() noexcept {
-          memset(data_.mutable_host_data(), 0, data_.memory());
+          data_.zero();
         }
 
         inline void fill(const value_type &value) {

diff --git a/src/jams/containers/synced_memory.h b/src/jams/containers/synced_memory.h
@@ -344,7 +344,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{
     if (rhs.host_ptr_) {
       #if HAS_CUDA
       if (has_cuda_context()) {
-        #if SYNCEDMEMORY_PRINT_MEMCPY
+        #if SYNCED_MEMORY_PRINT_MEMCPY
         std::cout << "INFO(SyncedMemory): cudaMemcpyHostToHost" << std::endl;
         #endif
         SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_host_data(), rhs.host_ptr_, size_ * sizeof(T), cudaMemcpyHostToHost));
@@ -358,7 +358,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{
 
     if (rhs.device_ptr_) {
       #if HAS_CUDA
-      #if SYNCEDMEMORY_PRINT_MEMCPY
+      #if SYNCED_MEMORY_PRINT_MEMCPY
       std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToDevice" << std::endl;
       #endif
       SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_device_data(), rhs.device_ptr_, size_ * sizeof(T), cudaMemcpyDeviceToDevice));
@@ -474,7 +474,7 @@ void SyncedMemory<T>::copy_to_device() {
     if (!device_ptr_) {
       allocate_device_memory(size_);
     }
-    #if SYNCEDMEMORY_PRINT_MEMCPY
+    #if SYNCED_MEMORY_PRINT_MEMCPY
     std::cout << "INFO(SyncedMemory): cudaMemcpyHostToDevice" << std::endl;
     #endif
     assert(device_ptr_ && host_ptr_);
@@ -486,7 +486,7 @@ void SyncedMemory<T>::copy_to_device() {
 
   if (sync_status_ == SyncStatus::UNINITIALIZED) {
     allocate_device_memory(size_);
-    #ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
+    #ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
     zero_device();
     #endif
     sync_status_ = SyncStatus::DEVICE_IS_MUTATED;
@@ -509,7 +509,7 @@ void SyncedMemory<T>::copy_to_host() {
     if (!host_ptr_) {
       allocate_host_memory(size_);
     }
-    #if SYNCEDMEMORY_PRINT_MEMCPY
+    #if SYNCED_MEMORY_PRINT_MEMCPY
     std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToHost" << std::endl;
     #endif
     assert(device_ptr_ && host_ptr_);
@@ -521,7 +521,7 @@ void SyncedMemory<T>::copy_to_host() {
 
   if (sync_status_ == SyncStatus::UNINITIALIZED) {
     allocate_host_memory(size_);
-    #ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
+    #ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
     zero_host();
     #endif
     sync_status_ = SyncStatus::HOST_IS_MUTATED;

diff --git a/src/jams/core/solver.cc b/src/jams/core/solver.cc
@@ -23,6 +23,7 @@
 #include "jams/solvers/cuda_llg_heun.h"
 #include "jams/solvers/cuda_llg_rk4.h"
 #include "jams/solvers/cuda_ll_lorentzian_rk4.h"
+#include "jams/solvers/cuda_rk4_llg_sot.h"
 #include "jams/solvers/cpu_llg_heun.h"
 #include "jams/solvers/cpu_rotations.h"
 #include "jams/solvers/cpu_monte_carlo_metropolis.h"
@@ -69,6 +70,7 @@ Solver* Solver::create(const libconfig::Setting &settings) {
   DEFINED_SOLVER("llg-heun-gpu", CUDAHeunLLGSolver, settings);
   DEFINED_SOLVER("llg-rk4-gpu", CUDALLGRK4Solver, settings);
   DEFINED_SOLVER("ll-lorentzian-rk4-gpu", CUDALLLorentzianRK4Solver, settings);
+  DEFINED_SOLVER("llg-sot-rk4-gpu", CudaRK4LLGSOTSolver, settings);
 #endif
 
   throw std::runtime_error("unknown solver " + std::string(settings["module"].c_str()));

diff --git a/src/jams/cuda/cuda_spin_ops.cu b/src/jams/cuda/cuda_spin_ops.cu
@@ -1,6 +1,31 @@
 #include <jams/cuda/cuda_spin_ops.h>
 #include <jams/cuda/cuda_device_vector_ops.h>
 
+__global__ void cuda_normalise_spins_kernel(double * spins, const unsigned size)
+{
+  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < size) {
+    double s[3] = {spins[3*idx + 0], spins[3*idx + 1], spins[3*idx + 2]};
+
+    double recip_snorm = rsqrt(s[0]*s[0] + s[1]*s[1] + s[2]*s[2]);
+
+    for (auto n = 0; n < 3; ++n) {
+      spins[3*idx + n] = s[n] * recip_snorm;
+    }
+  }
+}
+
+void jams::normalise_spins_cuda(jams::MultiArray<double, 2> &spins) {
+  dim3 block_size;
+  block_size.x = 128;
+
+  dim3 grid_size;
+  grid_size.x = (spins.size(0) + block_size.x - 1) / block_size.x;
+
+  cuda_normalise_spins_kernel<<<grid_size, block_size>>>(spins.device_data(), spins.size(0));
+}
+
 __global__ void cuda_rotate_spins_kernel(double* spins, const int* indices, const unsigned size,
                                          double Rxx, double Rxy, double Rxz,
                                          double Ryx, double Ryy, double Ryz,

diff --git a/src/jams/cuda/cuda_spin_ops.h b/src/jams/cuda/cuda_spin_ops.h
@@ -8,6 +8,10 @@
 
 namespace jams {
 
+/// Normalise spins to unit vectors
+CUDA_ONLY_IMPLEMENTATION(
+    void normalise_spins_cuda(jams::MultiArray<double, 2> &spins));
+
 /// Rotate spins with given indices by the rotation matrix
 ///
 /// @warning No check is made that rotation_matrix is unitary.

diff --git a/src/jams/hamiltonian/cuda_dipole_fft.cu b/src/jams/hamiltonian/cuda_dipole_fft.cu
@@ -37,9 +37,9 @@ __global__ void cuda_dipole_convolution(
         gpu_sq[3 * (num_pos * idx + pos_j) + 2]
       };
 
-      gpu_hq[3 * (num_pos * idx + pos_i) + 0] +=  alpha * (gpu_wq[9 * idx + 0] * sq[0] + gpu_wq[9 * idx + 1] * sq[1] + gpu_wq[9 * idx + 2] * sq[2]);
-      gpu_hq[3 * (num_pos * idx + pos_i) + 1] +=  alpha * (gpu_wq[9 * idx + 3] * sq[0] + gpu_wq[9 * idx + 4] * sq[1] + gpu_wq[9 * idx + 5] * sq[2]);
-      gpu_hq[3 * (num_pos * idx + pos_i) + 2] +=  alpha * (gpu_wq[9 * idx + 6] * sq[0] + gpu_wq[9 * idx + 7] * sq[1] + gpu_wq[9 * idx + 8] * sq[2]);
+      gpu_hq[3 * (num_pos * idx + pos_i) + 0] +=  alpha * (gpu_wq[6 * idx + 0] * sq[0] + gpu_wq[6 * idx + 1] * sq[1] + gpu_wq[6 * idx + 2] * sq[2]);
+      gpu_hq[3 * (num_pos * idx + pos_i) + 1] +=  alpha * (gpu_wq[6 * idx + 1] * sq[0] + gpu_wq[6 * idx + 3] * sq[1] + gpu_wq[6 * idx + 4] * sq[2]);
+      gpu_hq[3 * (num_pos * idx + pos_i) + 2] +=  alpha * (gpu_wq[6 * idx + 2] * sq[0] + gpu_wq[6 * idx + 4] * sq[1] + gpu_wq[6 * idx + 5] * sq[2]);
   }
 
 }
@@ -194,17 +194,17 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {
   kspace_h_.zero();
 
   CHECK_CUFFT_STATUS(cufftExecD2Z(cuda_fft_s_rspace_to_kspace, reinterpret_cast<cufftDoubleReal*>(globals::s.device_data()), kspace_s_.device_data()));
-
-  cudaDeviceSynchronize();
+  cudaStreamSynchronize(dev_stream_[0].get());
 
   for (int pos_j = 0; pos_j < globals::lattice->num_motif_atoms(); ++pos_j) {
+    const double mus_j = globals::lattice->material(
+        globals::lattice->motif_atom(pos_j).material_index).moment;
+
     for (int pos_i = 0; pos_i < globals::lattice->num_motif_atoms(); ++pos_i) {
-      const double mus_j = globals::lattice->material(
-          globals::lattice->motif_atom(pos_j).material_index).moment;
 
       const unsigned int fft_size = kspace_padded_size_[0] * kspace_padded_size_[1] * (kspace_padded_size_[2] / 2 + 1);
 
-      dim3 block_size = {128, 1, 1};
+      dim3 block_size = {32, 1, 1};
       dim3 grid_size = cuda_grid_size(block_size, {fft_size, 1, 1});
 
       cuda_dipole_convolution<<<grid_size, block_size, 0, dev_stream_[pos_i%4].get()>>>(fft_size, pos_i, pos_j, globals::lattice->num_motif_atoms(), mus_j, kspace_s_.device_data(), kspace_tensors_[pos_i][pos_j].device_data(), kspace_h_.device_data());
@@ -221,7 +221,7 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {
 
 // Generates the dipole tensor between unit cell positions i and j and appends
 // the generated positions to a vector
-jams::MultiArray<Complex, 5>
+jams::MultiArray<Complex, 4>
 CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions) {
     using std::pow;
 
@@ -231,17 +231,17 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
     const Vec3 r_cart_i = globals::lattice->fractional_to_cartesian(r_frac_i);
     const Vec3 r_cart_j = globals::lattice->fractional_to_cartesian(r_frac_j);
 
-  jams::MultiArray<double, 5> rspace_tensor(
+  jams::MultiArray<double, 4> rspace_tensor(
         kspace_padded_size_[0],
         kspace_padded_size_[1],
         kspace_padded_size_[2],
-        3, 3);
+        6);
 
-  jams::MultiArray<Complex, 5> kspace_tensor(
+  jams::MultiArray<Complex, 4> kspace_tensor(
         kspace_padded_size_[0],
         kspace_padded_size_[1],
         kspace_padded_size_[2]/2 + 1,
-        3, 3);
+        6);
 
 
     rspace_tensor.zero();
@@ -274,17 +274,32 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
                     continue;
                 }
 
-              generated_positions.push_back(r_ij);
-
-                for (int m = 0; m < 3; ++m) {
-                    for (int n = 0; n < 3; ++n) {
-                        auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
-                        if (!std::isfinite(value)) {
-                          throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
-                        }
-                        rspace_tensor(nx, ny, nz, m, n) = value;
-                    }
-                }
+                generated_positions.push_back(r_ij);
+
+                // xx
+                rspace_tensor(nx, ny, nz, 0) =  w0 * (3 * r_ij[0] * r_ij[0] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
+                // xy
+                rspace_tensor(nx, ny, nz, 1) =  w0 * (3 * r_ij[0] * r_ij[1]) / pow(sqrt(r_abs_sq), 5);
+                // xz
+                rspace_tensor(nx, ny, nz, 2) =  w0 * (3 * r_ij[0] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
+                // yy
+                rspace_tensor(nx, ny, nz, 3) =  w0 * (3 * r_ij[1] * r_ij[1] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
+                // yz
+                rspace_tensor(nx, ny, nz, 4) =  w0 * (3 * r_ij[1] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
+                // zz
+                rspace_tensor(nx, ny, nz, 5) =  w0 * (3 * r_ij[2] * r_ij[2] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
+
+//                for (int m = 0; m < 3; ++m) {
+//                    for (int n = m; n < 3; ++n) {
+//                        auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
+//
+//                        std::cout << m << " " << n << " " << value << std::endl;
+//                        if (!std::isfinite(value)) {
+//                          throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
+//                        }
+//                        rspace_tensor(nx, ny, nz, m, n) = value;
+//                    }
+//                }
             }
         }
     }
@@ -299,9 +314,9 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
     }
 
     int rank            = 3;
-    int stride          = 9;
+    int stride          = 6;
     int dist            = 1;
-    int num_transforms  = 9;
+    int num_transforms  = 6;
     int * nembed        = nullptr;
     int transform_size[3]  = {kspace_padded_size_[0], kspace_padded_size_[1], kspace_padded_size_[2]};
 

diff --git a/src/jams/hamiltonian/cuda_dipole_fft.h b/src/jams/hamiltonian/cuda_dipole_fft.h
@@ -30,7 +30,7 @@ class CudaDipoleFFTHamiltonian : public Hamiltonian {
         bool check_radius_   = true;
         bool check_symmetry_ = true;
 
-        jams::MultiArray<Complex, 5> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);
+        jams::MultiArray<Complex, 4> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);
 
         double                          r_cutoff_;
         double                          distance_tolerance_;

diff --git a/src/jams/hamiltonian/cuda_fft_interaction.h b/src/jams/hamiltonian/cuda_fft_interaction.h
@@ -0,0 +1,28 @@
+// cuda_fft_interaction.h                                              -*-C++-*-
+#ifndef INCLUDED_JAMS_CUDA_FFT_INTERACTION
+#define INCLUDED_JAMS_CUDA_FFT_INTERACTION
+#include <jams/core/hamiltonian.h>
+
+class CudaFFTInteractionHamiltonian : public Hamiltonian {
+
+    CudaFFTInteractionHamiltonian(const libconfig::Setting &settings, unsigned int size);
+
+    double calculate_total_energy(double time) override;
+
+    void calculate_energies(double time) override;
+
+    void calculate_fields(double time) override;
+
+    Vec3 calculate_field(int i, double time) override;
+
+    double calculate_energy(int i, double time) override;
+
+    double calculate_energy_difference(int i, const Vec3 &spin_initial, const Vec3 &spin_final, double time) override;
+
+private:
+    CudaStream cusparse_stream_; // cuda stream to run in
+
+};
+
+#endif
+// ----------------------------- END-OF-FILE ----------------------------------
diff --git a/src/jams/helpers/consts.h b/src/jams/helpers/consts.h
@@ -37,6 +37,7 @@ constexpr double kVacuumPermeabilityIU   = 4*kPi*1E-7 / kJoule2meV; // Original
 
 constexpr double kMeterToAngstroms      = 1e10;
 constexpr double kMeterToNanometer      = 1e9;
+constexpr double kSecondToPicosecond    = 1e12;
 
 constexpr double kBytesToMegaBytes      = 1048576.0;