Skip to content

Commit

Permalink
Spin orbit torque implementation (#191)
Browse files Browse the repository at this point in the history
* Abstract the Cuda RK4 solver

I've re-implemented the Cuda RK4 solver as a base algorithm which calls functions to integrate.

* Add LLG solver with spin orbit torques

* Optimise CUDA dipole fft

Using the symmetry of the dipole tensor r_ij = r_ji

* Use SyncedMemory zero to avoid device transfers
  • Loading branch information
drjbarker authored Oct 31, 2023
1 parent 794f303 commit 2939425
Show file tree
Hide file tree
Showing 22 changed files with 591 additions and 369 deletions.
2 changes: 2 additions & 0 deletions cmake/Sources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ set(JAMS_SOURCES_CUDA
solvers/cuda_llg_heun.cu
solvers/cuda_llg_rk4.cu
solvers/cuda_ll_lorentzian_rk4.cu
solvers/cuda_rk4_base.cu
solvers/cuda_rk4_llg_sot.cu
thermostats/thm_bose_einstein_cuda_srk4.cu
thermostats/thm_bose_einstein_cuda_srk4_kernel.cuh
thermostats/cuda_langevin_bose.cu
Expand Down
4 changes: 2 additions & 2 deletions src/jams/containers/multiarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ namespace jams {
}

inline void zero() noexcept {
memset(data_.mutable_host_data(), 0, data_.memory());
data_.zero();
}

inline void fill(const value_type &value) {
Expand Down Expand Up @@ -485,7 +485,7 @@ namespace jams {
}

inline void zero() noexcept {
memset(data_.mutable_host_data(), 0, data_.memory());
data_.zero();
}

inline void fill(const value_type &value) {
Expand Down
12 changes: 6 additions & 6 deletions src/jams/containers/synced_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{
if (rhs.host_ptr_) {
#if HAS_CUDA
if (has_cuda_context()) {
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyHostToHost" << std::endl;
#endif
SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_host_data(), rhs.host_ptr_, size_ * sizeof(T), cudaMemcpyHostToHost));
Expand All @@ -358,7 +358,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{

if (rhs.device_ptr_) {
#if HAS_CUDA
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToDevice" << std::endl;
#endif
SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_device_data(), rhs.device_ptr_, size_ * sizeof(T), cudaMemcpyDeviceToDevice));
Expand Down Expand Up @@ -474,7 +474,7 @@ void SyncedMemory<T>::copy_to_device() {
if (!device_ptr_) {
allocate_device_memory(size_);
}
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyHostToDevice" << std::endl;
#endif
assert(device_ptr_ && host_ptr_);
Expand All @@ -486,7 +486,7 @@ void SyncedMemory<T>::copy_to_device() {

if (sync_status_ == SyncStatus::UNINITIALIZED) {
allocate_device_memory(size_);
#ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
#ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
zero_device();
#endif
sync_status_ = SyncStatus::DEVICE_IS_MUTATED;
Expand All @@ -509,7 +509,7 @@ void SyncedMemory<T>::copy_to_host() {
if (!host_ptr_) {
allocate_host_memory(size_);
}
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToHost" << std::endl;
#endif
assert(device_ptr_ && host_ptr_);
Expand All @@ -521,7 +521,7 @@ void SyncedMemory<T>::copy_to_host() {

if (sync_status_ == SyncStatus::UNINITIALIZED) {
allocate_host_memory(size_);
#ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
#ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
zero_host();
#endif
sync_status_ = SyncStatus::HOST_IS_MUTATED;
Expand Down
2 changes: 2 additions & 0 deletions src/jams/core/solver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "jams/solvers/cuda_llg_heun.h"
#include "jams/solvers/cuda_llg_rk4.h"
#include "jams/solvers/cuda_ll_lorentzian_rk4.h"
#include "jams/solvers/cuda_rk4_llg_sot.h"
#include "jams/solvers/cpu_llg_heun.h"
#include "jams/solvers/cpu_rotations.h"
#include "jams/solvers/cpu_monte_carlo_metropolis.h"
Expand Down Expand Up @@ -69,6 +70,7 @@ Solver* Solver::create(const libconfig::Setting &settings) {
DEFINED_SOLVER("llg-heun-gpu", CUDAHeunLLGSolver, settings);
DEFINED_SOLVER("llg-rk4-gpu", CUDALLGRK4Solver, settings);
DEFINED_SOLVER("ll-lorentzian-rk4-gpu", CUDALLLorentzianRK4Solver, settings);
DEFINED_SOLVER("llg-sot-rk4-gpu", CudaRK4LLGSOTSolver, settings);
#endif

throw std::runtime_error("unknown solver " + std::string(settings["module"].c_str()));
Expand Down
25 changes: 25 additions & 0 deletions src/jams/cuda/cuda_spin_ops.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
#include <jams/cuda/cuda_spin_ops.h>
#include <jams/cuda/cuda_device_vector_ops.h>

__global__ void cuda_normalise_spins_kernel(double * spins, const unsigned size)
{
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < size) {
double s[3] = {spins[3*idx + 0], spins[3*idx + 1], spins[3*idx + 2]};

double recip_snorm = rsqrt(s[0]*s[0] + s[1]*s[1] + s[2]*s[2]);

for (auto n = 0; n < 3; ++n) {
spins[3*idx + n] = s[n] * recip_snorm;
}
}
}

void jams::normalise_spins_cuda(jams::MultiArray<double, 2> &spins) {
dim3 block_size;
block_size.x = 128;

dim3 grid_size;
grid_size.x = (spins.size(0) + block_size.x - 1) / block_size.x;

cuda_normalise_spins_kernel<<<grid_size, block_size>>>(spins.device_data(), spins.size(0));
}

__global__ void cuda_rotate_spins_kernel(double* spins, const int* indices, const unsigned size,
double Rxx, double Rxy, double Rxz,
double Ryx, double Ryy, double Ryz,
Expand Down
4 changes: 4 additions & 0 deletions src/jams/cuda/cuda_spin_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@

namespace jams {

/// Normalise spins to unit vectors
CUDA_ONLY_IMPLEMENTATION(
void normalise_spins_cuda(jams::MultiArray<double, 2> &spins));

/// Rotate spins with given indices by the rotation matrix
///
/// @warning No check is made that rotation_matrix is unitary.
Expand Down
67 changes: 41 additions & 26 deletions src/jams/hamiltonian/cuda_dipole_fft.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ __global__ void cuda_dipole_convolution(
gpu_sq[3 * (num_pos * idx + pos_j) + 2]
};

gpu_hq[3 * (num_pos * idx + pos_i) + 0] += alpha * (gpu_wq[9 * idx + 0] * sq[0] + gpu_wq[9 * idx + 1] * sq[1] + gpu_wq[9 * idx + 2] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 1] += alpha * (gpu_wq[9 * idx + 3] * sq[0] + gpu_wq[9 * idx + 4] * sq[1] + gpu_wq[9 * idx + 5] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 2] += alpha * (gpu_wq[9 * idx + 6] * sq[0] + gpu_wq[9 * idx + 7] * sq[1] + gpu_wq[9 * idx + 8] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 0] += alpha * (gpu_wq[6 * idx + 0] * sq[0] + gpu_wq[6 * idx + 1] * sq[1] + gpu_wq[6 * idx + 2] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 1] += alpha * (gpu_wq[6 * idx + 1] * sq[0] + gpu_wq[6 * idx + 3] * sq[1] + gpu_wq[6 * idx + 4] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 2] += alpha * (gpu_wq[6 * idx + 2] * sq[0] + gpu_wq[6 * idx + 4] * sq[1] + gpu_wq[6 * idx + 5] * sq[2]);
}

}
Expand Down Expand Up @@ -194,17 +194,17 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {
kspace_h_.zero();

CHECK_CUFFT_STATUS(cufftExecD2Z(cuda_fft_s_rspace_to_kspace, reinterpret_cast<cufftDoubleReal*>(globals::s.device_data()), kspace_s_.device_data()));

cudaDeviceSynchronize();
cudaStreamSynchronize(dev_stream_[0].get());

for (int pos_j = 0; pos_j < globals::lattice->num_motif_atoms(); ++pos_j) {
const double mus_j = globals::lattice->material(
globals::lattice->motif_atom(pos_j).material_index).moment;

for (int pos_i = 0; pos_i < globals::lattice->num_motif_atoms(); ++pos_i) {
const double mus_j = globals::lattice->material(
globals::lattice->motif_atom(pos_j).material_index).moment;

const unsigned int fft_size = kspace_padded_size_[0] * kspace_padded_size_[1] * (kspace_padded_size_[2] / 2 + 1);

dim3 block_size = {128, 1, 1};
dim3 block_size = {32, 1, 1};
dim3 grid_size = cuda_grid_size(block_size, {fft_size, 1, 1});

cuda_dipole_convolution<<<grid_size, block_size, 0, dev_stream_[pos_i%4].get()>>>(fft_size, pos_i, pos_j, globals::lattice->num_motif_atoms(), mus_j, kspace_s_.device_data(), kspace_tensors_[pos_i][pos_j].device_data(), kspace_h_.device_data());
Expand All @@ -221,7 +221,7 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {

// Generates the dipole tensor between unit cell positions i and j and appends
// the generated positions to a vector
jams::MultiArray<Complex, 5>
jams::MultiArray<Complex, 4>
CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions) {
using std::pow;

Expand All @@ -231,17 +231,17 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
const Vec3 r_cart_i = globals::lattice->fractional_to_cartesian(r_frac_i);
const Vec3 r_cart_j = globals::lattice->fractional_to_cartesian(r_frac_j);

jams::MultiArray<double, 5> rspace_tensor(
jams::MultiArray<double, 4> rspace_tensor(
kspace_padded_size_[0],
kspace_padded_size_[1],
kspace_padded_size_[2],
3, 3);
6);

jams::MultiArray<Complex, 5> kspace_tensor(
jams::MultiArray<Complex, 4> kspace_tensor(
kspace_padded_size_[0],
kspace_padded_size_[1],
kspace_padded_size_[2]/2 + 1,
3, 3);
6);


rspace_tensor.zero();
Expand Down Expand Up @@ -274,17 +274,32 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
continue;
}

generated_positions.push_back(r_ij);

for (int m = 0; m < 3; ++m) {
for (int n = 0; n < 3; ++n) {
auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
if (!std::isfinite(value)) {
throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
}
rspace_tensor(nx, ny, nz, m, n) = value;
}
}
generated_positions.push_back(r_ij);

// xx
rspace_tensor(nx, ny, nz, 0) = w0 * (3 * r_ij[0] * r_ij[0] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
// xy
rspace_tensor(nx, ny, nz, 1) = w0 * (3 * r_ij[0] * r_ij[1]) / pow(sqrt(r_abs_sq), 5);
// xz
rspace_tensor(nx, ny, nz, 2) = w0 * (3 * r_ij[0] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
// yy
rspace_tensor(nx, ny, nz, 3) = w0 * (3 * r_ij[1] * r_ij[1] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
// yz
rspace_tensor(nx, ny, nz, 4) = w0 * (3 * r_ij[1] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
// zz
rspace_tensor(nx, ny, nz, 5) = w0 * (3 * r_ij[2] * r_ij[2] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);

// for (int m = 0; m < 3; ++m) {
// for (int n = m; n < 3; ++n) {
// auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
//
// std::cout << m << " " << n << " " << value << std::endl;
// if (!std::isfinite(value)) {
// throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
// }
// rspace_tensor(nx, ny, nz, m, n) = value;
// }
// }
}
}
}
Expand All @@ -299,9 +314,9 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
}

int rank = 3;
int stride = 9;
int stride = 6;
int dist = 1;
int num_transforms = 9;
int num_transforms = 6;
int * nembed = nullptr;
int transform_size[3] = {kspace_padded_size_[0], kspace_padded_size_[1], kspace_padded_size_[2]};

Expand Down
2 changes: 1 addition & 1 deletion src/jams/hamiltonian/cuda_dipole_fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class CudaDipoleFFTHamiltonian : public Hamiltonian {
bool check_radius_ = true;
bool check_symmetry_ = true;

jams::MultiArray<Complex, 5> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);
jams::MultiArray<Complex, 4> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);

double r_cutoff_;
double distance_tolerance_;
Expand Down
28 changes: 28 additions & 0 deletions src/jams/hamiltonian/cuda_fft_interaction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// cuda_fft_interaction.h -*-C++-*-
#ifndef INCLUDED_JAMS_CUDA_FFT_INTERACTION
#define INCLUDED_JAMS_CUDA_FFT_INTERACTION
#include <jams/core/hamiltonian.h>

class CudaFFTInteractionHamiltonian : public Hamiltonian {

CudaFFTInteractionHamiltonian(const libconfig::Setting &settings, unsigned int size);

double calculate_total_energy(double time) override;

void calculate_energies(double time) override;

void calculate_fields(double time) override;

Vec3 calculate_field(int i, double time) override;

double calculate_energy(int i, double time) override;

double calculate_energy_difference(int i, const Vec3 &spin_initial, const Vec3 &spin_final, double time) override;

private:
CudaStream cusparse_stream_; // cuda stream to run in

};

#endif
// ----------------------------- END-OF-FILE ----------------------------------
1 change: 1 addition & 0 deletions src/jams/helpers/consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ constexpr double kVacuumPermeabilityIU = 4*kPi*1E-7 / kJoule2meV; // Original

constexpr double kMeterToAngstroms = 1e10;
constexpr double kMeterToNanometer = 1e9;
constexpr double kSecondToPicosecond = 1e12;

constexpr double kBytesToMegaBytes = 1048576.0;

Expand Down
Loading

0 comments on commit 2939425

Please sign in to comment.