Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spin orbit torque implementation #191

Merged
merged 10 commits into from
Oct 31, 2023
2 changes: 2 additions & 0 deletions cmake/Sources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ set(JAMS_SOURCES_CUDA
solvers/cuda_llg_heun.cu
solvers/cuda_llg_rk4.cu
solvers/cuda_ll_lorentzian_rk4.cu
solvers/cuda_rk4_base.cu
solvers/cuda_rk4_llg_sot.cu
thermostats/thm_bose_einstein_cuda_srk4.cu
thermostats/thm_bose_einstein_cuda_srk4_kernel.cuh
thermostats/cuda_langevin_bose.cu
Expand Down
4 changes: 2 additions & 2 deletions src/jams/containers/multiarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ namespace jams {
}

inline void zero() noexcept {
memset(data_.mutable_host_data(), 0, data_.memory());
data_.zero();
}

inline void fill(const value_type &value) {
Expand Down Expand Up @@ -485,7 +485,7 @@ namespace jams {
}

inline void zero() noexcept {
memset(data_.mutable_host_data(), 0, data_.memory());
data_.zero();
}

inline void fill(const value_type &value) {
Expand Down
12 changes: 6 additions & 6 deletions src/jams/containers/synced_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{
if (rhs.host_ptr_) {
#if HAS_CUDA
if (has_cuda_context()) {
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyHostToHost" << std::endl;
#endif
SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_host_data(), rhs.host_ptr_, size_ * sizeof(T), cudaMemcpyHostToHost));
Expand All @@ -358,7 +358,7 @@ SyncedMemory<T> &SyncedMemory<T>::operator=(const SyncedMemory& rhs) &{

if (rhs.device_ptr_) {
#if HAS_CUDA
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToDevice" << std::endl;
#endif
SYNCED_MEMORY_CHECK_CUDA_STATUS(cudaMemcpy(mutable_device_data(), rhs.device_ptr_, size_ * sizeof(T), cudaMemcpyDeviceToDevice));
Expand Down Expand Up @@ -474,7 +474,7 @@ void SyncedMemory<T>::copy_to_device() {
if (!device_ptr_) {
allocate_device_memory(size_);
}
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyHostToDevice" << std::endl;
#endif
assert(device_ptr_ && host_ptr_);
Expand All @@ -486,7 +486,7 @@ void SyncedMemory<T>::copy_to_device() {

if (sync_status_ == SyncStatus::UNINITIALIZED) {
allocate_device_memory(size_);
#ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
#ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
zero_device();
#endif
sync_status_ = SyncStatus::DEVICE_IS_MUTATED;
Expand All @@ -509,7 +509,7 @@ void SyncedMemory<T>::copy_to_host() {
if (!host_ptr_) {
allocate_host_memory(size_);
}
#if SYNCEDMEMORY_PRINT_MEMCPY
#if SYNCED_MEMORY_PRINT_MEMCPY
std::cout << "INFO(SyncedMemory): cudaMemcpyDeviceToHost" << std::endl;
#endif
assert(device_ptr_ && host_ptr_);
Expand All @@ -521,7 +521,7 @@ void SyncedMemory<T>::copy_to_host() {

if (sync_status_ == SyncStatus::UNINITIALIZED) {
allocate_host_memory(size_);
#ifdef SYNCEDMEMORY_ZERO_ON_ALLOCATION
#ifdef SYNCED_MEMORY_ZERO_ON_ALLOCATION
zero_host();
#endif
sync_status_ = SyncStatus::HOST_IS_MUTATED;
Expand Down
2 changes: 2 additions & 0 deletions src/jams/core/solver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "jams/solvers/cuda_llg_heun.h"
#include "jams/solvers/cuda_llg_rk4.h"
#include "jams/solvers/cuda_ll_lorentzian_rk4.h"
#include "jams/solvers/cuda_rk4_llg_sot.h"
#include "jams/solvers/cpu_llg_heun.h"
#include "jams/solvers/cpu_rotations.h"
#include "jams/solvers/cpu_monte_carlo_metropolis.h"
Expand Down Expand Up @@ -69,6 +70,7 @@ Solver* Solver::create(const libconfig::Setting &settings) {
DEFINED_SOLVER("llg-heun-gpu", CUDAHeunLLGSolver, settings);
DEFINED_SOLVER("llg-rk4-gpu", CUDALLGRK4Solver, settings);
DEFINED_SOLVER("ll-lorentzian-rk4-gpu", CUDALLLorentzianRK4Solver, settings);
DEFINED_SOLVER("llg-sot-rk4-gpu", CudaRK4LLGSOTSolver, settings);
#endif

throw std::runtime_error("unknown solver " + std::string(settings["module"].c_str()));
Expand Down
25 changes: 25 additions & 0 deletions src/jams/cuda/cuda_spin_ops.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
#include <jams/cuda/cuda_spin_ops.h>
#include <jams/cuda/cuda_device_vector_ops.h>

__global__ void cuda_normalise_spins_kernel(double * spins, const unsigned size)
{
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx < size) {
double s[3] = {spins[3*idx + 0], spins[3*idx + 1], spins[3*idx + 2]};

double recip_snorm = rsqrt(s[0]*s[0] + s[1]*s[1] + s[2]*s[2]);

for (auto n = 0; n < 3; ++n) {
spins[3*idx + n] = s[n] * recip_snorm;
}
}
}

void jams::normalise_spins_cuda(jams::MultiArray<double, 2> &spins) {
dim3 block_size;
block_size.x = 128;

dim3 grid_size;
grid_size.x = (spins.size(0) + block_size.x - 1) / block_size.x;

cuda_normalise_spins_kernel<<<grid_size, block_size>>>(spins.device_data(), spins.size(0));
}

__global__ void cuda_rotate_spins_kernel(double* spins, const int* indices, const unsigned size,
double Rxx, double Rxy, double Rxz,
double Ryx, double Ryy, double Ryz,
Expand Down
4 changes: 4 additions & 0 deletions src/jams/cuda/cuda_spin_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@

namespace jams {

/// Normalise spins to unit vectors
CUDA_ONLY_IMPLEMENTATION(
void normalise_spins_cuda(jams::MultiArray<double, 2> &spins));

/// Rotate spins with given indices by the rotation matrix
///
/// @warning No check is made that rotation_matrix is unitary.
Expand Down
67 changes: 41 additions & 26 deletions src/jams/hamiltonian/cuda_dipole_fft.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ __global__ void cuda_dipole_convolution(
gpu_sq[3 * (num_pos * idx + pos_j) + 2]
};

gpu_hq[3 * (num_pos * idx + pos_i) + 0] += alpha * (gpu_wq[9 * idx + 0] * sq[0] + gpu_wq[9 * idx + 1] * sq[1] + gpu_wq[9 * idx + 2] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 1] += alpha * (gpu_wq[9 * idx + 3] * sq[0] + gpu_wq[9 * idx + 4] * sq[1] + gpu_wq[9 * idx + 5] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 2] += alpha * (gpu_wq[9 * idx + 6] * sq[0] + gpu_wq[9 * idx + 7] * sq[1] + gpu_wq[9 * idx + 8] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 0] += alpha * (gpu_wq[6 * idx + 0] * sq[0] + gpu_wq[6 * idx + 1] * sq[1] + gpu_wq[6 * idx + 2] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 1] += alpha * (gpu_wq[6 * idx + 1] * sq[0] + gpu_wq[6 * idx + 3] * sq[1] + gpu_wq[6 * idx + 4] * sq[2]);
gpu_hq[3 * (num_pos * idx + pos_i) + 2] += alpha * (gpu_wq[6 * idx + 2] * sq[0] + gpu_wq[6 * idx + 4] * sq[1] + gpu_wq[6 * idx + 5] * sq[2]);
}

}
Expand Down Expand Up @@ -194,17 +194,17 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {
kspace_h_.zero();

CHECK_CUFFT_STATUS(cufftExecD2Z(cuda_fft_s_rspace_to_kspace, reinterpret_cast<cufftDoubleReal*>(globals::s.device_data()), kspace_s_.device_data()));

cudaDeviceSynchronize();
cudaStreamSynchronize(dev_stream_[0].get());

for (int pos_j = 0; pos_j < globals::lattice->num_motif_atoms(); ++pos_j) {
const double mus_j = globals::lattice->material(
globals::lattice->motif_atom(pos_j).material_index).moment;

for (int pos_i = 0; pos_i < globals::lattice->num_motif_atoms(); ++pos_i) {
const double mus_j = globals::lattice->material(
globals::lattice->motif_atom(pos_j).material_index).moment;

const unsigned int fft_size = kspace_padded_size_[0] * kspace_padded_size_[1] * (kspace_padded_size_[2] / 2 + 1);

dim3 block_size = {128, 1, 1};
dim3 block_size = {32, 1, 1};
dim3 grid_size = cuda_grid_size(block_size, {fft_size, 1, 1});

cuda_dipole_convolution<<<grid_size, block_size, 0, dev_stream_[pos_i%4].get()>>>(fft_size, pos_i, pos_j, globals::lattice->num_motif_atoms(), mus_j, kspace_s_.device_data(), kspace_tensors_[pos_i][pos_j].device_data(), kspace_h_.device_data());
Expand All @@ -221,7 +221,7 @@ void CudaDipoleFFTHamiltonian::calculate_fields(double time) {

// Generates the dipole tensor between unit cell positions i and j and appends
// the generated positions to a vector
jams::MultiArray<Complex, 5>
jams::MultiArray<Complex, 4>
CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions) {
using std::pow;

Expand All @@ -231,17 +231,17 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
const Vec3 r_cart_i = globals::lattice->fractional_to_cartesian(r_frac_i);
const Vec3 r_cart_j = globals::lattice->fractional_to_cartesian(r_frac_j);

jams::MultiArray<double, 5> rspace_tensor(
jams::MultiArray<double, 4> rspace_tensor(
kspace_padded_size_[0],
kspace_padded_size_[1],
kspace_padded_size_[2],
3, 3);
6);

jams::MultiArray<Complex, 5> kspace_tensor(
jams::MultiArray<Complex, 4> kspace_tensor(
kspace_padded_size_[0],
kspace_padded_size_[1],
kspace_padded_size_[2]/2 + 1,
3, 3);
6);


rspace_tensor.zero();
Expand Down Expand Up @@ -274,17 +274,32 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
continue;
}

generated_positions.push_back(r_ij);

for (int m = 0; m < 3; ++m) {
for (int n = 0; n < 3; ++n) {
auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
if (!std::isfinite(value)) {
throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
}
rspace_tensor(nx, ny, nz, m, n) = value;
}
}
generated_positions.push_back(r_ij);

// xx
rspace_tensor(nx, ny, nz, 0) = w0 * (3 * r_ij[0] * r_ij[0] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
// xy
rspace_tensor(nx, ny, nz, 1) = w0 * (3 * r_ij[0] * r_ij[1]) / pow(sqrt(r_abs_sq), 5);
// xz
rspace_tensor(nx, ny, nz, 2) = w0 * (3 * r_ij[0] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
// yy
rspace_tensor(nx, ny, nz, 3) = w0 * (3 * r_ij[1] * r_ij[1] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);
// yz
rspace_tensor(nx, ny, nz, 4) = w0 * (3 * r_ij[1] * r_ij[2]) / pow(sqrt(r_abs_sq), 5);
// zz
rspace_tensor(nx, ny, nz, 5) = w0 * (3 * r_ij[2] * r_ij[2] - r_abs_sq) / pow(sqrt(r_abs_sq), 5);

// for (int m = 0; m < 3; ++m) {
// for (int n = m; n < 3; ++n) {
// auto value = w0 * (3 * r_ij[m] * r_ij[n] - r_abs_sq * Id[m][n]) / pow(sqrt(r_abs_sq), 5);
//
// std::cout << m << " " << n << " " << value << std::endl;
// if (!std::isfinite(value)) {
// throw std::runtime_error("fatal error in CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor: tensor Szz is not finite");
// }
// rspace_tensor(nx, ny, nz, m, n) = value;
// }
// }
}
}
}
Expand All @@ -299,9 +314,9 @@ CudaDipoleFFTHamiltonian::generate_kspace_dipole_tensor(const int pos_i, const i
}

int rank = 3;
int stride = 9;
int stride = 6;
int dist = 1;
int num_transforms = 9;
int num_transforms = 6;
int * nembed = nullptr;
int transform_size[3] = {kspace_padded_size_[0], kspace_padded_size_[1], kspace_padded_size_[2]};

Expand Down
2 changes: 1 addition & 1 deletion src/jams/hamiltonian/cuda_dipole_fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class CudaDipoleFFTHamiltonian : public Hamiltonian {
bool check_radius_ = true;
bool check_symmetry_ = true;

jams::MultiArray<Complex, 5> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);
jams::MultiArray<Complex, 4> generate_kspace_dipole_tensor(const int pos_i, const int pos_j, std::vector<Vec3> &generated_positions);

double r_cutoff_;
double distance_tolerance_;
Expand Down
28 changes: 28 additions & 0 deletions src/jams/hamiltonian/cuda_fft_interaction.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// cuda_fft_interaction.h -*-C++-*-
#ifndef INCLUDED_JAMS_CUDA_FFT_INTERACTION
#define INCLUDED_JAMS_CUDA_FFT_INTERACTION
#include <jams/core/hamiltonian.h>

class CudaFFTInteractionHamiltonian : public Hamiltonian {

CudaFFTInteractionHamiltonian(const libconfig::Setting &settings, unsigned int size);

double calculate_total_energy(double time) override;

void calculate_energies(double time) override;

void calculate_fields(double time) override;

Vec3 calculate_field(int i, double time) override;

double calculate_energy(int i, double time) override;

double calculate_energy_difference(int i, const Vec3 &spin_initial, const Vec3 &spin_final, double time) override;

private:
CudaStream cusparse_stream_; // cuda stream to run in

};

#endif
// ----------------------------- END-OF-FILE ----------------------------------
1 change: 1 addition & 0 deletions src/jams/helpers/consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ constexpr double kVacuumPermeabilityIU = 4*kPi*1E-7 / kJoule2meV; // Original

constexpr double kMeterToAngstroms = 1e10;
constexpr double kMeterToNanometer = 1e9;
constexpr double kSecondToPicosecond = 1e12;

constexpr double kBytesToMegaBytes = 1048576.0;

Expand Down
Loading