diff --git a/src/cuda/cuda2go.go b/src/cuda/cuda2go.go index 6699816..3489381 100644 --- a/src/cuda/cuda2go.go +++ b/src/cuda/cuda2go.go @@ -112,8 +112,8 @@ func wrapgen(filename, funcname string, argt, argn []string) { // find corresponding .PTX files if ls == nil { dir, errd := os.Open(".") - defer dir.Close() log.Log.PanicIfError(errd) + defer dir.Close() var errls error ls, errls = dir.Readdirnames(-1) log.Log.PanicIfError(errls) diff --git a/src/cuda/shift.go b/src/cuda/shift.go index 3a1a873..9656423 100644 --- a/src/cuda/shift.go +++ b/src/cuda/shift.go @@ -15,6 +15,21 @@ func ShiftX(dst, src *data.Slice, shiftX int, clampL, clampR float32) { k_shiftx_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) } +// Shifts a component `src` of a vector field by `shiftX` cells along the X-axis. +// Unlike the normal `shift()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampL` or `clampR` is used for the component `src` instead. +func ShiftEdgeCarryX(dst, src, othercomp, anothercomp *data.Slice, shiftX int, clampL, clampR float32) { + log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1, "Component mismatch: dst, src, othercomp and anothercomp must all have 1 component in ShiftEdgeCarryX") + log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftEdgeCarryX") + N := dst.Size() + cfg := make3DConf(N) + k_shiftedgecarryX_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) +} + func ShiftY(dst, src *data.Slice, shiftY int, clampL, clampR float32) { log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1, "Component mismatch: dst and src must both have 1 component in ShiftY") log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftY") @@ -23,6 +38,21 @@ func ShiftY(dst, src *data.Slice, shiftY int, clampL, clampR float32) { k_shifty_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampL, clampR, cfg) } +// Shifts a component `src` of a vector field by `shiftY` cells along the Y-axis. +// Unlike the normal `shift()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampD` or `clampU` is used for the component `src` instead. 
+func ShiftEdgeCarryY(dst, src, othercomp, anothercomp *data.Slice, shiftY int, clampD, clampU float32) { + log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1, "Component mismatch: dst, src, othercomp and anothercomp must all have 1 component in ShiftEdgeCarryY") + log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftEdgeCarryY") + N := dst.Size() + cfg := make3DConf(N) + k_shiftedgecarryY_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampD, clampU, cfg) +} + func ShiftZ(dst, src *data.Slice, shiftZ int, clampL, clampR float32) { log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1, "Component mismatch: dst and src must both have 1 component in ShiftZ") log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftZ") diff --git a/src/cuda/shiftedgecarryx.cu b/src/cuda/shiftedgecarryx.cu new file mode 100644 index 0000000..0e10967 --- /dev/null +++ b/src/cuda/shiftedgecarryx.cu @@ -0,0 +1,37 @@ +#include "stencil.h" + +// Shifts a component `src` of a vector field by `shx` cells along the X-axis. +// Unlike the normal `shiftx()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampL` or `clampR` is used for the component `src` instead. +extern "C" __global__ void +shiftedgecarryX(float* __restrict__ dst, float* __restrict__ src, + float* __restrict__ othercomp, float* __restrict__ anothercomp, + int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { + + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + int iz = blockIdx.z * blockDim.z + threadIdx.z; + + if(ix < Nx && iy < Ny && iz < Nz) { + int ix2 = ix-shx; // old X-index + float newval; + if (ix2 < 0) { // left edge (shifting right) + newval = src[idx(0, iy, iz)]; + if (newval == 0 && othercomp[idx(0, iy, iz)] == 0 && anothercomp[idx(0, iy, iz)] == 0) { // If zero-vector + newval = clampL; + } + } else if (ix2 >= Nx) { // right edge (shifting left) + newval = src[idx(Nx-1, iy, iz)]; + if (newval == 0 && othercomp[idx(Nx-1, iy, iz)] == 0 && anothercomp[idx(Nx-1, iy, iz)] == 0) { // If zero-vector + newval = clampR; + } + } else { // bulk, doesn't matter which way the shift is + newval = src[idx(ix2, iy, iz)]; + } + dst[idx(ix, iy, iz)] = newval; + } +} \ No newline at end of file diff --git a/src/cuda/shiftedgecarryx_wrapper.go b/src/cuda/shiftedgecarryx_wrapper.go new file mode 100644 index 0000000..7327d89 --- /dev/null +++ b/src/cuda/shiftedgecarryx_wrapper.go @@ -0,0 +1,219 @@ +package cuda + +/* + THIS FILE IS AUTO-GENERATED BY CUDA2GO. + EDITING IS FUTILE. 
+*/ + +import( + "unsafe" + "github.com/MathieuMoalic/amumax/src/cuda/cu" + "github.com/MathieuMoalic/amumax/src/timer" + "sync" +) + +// CUDA handle for shiftedgecarryX kernel +var shiftedgecarryX_code cu.Function + +// Stores the arguments for shiftedgecarryX kernel invocation +type shiftedgecarryX_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_othercomp unsafe.Pointer + arg_anothercomp unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shx int + arg_clampL float32 + arg_clampR float32 + argptr [10]unsafe.Pointer + sync.Mutex +} + +// Stores the arguments for shiftedgecarryX kernel invocation +var shiftedgecarryX_args shiftedgecarryX_args_t + +func init(){ + // CUDA driver kernel call wants pointers to arguments, set them up once. + shiftedgecarryX_args.argptr[0] = unsafe.Pointer(&shiftedgecarryX_args.arg_dst) + shiftedgecarryX_args.argptr[1] = unsafe.Pointer(&shiftedgecarryX_args.arg_src) + shiftedgecarryX_args.argptr[2] = unsafe.Pointer(&shiftedgecarryX_args.arg_othercomp) + shiftedgecarryX_args.argptr[3] = unsafe.Pointer(&shiftedgecarryX_args.arg_anothercomp) + shiftedgecarryX_args.argptr[4] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nx) + shiftedgecarryX_args.argptr[5] = unsafe.Pointer(&shiftedgecarryX_args.arg_Ny) + shiftedgecarryX_args.argptr[6] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nz) + shiftedgecarryX_args.argptr[7] = unsafe.Pointer(&shiftedgecarryX_args.arg_shx) + shiftedgecarryX_args.argptr[8] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampL) + shiftedgecarryX_args.argptr[9] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampR) + } + +// Wrapper for shiftedgecarryX CUDA kernel, asynchronous. +func k_shiftedgecarryX_async ( dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { + if Synchronous{ // debug + Sync() + timer.Start("shiftedgecarryX") + } + + shiftedgecarryX_args.Lock() + defer shiftedgecarryX_args.Unlock() + + if shiftedgecarryX_code == 0{ + shiftedgecarryX_code = fatbinLoad(shiftedgecarryX_map, "shiftedgecarryX") + } + + shiftedgecarryX_args.arg_dst = dst + shiftedgecarryX_args.arg_src = src + shiftedgecarryX_args.arg_othercomp = othercomp + shiftedgecarryX_args.arg_anothercomp = anothercomp + shiftedgecarryX_args.arg_Nx = Nx + shiftedgecarryX_args.arg_Ny = Ny + shiftedgecarryX_args.arg_Nz = Nz + shiftedgecarryX_args.arg_shx = shx + shiftedgecarryX_args.arg_clampL = clampL + shiftedgecarryX_args.arg_clampR = clampR + + + args := shiftedgecarryX_args.argptr[:] + cu.LaunchKernel(shiftedgecarryX_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) + + if Synchronous{ // debug + Sync() + timer.Stop("shiftedgecarryX") + } +} + +// maps compute capability on PTX code for shiftedgecarryX kernel. +var shiftedgecarryX_map = map[int]string{ 0: "" , +52: shiftedgecarryX_ptx_52 } + +// shiftedgecarryX PTX code for various compute capabilities. 
+const( + shiftedgecarryX_ptx_52 = ` +.version 7.0 +.target sm_52 +.address_size 64 + + // .globl shiftedgecarryX + +.visible .entry shiftedgecarryX( + .param .u64 shiftedgecarryX_param_0, + .param .u64 shiftedgecarryX_param_1, + .param .u64 shiftedgecarryX_param_2, + .param .u64 shiftedgecarryX_param_3, + .param .u32 shiftedgecarryX_param_4, + .param .u32 shiftedgecarryX_param_5, + .param .u32 shiftedgecarryX_param_6, + .param .u32 shiftedgecarryX_param_7, + .param .f32 shiftedgecarryX_param_8, + .param .f32 shiftedgecarryX_param_9 +) +{ + .reg .pred %p<14>; + .reg .f32 %f<14>; + .reg .b32 %r<24>; + .reg .b64 %rd<25>; + + + ld.param.u64 %rd4, [shiftedgecarryX_param_0]; + ld.param.u64 %rd5, [shiftedgecarryX_param_1]; + ld.param.u64 %rd6, [shiftedgecarryX_param_2]; + ld.param.u64 %rd7, [shiftedgecarryX_param_3]; + ld.param.u32 %r7, [shiftedgecarryX_param_4]; + ld.param.u32 %r8, [shiftedgecarryX_param_5]; + ld.param.u32 %r10, [shiftedgecarryX_param_6]; + ld.param.u32 %r9, [shiftedgecarryX_param_7]; + ld.param.f32 %f7, [shiftedgecarryX_param_8]; + ld.param.f32 %f8, [shiftedgecarryX_param_9]; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd5; + mov.u32 %r11, %ntid.x; + mov.u32 %r12, %ctaid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r1, %r11, %r12, %r13; + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r2, %r14, %r15, %r16; + mov.u32 %r17, %ntid.z; + mov.u32 %r18, %ctaid.z; + mov.u32 %r19, %tid.z; + mad.lo.s32 %r3, %r17, %r18, %r19; + setp.lt.s32 %p1, %r1, %r7; + setp.lt.s32 %p2, %r2, %r8; + and.pred %p3, %p1, %p2; + setp.lt.s32 %p4, %r3, %r10; + and.pred %p5, %p3, %p4; + @!%p5 bra BB0_11; + bra.uni BB0_1; + +BB0_1: + sub.s32 %r4, %r1, %r9; + setp.lt.s32 %p6, %r4, 0; + mad.lo.s32 %r20, %r3, %r8, %r2; + mul.lo.s32 %r5, %r20, %r7; + @%p6 bra BB0_7; + + setp.lt.s32 %p7, %r4, %r7; + @%p7 bra BB0_6; + bra.uni BB0_3; + +BB0_6: + add.s32 %r22, %r5, %r4; + mul.wide.s32 %rd14, %r22, 4; + add.s64 %rd15, %rd3, %rd14; + ld.global.nc.f32 %f13, [%rd15]; + bra.uni BB0_10; + +BB0_7: + mul.wide.s32 %rd16, %r5, 4; + add.s64 %rd17, %rd3, %rd16; + ld.global.nc.f32 %f13, [%rd17]; + setp.neu.f32 %p11, %f13, 0f00000000; + @%p11 bra BB0_10; + + add.s64 %rd19, %rd2, %rd16; + ld.global.nc.f32 %f11, [%rd19]; + setp.neu.f32 %p12, %f11, 0f00000000; + @%p12 bra BB0_10; + + add.s64 %rd21, %rd1, %rd16; + ld.global.nc.f32 %f12, [%rd21]; + setp.eq.f32 %p13, %f12, 0f00000000; + selp.f32 %f13, %f7, %f13, %p13; + bra.uni BB0_10; + +BB0_3: + add.s32 %r21, %r7, %r5; + add.s32 %r6, %r21, -1; + mul.wide.s32 %rd8, %r6, 4; + add.s64 %rd9, %rd3, %rd8; + ld.global.nc.f32 %f13, [%rd9]; + setp.neu.f32 %p8, %f13, 0f00000000; + @%p8 bra BB0_10; + + add.s64 %rd11, %rd2, %rd8; + ld.global.nc.f32 %f9, [%rd11]; + setp.neu.f32 %p9, %f9, 0f00000000; + @%p9 bra BB0_10; + + add.s64 %rd13, %rd1, %rd8; + ld.global.nc.f32 %f10, [%rd13]; + setp.eq.f32 %p10, %f10, 0f00000000; + selp.f32 %f13, %f8, %f13, %p10; + +BB0_10: + cvta.to.global.u64 %rd22, %rd4; + add.s32 %r23, %r5, %r1; + mul.wide.s32 %rd23, %r23, 4; + add.s64 %rd24, %rd22, %rd23; + st.global.f32 [%rd24], %f13; + +BB0_11: + ret; +} + + +` + ) diff --git a/src/cuda/shiftedgecarryy.cu b/src/cuda/shiftedgecarryy.cu new file mode 100644 index 0000000..2f6a41f --- /dev/null +++ b/src/cuda/shiftedgecarryy.cu @@ -0,0 +1,37 @@ +#include "stencil.h" + +// Shifts a component `src` of a vector field by `shy` cells along the Y-axis. +// Unlike the normal `shifty()`, the new edge value is the current edge value. 
+// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampD` or `clampU` is used for the component `src` instead. +extern "C" __global__ void +shiftedgecarryY(float* __restrict__ dst, float* __restrict__ src, + float* __restrict__ othercomp, float* __restrict__ anothercomp, + int Nx, int Ny, int Nz, int shy, float clampD, float clampU) { + + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + int iz = blockIdx.z * blockDim.z + threadIdx.z; + + if(ix < Nx && iy < Ny && iz < Nz) { + int iy2 = iy-shy; // old Y-index + float newval; + if (iy2 < 0) { // bottom edge (shifting up) + newval = src[idx(ix, 0, iz)]; + if (newval == 0 && othercomp[idx(ix, 0, iz)] == 0 && anothercomp[idx(ix, 0, iz)] == 0) { // If zero-vector + newval = clampD; + } + } else if (iy2 >= Ny) { // top edge (shifting down) + newval = src[idx(ix, Ny-1, iz)]; + if (newval == 0 && othercomp[idx(ix, Ny-1, iz)] == 0 && anothercomp[idx(ix, Ny-1, iz)] == 0) { // If zero-vector + newval = clampU; + } + } else { // bulk, doesn't matter which way the shift is + newval = src[idx(ix, iy2, iz)]; + } + dst[idx(ix, iy, iz)] = newval; + } +} \ No newline at end of file diff --git a/src/cuda/shiftedgecarryy_wrapper.go b/src/cuda/shiftedgecarryy_wrapper.go new file mode 100644 index 0000000..1c909c8 --- /dev/null +++ b/src/cuda/shiftedgecarryy_wrapper.go @@ -0,0 +1,222 @@ +package cuda + +/* + THIS FILE IS AUTO-GENERATED BY CUDA2GO. + EDITING IS FUTILE. +*/ + +import( + "unsafe" + "github.com/MathieuMoalic/amumax/src/cuda/cu" + "github.com/MathieuMoalic/amumax/src/timer" + "sync" +) + +// CUDA handle for shiftedgecarryY kernel +var shiftedgecarryY_code cu.Function + +// Stores the arguments for shiftedgecarryY kernel invocation +type shiftedgecarryY_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_othercomp unsafe.Pointer + arg_anothercomp unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shy int + arg_clampD float32 + arg_clampU float32 + argptr [10]unsafe.Pointer + sync.Mutex +} + +// Stores the arguments for shiftedgecarryY kernel invocation +var shiftedgecarryY_args shiftedgecarryY_args_t + +func init(){ + // CUDA driver kernel call wants pointers to arguments, set them up once. + shiftedgecarryY_args.argptr[0] = unsafe.Pointer(&shiftedgecarryY_args.arg_dst) + shiftedgecarryY_args.argptr[1] = unsafe.Pointer(&shiftedgecarryY_args.arg_src) + shiftedgecarryY_args.argptr[2] = unsafe.Pointer(&shiftedgecarryY_args.arg_othercomp) + shiftedgecarryY_args.argptr[3] = unsafe.Pointer(&shiftedgecarryY_args.arg_anothercomp) + shiftedgecarryY_args.argptr[4] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nx) + shiftedgecarryY_args.argptr[5] = unsafe.Pointer(&shiftedgecarryY_args.arg_Ny) + shiftedgecarryY_args.argptr[6] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nz) + shiftedgecarryY_args.argptr[7] = unsafe.Pointer(&shiftedgecarryY_args.arg_shy) + shiftedgecarryY_args.argptr[8] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampD) + shiftedgecarryY_args.argptr[9] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampU) + } + +// Wrapper for shiftedgecarryY CUDA kernel, asynchronous. 
+func k_shiftedgecarryY_async ( dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampD float32, clampU float32, cfg *config) { + if Synchronous{ // debug + Sync() + timer.Start("shiftedgecarryY") + } + + shiftedgecarryY_args.Lock() + defer shiftedgecarryY_args.Unlock() + + if shiftedgecarryY_code == 0{ + shiftedgecarryY_code = fatbinLoad(shiftedgecarryY_map, "shiftedgecarryY") + } + + shiftedgecarryY_args.arg_dst = dst + shiftedgecarryY_args.arg_src = src + shiftedgecarryY_args.arg_othercomp = othercomp + shiftedgecarryY_args.arg_anothercomp = anothercomp + shiftedgecarryY_args.arg_Nx = Nx + shiftedgecarryY_args.arg_Ny = Ny + shiftedgecarryY_args.arg_Nz = Nz + shiftedgecarryY_args.arg_shy = shy + shiftedgecarryY_args.arg_clampD = clampD + shiftedgecarryY_args.arg_clampU = clampU + + + args := shiftedgecarryY_args.argptr[:] + cu.LaunchKernel(shiftedgecarryY_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) + + if Synchronous{ // debug + Sync() + timer.Stop("shiftedgecarryY") + } +} + +// maps compute capability on PTX code for shiftedgecarryY kernel. +var shiftedgecarryY_map = map[int]string{ 0: "" , +52: shiftedgecarryY_ptx_52 } + +// shiftedgecarryY PTX code for various compute capabilities. +const( + shiftedgecarryY_ptx_52 = ` +.version 7.0 +.target sm_52 +.address_size 64 + + // .globl shiftedgecarryY + +.visible .entry shiftedgecarryY( + .param .u64 shiftedgecarryY_param_0, + .param .u64 shiftedgecarryY_param_1, + .param .u64 shiftedgecarryY_param_2, + .param .u64 shiftedgecarryY_param_3, + .param .u32 shiftedgecarryY_param_4, + .param .u32 shiftedgecarryY_param_5, + .param .u32 shiftedgecarryY_param_6, + .param .u32 shiftedgecarryY_param_7, + .param .f32 shiftedgecarryY_param_8, + .param .f32 shiftedgecarryY_param_9 +) +{ + .reg .pred %p<14>; + .reg .f32 %f<14>; + .reg .b32 %r<27>; + .reg .b64 %rd<25>; + + + ld.param.u64 %rd4, [shiftedgecarryY_param_0]; + ld.param.u64 %rd5, [shiftedgecarryY_param_1]; + ld.param.u64 %rd6, [shiftedgecarryY_param_2]; + ld.param.u64 %rd7, [shiftedgecarryY_param_3]; + ld.param.u32 %r8, [shiftedgecarryY_param_4]; + ld.param.u32 %r9, [shiftedgecarryY_param_5]; + ld.param.u32 %r11, [shiftedgecarryY_param_6]; + ld.param.u32 %r10, [shiftedgecarryY_param_7]; + ld.param.f32 %f7, [shiftedgecarryY_param_8]; + ld.param.f32 %f8, [shiftedgecarryY_param_9]; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd5; + mov.u32 %r12, %ntid.x; + mov.u32 %r13, %ctaid.x; + mov.u32 %r14, %tid.x; + mad.lo.s32 %r1, %r12, %r13, %r14; + mov.u32 %r15, %ntid.y; + mov.u32 %r16, %ctaid.y; + mov.u32 %r17, %tid.y; + mad.lo.s32 %r2, %r15, %r16, %r17; + mov.u32 %r18, %ntid.z; + mov.u32 %r19, %ctaid.z; + mov.u32 %r20, %tid.z; + mad.lo.s32 %r3, %r18, %r19, %r20; + setp.lt.s32 %p1, %r1, %r8; + setp.lt.s32 %p2, %r2, %r9; + and.pred %p3, %p1, %p2; + setp.lt.s32 %p4, %r3, %r11; + and.pred %p5, %p3, %p4; + @!%p5 bra BB0_11; + bra.uni BB0_1; + +BB0_1: + sub.s32 %r4, %r2, %r10; + setp.lt.s32 %p6, %r4, 0; + mul.lo.s32 %r5, %r3, %r9; + @%p6 bra BB0_7; + + setp.lt.s32 %p7, %r4, %r9; + @%p7 bra BB0_6; + bra.uni BB0_3; + +BB0_6: + add.s32 %r23, %r5, %r4; + mad.lo.s32 %r24, %r23, %r8, %r1; + mul.wide.s32 %rd14, %r24, 4; + add.s64 %rd15, %rd3, %rd14; + ld.global.nc.f32 %f13, [%rd15]; + bra.uni BB0_10; + +BB0_7: + mad.lo.s32 %r7, %r5, %r8, %r1; + mul.wide.s32 %rd16, %r7, 4; + add.s64 %rd17, %rd3, %rd16; + ld.global.nc.f32 %f13, [%rd17]; + 
setp.neu.f32 %p11, %f13, 0f00000000; + @%p11 bra BB0_10; + + add.s64 %rd19, %rd2, %rd16; + ld.global.nc.f32 %f11, [%rd19]; + setp.neu.f32 %p12, %f11, 0f00000000; + @%p12 bra BB0_10; + + add.s64 %rd21, %rd1, %rd16; + ld.global.nc.f32 %f12, [%rd21]; + setp.eq.f32 %p13, %f12, 0f00000000; + selp.f32 %f13, %f7, %f13, %p13; + bra.uni BB0_10; + +BB0_3: + add.s32 %r21, %r9, %r5; + add.s32 %r22, %r21, -1; + mad.lo.s32 %r6, %r22, %r8, %r1; + mul.wide.s32 %rd8, %r6, 4; + add.s64 %rd9, %rd3, %rd8; + ld.global.nc.f32 %f13, [%rd9]; + setp.neu.f32 %p8, %f13, 0f00000000; + @%p8 bra BB0_10; + + add.s64 %rd11, %rd2, %rd8; + ld.global.nc.f32 %f9, [%rd11]; + setp.neu.f32 %p9, %f9, 0f00000000; + @%p9 bra BB0_10; + + add.s64 %rd13, %rd1, %rd8; + ld.global.nc.f32 %f10, [%rd13]; + setp.eq.f32 %p10, %f10, 0f00000000; + selp.f32 %f13, %f8, %f13, %p10; + +BB0_10: + cvta.to.global.u64 %rd22, %rd4; + add.s32 %r25, %r5, %r2; + mad.lo.s32 %r26, %r25, %r8, %r1; + mul.wide.s32 %rd23, %r26, 4; + add.s64 %rd24, %rd22, %rd23; + st.global.f32 [%rd24], %f13; + +BB0_11: + ret; +} + + +` + ) diff --git a/src/engine/geom.go b/src/engine/geom.go index 1168002..274e0fd 100644 --- a/src/engine/geom.go +++ b/src/engine/geom.go @@ -190,6 +190,10 @@ func (g *geom) cellVolume(ix, iy, iz int) float32 { return vol / float32(N*N*N) } +func (g *geom) GetCell(ix, iy, iz int) float64 { + return float64(cuda.GetCell(g.Gpu(), 0, ix, iy, iz)) +} + func (g *geom) shift(dx int) { // empty mask, nothing to do if g == nil || g.Buffer.IsNil() { diff --git a/src/engine/script_var.go b/src/engine/script_var.go index e4a6e3a..b6d1b0f 100644 --- a/src/engine/script_var.go +++ b/src/engine/script_var.go @@ -58,6 +58,7 @@ func init() { declVar("ShiftGeom", &shiftGeom, "Whether Shift() acts on geometry") declVar("ShiftRegions", &shiftRegions, "Whether Shift() acts on regions") declVar("TotalShift", &totalShift, "Amount by which the simulation has been shifted (m).") + declVar("EdgeCarryShift", &EdgeCarryShift, "Whether to use the current magnetization at the border for the cells inserted by Shift") declVar("GammaLL", &gammaLL, "Gyromagnetic ratio in rad/Ts") declVar("DisableZhangLiTorque", &disableZhangLiTorque, "Disables Zhang-Li torque (default=false)") diff --git a/src/engine/shift.go b/src/engine/shift.go index c32db2c..562ed91 100644 --- a/src/engine/shift.go +++ b/src/engine/shift.go @@ -9,6 +9,7 @@ var ( totalShift, totalYShift float64 // accumulated window shift (X and Y) in meter shiftMagL, shiftMagR, shiftMagU, shiftMagD data.Vector // when shifting m, put these value at the left/right edge. shiftM, shiftGeom, shiftRegions bool = true, true, true // should shift act on magnetization, geometry, regions? + EdgeCarryShift bool = true // Use the values of M at the border for the new cells ) // position of the window lab frame @@ -35,7 +36,11 @@ func shiftMag(m *data.Slice, dx int) { defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) - cuda.ShiftX(m2, comp, dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + if EdgeCarryShift { + cuda.ShiftEdgeCarryX(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + } else { + cuda.ShiftX(m2, comp, dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + } data.Copy(comp, m2) // str0 ? 
} } @@ -60,7 +65,11 @@ func shiftMagY(m *data.Slice, dy int) { defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) - cuda.ShiftY(m2, comp, dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + if EdgeCarryShift { + cuda.ShiftEdgeCarryY(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + } else { + cuda.ShiftY(m2, comp, dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + } data.Copy(comp, m2) // str0 ? } }
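
For clarity, below is a minimal CPU-side sketch in Go of the edge-carry rule that the new shiftedgecarryX/shiftedgecarryY kernels apply per row: a cell shifted in from outside the window copies the current edge cell of `src`, and falls back to the clamp value only when the full vector (`src`, `othercomp`, `anothercomp`) is zero at that edge cell. The helper name shiftEdgeCarry1D and the toy values are illustrative only and are not part of this change.

package main

import "fmt"

// shiftEdgeCarry1D mirrors, for one row and one component, the logic of the
// shiftedgecarryX/shiftedgecarryY kernels: bulk cells are shifted as usual,
// while cells entering from an edge copy the current edge value of src,
// falling back to the clamp value only when the full vector
// (src, other, another) is zero at that edge cell.
func shiftEdgeCarry1D(src, other, another []float32, sh int, clampL, clampR float32) []float32 {
	n := len(src)
	dst := make([]float32, n)
	for i := 0; i < n; i++ {
		i2 := i - sh // old index
		switch {
		case i2 < 0: // cell entering from the left edge (shifting right)
			v := src[0]
			if v == 0 && other[0] == 0 && another[0] == 0 {
				v = clampL
			}
			dst[i] = v
		case i2 >= n: // cell entering from the right edge (shifting left)
			v := src[n-1]
			if v == 0 && other[n-1] == 0 && another[n-1] == 0 {
				v = clampR
			}
			dst[i] = v
		default: // bulk: plain shift
			dst[i] = src[i2]
		}
	}
	return dst
}

func main() {
	mx := []float32{0.6, 0.6, 0.6, 0.6}
	my := []float32{0.8, 0.8, 0.8, 0.8}
	mz := []float32{0, 0, 0, 0}
	// Shift right by one cell with clamp value 1: the left edge value 0.6 is
	// carried into the inserted cell instead of the clamp, because (mx,my,mz)
	// is non-zero there. Prints [0.6 0.6 0.6 0.6].
	fmt.Println(shiftEdgeCarry1D(mx, my, mz, 1, 1, 1))
	// With a zero vector at the edge the clamp is used. Prints [1 0 0 0.6].
	fmt.Println(shiftEdgeCarry1D([]float32{0, 0, 0.6, 0.6}, mz, mz, 1, 1, 1))
}

Because EdgeCarryShift is registered through declVar, it should be toggleable from an input script (presumably EdgeCarryShift = false restores the previous clamp-only behaviour); that script-level syntax is an assumption and is not verified by this diff.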