-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5a04a0e
commit 2ae38f6
Showing
9 changed files
with
562 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#include "stencil.h" | ||
|
||
// Shifts a component `src` of a vector field by `shx` cells along the X-axis. | ||
// Unlike the normal `shiftx()`, the new edge value is the current edge value. | ||
// | ||
// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is | ||
// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. | ||
// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, | ||
// `clampL` or `clampR` is used for the component `src` instead. | ||
extern "C" __global__ void | ||
shiftedgecarryX(float* __restrict__ dst, float* __restrict__ src, | ||
float* __restrict__ othercomp, float* __restrict__ anothercomp, | ||
int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { | ||
|
||
int ix = blockIdx.x * blockDim.x + threadIdx.x; | ||
int iy = blockIdx.y * blockDim.y + threadIdx.y; | ||
int iz = blockIdx.z * blockDim.z + threadIdx.z; | ||
|
||
if(ix < Nx && iy < Ny && iz < Nz) { | ||
int ix2 = ix-shx; // old X-index | ||
float newval; | ||
if (ix2 < 0) { // left edge (shifting right) | ||
newval = src[idx(0, iy, iz)]; | ||
if (newval == 0 && othercomp[idx(0, iy, iz)] == 0 && anothercomp[idx(0, iy, iz)] == 0) { // If zero-vector | ||
newval = clampL; | ||
} | ||
} else if (ix2 >= Nx) { // right edge (shifting left) | ||
newval = src[idx(Nx-1, iy, iz)]; | ||
if (newval == 0 && othercomp[idx(Nx-1, iy, iz)] == 0 && anothercomp[idx(Nx-1, iy, iz)] == 0) { // If zero-vector | ||
newval = clampR; | ||
} | ||
} else { // bulk, doesn't matter which way the shift is | ||
newval = src[idx(ix2, iy, iz)]; | ||
} | ||
dst[idx(ix, iy, iz)] = newval; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
package cuda | ||
|
||
/* | ||
THIS FILE IS AUTO-GENERATED BY CUDA2GO. | ||
EDITING IS FUTILE. | ||
*/ | ||
|
||
import( | ||
"unsafe" | ||
"github.com/MathieuMoalic/amumax/src/cuda/cu" | ||
"github.com/MathieuMoalic/amumax/src/timer" | ||
"sync" | ||
) | ||
|
||
// CUDA handle for shiftedgecarryX kernel | ||
var shiftedgecarryX_code cu.Function | ||
|
||
// Stores the arguments for shiftedgecarryX kernel invocation | ||
type shiftedgecarryX_args_t struct{ | ||
arg_dst unsafe.Pointer | ||
arg_src unsafe.Pointer | ||
arg_othercomp unsafe.Pointer | ||
arg_anothercomp unsafe.Pointer | ||
arg_Nx int | ||
arg_Ny int | ||
arg_Nz int | ||
arg_shx int | ||
arg_clampL float32 | ||
arg_clampR float32 | ||
argptr [10]unsafe.Pointer | ||
sync.Mutex | ||
} | ||
|
||
// Stores the arguments for shiftedgecarryX kernel invocation | ||
var shiftedgecarryX_args shiftedgecarryX_args_t | ||
|
||
func init(){ | ||
// CUDA driver kernel call wants pointers to arguments, set them up once. | ||
shiftedgecarryX_args.argptr[0] = unsafe.Pointer(&shiftedgecarryX_args.arg_dst) | ||
shiftedgecarryX_args.argptr[1] = unsafe.Pointer(&shiftedgecarryX_args.arg_src) | ||
shiftedgecarryX_args.argptr[2] = unsafe.Pointer(&shiftedgecarryX_args.arg_othercomp) | ||
shiftedgecarryX_args.argptr[3] = unsafe.Pointer(&shiftedgecarryX_args.arg_anothercomp) | ||
shiftedgecarryX_args.argptr[4] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nx) | ||
shiftedgecarryX_args.argptr[5] = unsafe.Pointer(&shiftedgecarryX_args.arg_Ny) | ||
shiftedgecarryX_args.argptr[6] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nz) | ||
shiftedgecarryX_args.argptr[7] = unsafe.Pointer(&shiftedgecarryX_args.arg_shx) | ||
shiftedgecarryX_args.argptr[8] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampL) | ||
shiftedgecarryX_args.argptr[9] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampR) | ||
} | ||
|
||
// Wrapper for shiftedgecarryX CUDA kernel, asynchronous. | ||
func k_shiftedgecarryX_async ( dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { | ||
if Synchronous{ // debug | ||
Sync() | ||
timer.Start("shiftedgecarryX") | ||
} | ||
|
||
shiftedgecarryX_args.Lock() | ||
defer shiftedgecarryX_args.Unlock() | ||
|
||
if shiftedgecarryX_code == 0{ | ||
shiftedgecarryX_code = fatbinLoad(shiftedgecarryX_map, "shiftedgecarryX") | ||
} | ||
|
||
shiftedgecarryX_args.arg_dst = dst | ||
shiftedgecarryX_args.arg_src = src | ||
shiftedgecarryX_args.arg_othercomp = othercomp | ||
shiftedgecarryX_args.arg_anothercomp = anothercomp | ||
shiftedgecarryX_args.arg_Nx = Nx | ||
shiftedgecarryX_args.arg_Ny = Ny | ||
shiftedgecarryX_args.arg_Nz = Nz | ||
shiftedgecarryX_args.arg_shx = shx | ||
shiftedgecarryX_args.arg_clampL = clampL | ||
shiftedgecarryX_args.arg_clampR = clampR | ||
|
||
|
||
args := shiftedgecarryX_args.argptr[:] | ||
cu.LaunchKernel(shiftedgecarryX_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) | ||
|
||
if Synchronous{ // debug | ||
Sync() | ||
timer.Stop("shiftedgecarryX") | ||
} | ||
} | ||
|
||
// maps compute capability on PTX code for shiftedgecarryX kernel. | ||
var shiftedgecarryX_map = map[int]string{ 0: "" , | ||
52: shiftedgecarryX_ptx_52 } | ||
|
||
// shiftedgecarryX PTX code for various compute capabilities. | ||
const( | ||
shiftedgecarryX_ptx_52 = ` | ||
.version 7.0 | ||
.target sm_52 | ||
.address_size 64 | ||
// .globl shiftedgecarryX | ||
.visible .entry shiftedgecarryX( | ||
.param .u64 shiftedgecarryX_param_0, | ||
.param .u64 shiftedgecarryX_param_1, | ||
.param .u64 shiftedgecarryX_param_2, | ||
.param .u64 shiftedgecarryX_param_3, | ||
.param .u32 shiftedgecarryX_param_4, | ||
.param .u32 shiftedgecarryX_param_5, | ||
.param .u32 shiftedgecarryX_param_6, | ||
.param .u32 shiftedgecarryX_param_7, | ||
.param .f32 shiftedgecarryX_param_8, | ||
.param .f32 shiftedgecarryX_param_9 | ||
) | ||
{ | ||
.reg .pred %p<14>; | ||
.reg .f32 %f<14>; | ||
.reg .b32 %r<24>; | ||
.reg .b64 %rd<25>; | ||
ld.param.u64 %rd4, [shiftedgecarryX_param_0]; | ||
ld.param.u64 %rd5, [shiftedgecarryX_param_1]; | ||
ld.param.u64 %rd6, [shiftedgecarryX_param_2]; | ||
ld.param.u64 %rd7, [shiftedgecarryX_param_3]; | ||
ld.param.u32 %r7, [shiftedgecarryX_param_4]; | ||
ld.param.u32 %r8, [shiftedgecarryX_param_5]; | ||
ld.param.u32 %r10, [shiftedgecarryX_param_6]; | ||
ld.param.u32 %r9, [shiftedgecarryX_param_7]; | ||
ld.param.f32 %f7, [shiftedgecarryX_param_8]; | ||
ld.param.f32 %f8, [shiftedgecarryX_param_9]; | ||
cvta.to.global.u64 %rd1, %rd7; | ||
cvta.to.global.u64 %rd2, %rd6; | ||
cvta.to.global.u64 %rd3, %rd5; | ||
mov.u32 %r11, %ntid.x; | ||
mov.u32 %r12, %ctaid.x; | ||
mov.u32 %r13, %tid.x; | ||
mad.lo.s32 %r1, %r11, %r12, %r13; | ||
mov.u32 %r14, %ntid.y; | ||
mov.u32 %r15, %ctaid.y; | ||
mov.u32 %r16, %tid.y; | ||
mad.lo.s32 %r2, %r14, %r15, %r16; | ||
mov.u32 %r17, %ntid.z; | ||
mov.u32 %r18, %ctaid.z; | ||
mov.u32 %r19, %tid.z; | ||
mad.lo.s32 %r3, %r17, %r18, %r19; | ||
setp.lt.s32 %p1, %r1, %r7; | ||
setp.lt.s32 %p2, %r2, %r8; | ||
and.pred %p3, %p1, %p2; | ||
setp.lt.s32 %p4, %r3, %r10; | ||
and.pred %p5, %p3, %p4; | ||
@!%p5 bra BB0_11; | ||
bra.uni BB0_1; | ||
BB0_1: | ||
sub.s32 %r4, %r1, %r9; | ||
setp.lt.s32 %p6, %r4, 0; | ||
mad.lo.s32 %r20, %r3, %r8, %r2; | ||
mul.lo.s32 %r5, %r20, %r7; | ||
@%p6 bra BB0_7; | ||
setp.lt.s32 %p7, %r4, %r7; | ||
@%p7 bra BB0_6; | ||
bra.uni BB0_3; | ||
BB0_6: | ||
add.s32 %r22, %r5, %r4; | ||
mul.wide.s32 %rd14, %r22, 4; | ||
add.s64 %rd15, %rd3, %rd14; | ||
ld.global.nc.f32 %f13, [%rd15]; | ||
bra.uni BB0_10; | ||
BB0_7: | ||
mul.wide.s32 %rd16, %r5, 4; | ||
add.s64 %rd17, %rd3, %rd16; | ||
ld.global.nc.f32 %f13, [%rd17]; | ||
setp.neu.f32 %p11, %f13, 0f00000000; | ||
@%p11 bra BB0_10; | ||
add.s64 %rd19, %rd2, %rd16; | ||
ld.global.nc.f32 %f11, [%rd19]; | ||
setp.neu.f32 %p12, %f11, 0f00000000; | ||
@%p12 bra BB0_10; | ||
add.s64 %rd21, %rd1, %rd16; | ||
ld.global.nc.f32 %f12, [%rd21]; | ||
setp.eq.f32 %p13, %f12, 0f00000000; | ||
selp.f32 %f13, %f7, %f13, %p13; | ||
bra.uni BB0_10; | ||
BB0_3: | ||
add.s32 %r21, %r7, %r5; | ||
add.s32 %r6, %r21, -1; | ||
mul.wide.s32 %rd8, %r6, 4; | ||
add.s64 %rd9, %rd3, %rd8; | ||
ld.global.nc.f32 %f13, [%rd9]; | ||
setp.neu.f32 %p8, %f13, 0f00000000; | ||
@%p8 bra BB0_10; | ||
add.s64 %rd11, %rd2, %rd8; | ||
ld.global.nc.f32 %f9, [%rd11]; | ||
setp.neu.f32 %p9, %f9, 0f00000000; | ||
@%p9 bra BB0_10; | ||
add.s64 %rd13, %rd1, %rd8; | ||
ld.global.nc.f32 %f10, [%rd13]; | ||
setp.eq.f32 %p10, %f10, 0f00000000; | ||
selp.f32 %f13, %f8, %f13, %p10; | ||
BB0_10: | ||
cvta.to.global.u64 %rd22, %rd4; | ||
add.s32 %r23, %r5, %r1; | ||
mul.wide.s32 %rd23, %r23, 4; | ||
add.s64 %rd24, %rd22, %rd23; | ||
st.global.f32 [%rd24], %f13; | ||
BB0_11: | ||
ret; | ||
} | ||
` | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#include "stencil.h" | ||
|
||
// Shifts a component `src` of a vector field by `shy` cells along the Y-axis. | ||
// Unlike the normal `shifty()`, the new edge value is the current edge value. | ||
// | ||
// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is | ||
// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. | ||
// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, | ||
// `clampD` or `clampU` is used for the component `src` instead. | ||
extern "C" __global__ void | ||
shiftedgecarryY(float* __restrict__ dst, float* __restrict__ src, | ||
float* __restrict__ othercomp, float* __restrict__ anothercomp, | ||
int Nx, int Ny, int Nz, int shy, float clampD, float clampU) { | ||
|
||
int ix = blockIdx.x * blockDim.x + threadIdx.x; | ||
int iy = blockIdx.y * blockDim.y + threadIdx.y; | ||
int iz = blockIdx.z * blockDim.z + threadIdx.z; | ||
|
||
if(ix < Nx && iy < Ny && iz < Nz) { | ||
int iy2 = iy-shy; // old Y-index | ||
float newval; | ||
if (iy2 < 0) { // bottom edge (shifting up) | ||
newval = src[idx(ix, 0, iz)]; | ||
if (newval == 0 && othercomp[idx(ix, 0, iz)] == 0 && anothercomp[idx(ix, 0, iz)] == 0) { // If zero-vector | ||
newval = clampD; | ||
} | ||
} else if (iy2 >= Ny) { // top edge (shifting down) | ||
newval = src[idx(ix, Ny-1, iz)]; | ||
if (newval == 0 && othercomp[idx(ix, Ny-1, iz)] == 0 && anothercomp[idx(ix, Ny-1, iz)] == 0) { // If zero-vector | ||
newval = clampU; | ||
} | ||
} else { // bulk, doesn't matter which way the shift is | ||
newval = src[idx(ix, iy2, iz)]; | ||
} | ||
dst[idx(ix, iy, iz)] = newval; | ||
} | ||
} |
Oops, something went wrong.