diff --git a/src/cuda/cuda2go.go b/src/cuda/cuda2go.go index 6699816..3489381 100644 --- a/src/cuda/cuda2go.go +++ b/src/cuda/cuda2go.go @@ -112,8 +112,8 @@ func wrapgen(filename, funcname string, argt, argn []string) { // find corresponding .PTX files if ls == nil { dir, errd := os.Open(".") - defer dir.Close() log.Log.PanicIfError(errd) + defer dir.Close() var errls error ls, errls = dir.Readdirnames(-1) log.Log.PanicIfError(errls) diff --git a/src/cuda/shift.go b/src/cuda/shift.go index 3a1a873..9656423 100644 --- a/src/cuda/shift.go +++ b/src/cuda/shift.go @@ -15,6 +15,21 @@ func ShiftX(dst, src *data.Slice, shiftX int, clampL, clampR float32) { k_shiftx_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) } +// Shifts a component `src` of a vector field by `shiftX` cells along the X-axis. +// Unlike the normal `shift()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampL` or `clampR` is used for the component `src` instead. +func ShiftEdgeCarryX(dst, src, othercomp, anothercomp *data.Slice, shiftX int, clampL, clampR float32) { + log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1, "Component mismatch: dst, src, othercomp and anothercomp must all have 1 component in ShiftEdgeCarryX") + log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftEdgeCarryX") + N := dst.Size() + cfg := make3DConf(N) + k_shiftedgecarryX_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) +} + func ShiftY(dst, src *data.Slice, shiftY int, clampL, clampR float32) { log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1, "Component mismatch: dst and src must both have 1 component in ShiftY") log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftY") @@ -23,6 +38,21 @@ func ShiftY(dst, src *data.Slice, shiftY int, clampL, clampR float32) { k_shifty_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampL, clampR, cfg) } +// Shifts a component `src` of a vector field by `shiftY` cells along the Y-axis. +// Unlike the normal `shift()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampD` or `clampU` is used for the component `src` instead. 
+func ShiftEdgeCarryY(dst, src, othercomp, anothercomp *data.Slice, shiftY int, clampD, clampU float32) { + log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1 && othercomp.NComp() == 1 && anothercomp.NComp() == 1, "Component mismatch: dst, src, othercomp and anothercomp must all have 1 component in ShiftEdgeCarryY") + log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftEdgeCarryY") + N := dst.Size() + cfg := make3DConf(N) + k_shiftedgecarryY_async(dst.DevPtr(0), src.DevPtr(0), othercomp.DevPtr(0), anothercomp.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampD, clampU, cfg) +} + func ShiftZ(dst, src *data.Slice, shiftZ int, clampL, clampR float32) { log.AssertMsg(dst.NComp() == 1 && src.NComp() == 1, "Component mismatch: dst and src must both have 1 component in ShiftZ") log.AssertMsg(dst.Len() == src.Len(), "Length mismatch: dst and src must have the same length in ShiftZ") diff --git a/src/cuda/shiftedgecarryx.cu b/src/cuda/shiftedgecarryx.cu new file mode 100644 index 0000000..0e10967 --- /dev/null +++ b/src/cuda/shiftedgecarryx.cu @@ -0,0 +1,37 @@ +#include "stencil.h" + +// Shifts a component `src` of a vector field by `shx` cells along the X-axis. +// Unlike the normal `shiftx()`, the new edge value is the current edge value. +// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampL` or `clampR` is used for the component `src` instead. +extern "C" __global__ void +shiftedgecarryX(float* __restrict__ dst, float* __restrict__ src, + float* __restrict__ othercomp, float* __restrict__ anothercomp, + int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { + + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + int iz = blockIdx.z * blockDim.z + threadIdx.z; + + if(ix < Nx && iy < Ny && iz < Nz) { + int ix2 = ix-shx; // old X-index + float newval; + if (ix2 < 0) { // left edge (shifting right) + newval = src[idx(0, iy, iz)]; + if (newval == 0 && othercomp[idx(0, iy, iz)] == 0 && anothercomp[idx(0, iy, iz)] == 0) { // If zero-vector + newval = clampL; + } + } else if (ix2 >= Nx) { // right edge (shifting left) + newval = src[idx(Nx-1, iy, iz)]; + if (newval == 0 && othercomp[idx(Nx-1, iy, iz)] == 0 && anothercomp[idx(Nx-1, iy, iz)] == 0) { // If zero-vector + newval = clampR; + } + } else { // bulk, doesn't matter which way the shift is + newval = src[idx(ix2, iy, iz)]; + } + dst[idx(ix, iy, iz)] = newval; + } +} \ No newline at end of file diff --git a/src/cuda/shiftedgecarryx_wrapper.go b/src/cuda/shiftedgecarryx_wrapper.go new file mode 100644 index 0000000..7327d89 --- /dev/null +++ b/src/cuda/shiftedgecarryx_wrapper.go @@ -0,0 +1,219 @@ +package cuda + +/* + THIS FILE IS AUTO-GENERATED BY CUDA2GO. + EDITING IS FUTILE. 
+*/ + +import( + "unsafe" + "github.com/MathieuMoalic/amumax/src/cuda/cu" + "github.com/MathieuMoalic/amumax/src/timer" + "sync" +) + +// CUDA handle for shiftedgecarryX kernel +var shiftedgecarryX_code cu.Function + +// Stores the arguments for shiftedgecarryX kernel invocation +type shiftedgecarryX_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_othercomp unsafe.Pointer + arg_anothercomp unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shx int + arg_clampL float32 + arg_clampR float32 + argptr [10]unsafe.Pointer + sync.Mutex +} + +// Stores the arguments for shiftedgecarryX kernel invocation +var shiftedgecarryX_args shiftedgecarryX_args_t + +func init(){ + // CUDA driver kernel call wants pointers to arguments, set them up once. + shiftedgecarryX_args.argptr[0] = unsafe.Pointer(&shiftedgecarryX_args.arg_dst) + shiftedgecarryX_args.argptr[1] = unsafe.Pointer(&shiftedgecarryX_args.arg_src) + shiftedgecarryX_args.argptr[2] = unsafe.Pointer(&shiftedgecarryX_args.arg_othercomp) + shiftedgecarryX_args.argptr[3] = unsafe.Pointer(&shiftedgecarryX_args.arg_anothercomp) + shiftedgecarryX_args.argptr[4] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nx) + shiftedgecarryX_args.argptr[5] = unsafe.Pointer(&shiftedgecarryX_args.arg_Ny) + shiftedgecarryX_args.argptr[6] = unsafe.Pointer(&shiftedgecarryX_args.arg_Nz) + shiftedgecarryX_args.argptr[7] = unsafe.Pointer(&shiftedgecarryX_args.arg_shx) + shiftedgecarryX_args.argptr[8] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampL) + shiftedgecarryX_args.argptr[9] = unsafe.Pointer(&shiftedgecarryX_args.arg_clampR) + } + +// Wrapper for shiftedgecarryX CUDA kernel, asynchronous. +func k_shiftedgecarryX_async ( dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { + if Synchronous{ // debug + Sync() + timer.Start("shiftedgecarryX") + } + + shiftedgecarryX_args.Lock() + defer shiftedgecarryX_args.Unlock() + + if shiftedgecarryX_code == 0{ + shiftedgecarryX_code = fatbinLoad(shiftedgecarryX_map, "shiftedgecarryX") + } + + shiftedgecarryX_args.arg_dst = dst + shiftedgecarryX_args.arg_src = src + shiftedgecarryX_args.arg_othercomp = othercomp + shiftedgecarryX_args.arg_anothercomp = anothercomp + shiftedgecarryX_args.arg_Nx = Nx + shiftedgecarryX_args.arg_Ny = Ny + shiftedgecarryX_args.arg_Nz = Nz + shiftedgecarryX_args.arg_shx = shx + shiftedgecarryX_args.arg_clampL = clampL + shiftedgecarryX_args.arg_clampR = clampR + + + args := shiftedgecarryX_args.argptr[:] + cu.LaunchKernel(shiftedgecarryX_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) + + if Synchronous{ // debug + Sync() + timer.Stop("shiftedgecarryX") + } +} + +// maps compute capability on PTX code for shiftedgecarryX kernel. +var shiftedgecarryX_map = map[int]string{ 0: "" , +52: shiftedgecarryX_ptx_52 } + +// shiftedgecarryX PTX code for various compute capabilities. 
+const( + shiftedgecarryX_ptx_52 = ` +.version 7.0 +.target sm_52 +.address_size 64 + + // .globl shiftedgecarryX + +.visible .entry shiftedgecarryX( + .param .u64 shiftedgecarryX_param_0, + .param .u64 shiftedgecarryX_param_1, + .param .u64 shiftedgecarryX_param_2, + .param .u64 shiftedgecarryX_param_3, + .param .u32 shiftedgecarryX_param_4, + .param .u32 shiftedgecarryX_param_5, + .param .u32 shiftedgecarryX_param_6, + .param .u32 shiftedgecarryX_param_7, + .param .f32 shiftedgecarryX_param_8, + .param .f32 shiftedgecarryX_param_9 +) +{ + .reg .pred %p<14>; + .reg .f32 %f<14>; + .reg .b32 %r<24>; + .reg .b64 %rd<25>; + + + ld.param.u64 %rd4, [shiftedgecarryX_param_0]; + ld.param.u64 %rd5, [shiftedgecarryX_param_1]; + ld.param.u64 %rd6, [shiftedgecarryX_param_2]; + ld.param.u64 %rd7, [shiftedgecarryX_param_3]; + ld.param.u32 %r7, [shiftedgecarryX_param_4]; + ld.param.u32 %r8, [shiftedgecarryX_param_5]; + ld.param.u32 %r10, [shiftedgecarryX_param_6]; + ld.param.u32 %r9, [shiftedgecarryX_param_7]; + ld.param.f32 %f7, [shiftedgecarryX_param_8]; + ld.param.f32 %f8, [shiftedgecarryX_param_9]; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd5; + mov.u32 %r11, %ntid.x; + mov.u32 %r12, %ctaid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r1, %r11, %r12, %r13; + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r2, %r14, %r15, %r16; + mov.u32 %r17, %ntid.z; + mov.u32 %r18, %ctaid.z; + mov.u32 %r19, %tid.z; + mad.lo.s32 %r3, %r17, %r18, %r19; + setp.lt.s32 %p1, %r1, %r7; + setp.lt.s32 %p2, %r2, %r8; + and.pred %p3, %p1, %p2; + setp.lt.s32 %p4, %r3, %r10; + and.pred %p5, %p3, %p4; + @!%p5 bra BB0_11; + bra.uni BB0_1; + +BB0_1: + sub.s32 %r4, %r1, %r9; + setp.lt.s32 %p6, %r4, 0; + mad.lo.s32 %r20, %r3, %r8, %r2; + mul.lo.s32 %r5, %r20, %r7; + @%p6 bra BB0_7; + + setp.lt.s32 %p7, %r4, %r7; + @%p7 bra BB0_6; + bra.uni BB0_3; + +BB0_6: + add.s32 %r22, %r5, %r4; + mul.wide.s32 %rd14, %r22, 4; + add.s64 %rd15, %rd3, %rd14; + ld.global.nc.f32 %f13, [%rd15]; + bra.uni BB0_10; + +BB0_7: + mul.wide.s32 %rd16, %r5, 4; + add.s64 %rd17, %rd3, %rd16; + ld.global.nc.f32 %f13, [%rd17]; + setp.neu.f32 %p11, %f13, 0f00000000; + @%p11 bra BB0_10; + + add.s64 %rd19, %rd2, %rd16; + ld.global.nc.f32 %f11, [%rd19]; + setp.neu.f32 %p12, %f11, 0f00000000; + @%p12 bra BB0_10; + + add.s64 %rd21, %rd1, %rd16; + ld.global.nc.f32 %f12, [%rd21]; + setp.eq.f32 %p13, %f12, 0f00000000; + selp.f32 %f13, %f7, %f13, %p13; + bra.uni BB0_10; + +BB0_3: + add.s32 %r21, %r7, %r5; + add.s32 %r6, %r21, -1; + mul.wide.s32 %rd8, %r6, 4; + add.s64 %rd9, %rd3, %rd8; + ld.global.nc.f32 %f13, [%rd9]; + setp.neu.f32 %p8, %f13, 0f00000000; + @%p8 bra BB0_10; + + add.s64 %rd11, %rd2, %rd8; + ld.global.nc.f32 %f9, [%rd11]; + setp.neu.f32 %p9, %f9, 0f00000000; + @%p9 bra BB0_10; + + add.s64 %rd13, %rd1, %rd8; + ld.global.nc.f32 %f10, [%rd13]; + setp.eq.f32 %p10, %f10, 0f00000000; + selp.f32 %f13, %f8, %f13, %p10; + +BB0_10: + cvta.to.global.u64 %rd22, %rd4; + add.s32 %r23, %r5, %r1; + mul.wide.s32 %rd23, %r23, 4; + add.s64 %rd24, %rd22, %rd23; + st.global.f32 [%rd24], %f13; + +BB0_11: + ret; +} + + +` + ) diff --git a/src/cuda/shiftedgecarryy.cu b/src/cuda/shiftedgecarryy.cu new file mode 100644 index 0000000..2f6a41f --- /dev/null +++ b/src/cuda/shiftedgecarryy.cu @@ -0,0 +1,37 @@ +#include "stencil.h" + +// Shifts a component `src` of a vector field by `shy` cells along the Y-axis. +// Unlike the normal `shifty()`, the new edge value is the current edge value. 
+// +// To avoid the situation where the magnetization could be set to (0,0,0) within the geometry, it is +// also required to pass the two other vector components `othercomp` and `anothercomp` to this function. +// In cells where the vector (`src`, `othercomp`, `anothercomp`) is the zero-vector, +// `clampD` or `clampU` is used for the component `src` instead. +extern "C" __global__ void +shiftedgecarryY(float* __restrict__ dst, float* __restrict__ src, + float* __restrict__ othercomp, float* __restrict__ anothercomp, + int Nx, int Ny, int Nz, int shy, float clampD, float clampU) { + + int ix = blockIdx.x * blockDim.x + threadIdx.x; + int iy = blockIdx.y * blockDim.y + threadIdx.y; + int iz = blockIdx.z * blockDim.z + threadIdx.z; + + if(ix < Nx && iy < Ny && iz < Nz) { + int iy2 = iy-shy; // old Y-index + float newval; + if (iy2 < 0) { // bottom edge (shifting up) + newval = src[idx(ix, 0, iz)]; + if (newval == 0 && othercomp[idx(ix, 0, iz)] == 0 && anothercomp[idx(ix, 0, iz)] == 0) { // If zero-vector + newval = clampD; + } + } else if (iy2 >= Ny) { // top edge (shifting down) + newval = src[idx(ix, Ny-1, iz)]; + if (newval == 0 && othercomp[idx(ix, Ny-1, iz)] == 0 && anothercomp[idx(ix, Ny-1, iz)] == 0) { // If zero-vector + newval = clampU; + } + } else { // bulk, doesn't matter which way the shift is + newval = src[idx(ix, iy2, iz)]; + } + dst[idx(ix, iy, iz)] = newval; + } +} \ No newline at end of file diff --git a/src/cuda/shiftedgecarryy_wrapper.go b/src/cuda/shiftedgecarryy_wrapper.go new file mode 100644 index 0000000..1c909c8 --- /dev/null +++ b/src/cuda/shiftedgecarryy_wrapper.go @@ -0,0 +1,222 @@ +package cuda + +/* + THIS FILE IS AUTO-GENERATED BY CUDA2GO. + EDITING IS FUTILE. +*/ + +import( + "unsafe" + "github.com/MathieuMoalic/amumax/src/cuda/cu" + "github.com/MathieuMoalic/amumax/src/timer" + "sync" +) + +// CUDA handle for shiftedgecarryY kernel +var shiftedgecarryY_code cu.Function + +// Stores the arguments for shiftedgecarryY kernel invocation +type shiftedgecarryY_args_t struct{ + arg_dst unsafe.Pointer + arg_src unsafe.Pointer + arg_othercomp unsafe.Pointer + arg_anothercomp unsafe.Pointer + arg_Nx int + arg_Ny int + arg_Nz int + arg_shy int + arg_clampD float32 + arg_clampU float32 + argptr [10]unsafe.Pointer + sync.Mutex +} + +// Stores the arguments for shiftedgecarryY kernel invocation +var shiftedgecarryY_args shiftedgecarryY_args_t + +func init(){ + // CUDA driver kernel call wants pointers to arguments, set them up once. + shiftedgecarryY_args.argptr[0] = unsafe.Pointer(&shiftedgecarryY_args.arg_dst) + shiftedgecarryY_args.argptr[1] = unsafe.Pointer(&shiftedgecarryY_args.arg_src) + shiftedgecarryY_args.argptr[2] = unsafe.Pointer(&shiftedgecarryY_args.arg_othercomp) + shiftedgecarryY_args.argptr[3] = unsafe.Pointer(&shiftedgecarryY_args.arg_anothercomp) + shiftedgecarryY_args.argptr[4] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nx) + shiftedgecarryY_args.argptr[5] = unsafe.Pointer(&shiftedgecarryY_args.arg_Ny) + shiftedgecarryY_args.argptr[6] = unsafe.Pointer(&shiftedgecarryY_args.arg_Nz) + shiftedgecarryY_args.argptr[7] = unsafe.Pointer(&shiftedgecarryY_args.arg_shy) + shiftedgecarryY_args.argptr[8] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampD) + shiftedgecarryY_args.argptr[9] = unsafe.Pointer(&shiftedgecarryY_args.arg_clampU) + } + +// Wrapper for shiftedgecarryY CUDA kernel, asynchronous. 
+func k_shiftedgecarryY_async ( dst unsafe.Pointer, src unsafe.Pointer, othercomp unsafe.Pointer, anothercomp unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampD float32, clampU float32, cfg *config) { + if Synchronous{ // debug + Sync() + timer.Start("shiftedgecarryY") + } + + shiftedgecarryY_args.Lock() + defer shiftedgecarryY_args.Unlock() + + if shiftedgecarryY_code == 0{ + shiftedgecarryY_code = fatbinLoad(shiftedgecarryY_map, "shiftedgecarryY") + } + + shiftedgecarryY_args.arg_dst = dst + shiftedgecarryY_args.arg_src = src + shiftedgecarryY_args.arg_othercomp = othercomp + shiftedgecarryY_args.arg_anothercomp = anothercomp + shiftedgecarryY_args.arg_Nx = Nx + shiftedgecarryY_args.arg_Ny = Ny + shiftedgecarryY_args.arg_Nz = Nz + shiftedgecarryY_args.arg_shy = shy + shiftedgecarryY_args.arg_clampD = clampD + shiftedgecarryY_args.arg_clampU = clampU + + + args := shiftedgecarryY_args.argptr[:] + cu.LaunchKernel(shiftedgecarryY_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) + + if Synchronous{ // debug + Sync() + timer.Stop("shiftedgecarryY") + } +} + +// maps compute capability on PTX code for shiftedgecarryY kernel. +var shiftedgecarryY_map = map[int]string{ 0: "" , +52: shiftedgecarryY_ptx_52 } + +// shiftedgecarryY PTX code for various compute capabilities. +const( + shiftedgecarryY_ptx_52 = ` +.version 7.0 +.target sm_52 +.address_size 64 + + // .globl shiftedgecarryY + +.visible .entry shiftedgecarryY( + .param .u64 shiftedgecarryY_param_0, + .param .u64 shiftedgecarryY_param_1, + .param .u64 shiftedgecarryY_param_2, + .param .u64 shiftedgecarryY_param_3, + .param .u32 shiftedgecarryY_param_4, + .param .u32 shiftedgecarryY_param_5, + .param .u32 shiftedgecarryY_param_6, + .param .u32 shiftedgecarryY_param_7, + .param .f32 shiftedgecarryY_param_8, + .param .f32 shiftedgecarryY_param_9 +) +{ + .reg .pred %p<14>; + .reg .f32 %f<14>; + .reg .b32 %r<27>; + .reg .b64 %rd<25>; + + + ld.param.u64 %rd4, [shiftedgecarryY_param_0]; + ld.param.u64 %rd5, [shiftedgecarryY_param_1]; + ld.param.u64 %rd6, [shiftedgecarryY_param_2]; + ld.param.u64 %rd7, [shiftedgecarryY_param_3]; + ld.param.u32 %r8, [shiftedgecarryY_param_4]; + ld.param.u32 %r9, [shiftedgecarryY_param_5]; + ld.param.u32 %r11, [shiftedgecarryY_param_6]; + ld.param.u32 %r10, [shiftedgecarryY_param_7]; + ld.param.f32 %f7, [shiftedgecarryY_param_8]; + ld.param.f32 %f8, [shiftedgecarryY_param_9]; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd6; + cvta.to.global.u64 %rd3, %rd5; + mov.u32 %r12, %ntid.x; + mov.u32 %r13, %ctaid.x; + mov.u32 %r14, %tid.x; + mad.lo.s32 %r1, %r12, %r13, %r14; + mov.u32 %r15, %ntid.y; + mov.u32 %r16, %ctaid.y; + mov.u32 %r17, %tid.y; + mad.lo.s32 %r2, %r15, %r16, %r17; + mov.u32 %r18, %ntid.z; + mov.u32 %r19, %ctaid.z; + mov.u32 %r20, %tid.z; + mad.lo.s32 %r3, %r18, %r19, %r20; + setp.lt.s32 %p1, %r1, %r8; + setp.lt.s32 %p2, %r2, %r9; + and.pred %p3, %p1, %p2; + setp.lt.s32 %p4, %r3, %r11; + and.pred %p5, %p3, %p4; + @!%p5 bra BB0_11; + bra.uni BB0_1; + +BB0_1: + sub.s32 %r4, %r2, %r10; + setp.lt.s32 %p6, %r4, 0; + mul.lo.s32 %r5, %r3, %r9; + @%p6 bra BB0_7; + + setp.lt.s32 %p7, %r4, %r9; + @%p7 bra BB0_6; + bra.uni BB0_3; + +BB0_6: + add.s32 %r23, %r5, %r4; + mad.lo.s32 %r24, %r23, %r8, %r1; + mul.wide.s32 %rd14, %r24, 4; + add.s64 %rd15, %rd3, %rd14; + ld.global.nc.f32 %f13, [%rd15]; + bra.uni BB0_10; + +BB0_7: + mad.lo.s32 %r7, %r5, %r8, %r1; + mul.wide.s32 %rd16, %r7, 4; + add.s64 %rd17, %rd3, %rd16; + ld.global.nc.f32 %f13, [%rd17]; + 
setp.neu.f32 %p11, %f13, 0f00000000; + @%p11 bra BB0_10; + + add.s64 %rd19, %rd2, %rd16; + ld.global.nc.f32 %f11, [%rd19]; + setp.neu.f32 %p12, %f11, 0f00000000; + @%p12 bra BB0_10; + + add.s64 %rd21, %rd1, %rd16; + ld.global.nc.f32 %f12, [%rd21]; + setp.eq.f32 %p13, %f12, 0f00000000; + selp.f32 %f13, %f7, %f13, %p13; + bra.uni BB0_10; + +BB0_3: + add.s32 %r21, %r9, %r5; + add.s32 %r22, %r21, -1; + mad.lo.s32 %r6, %r22, %r8, %r1; + mul.wide.s32 %rd8, %r6, 4; + add.s64 %rd9, %rd3, %rd8; + ld.global.nc.f32 %f13, [%rd9]; + setp.neu.f32 %p8, %f13, 0f00000000; + @%p8 bra BB0_10; + + add.s64 %rd11, %rd2, %rd8; + ld.global.nc.f32 %f9, [%rd11]; + setp.neu.f32 %p9, %f9, 0f00000000; + @%p9 bra BB0_10; + + add.s64 %rd13, %rd1, %rd8; + ld.global.nc.f32 %f10, [%rd13]; + setp.eq.f32 %p10, %f10, 0f00000000; + selp.f32 %f13, %f8, %f13, %p10; + +BB0_10: + cvta.to.global.u64 %rd22, %rd4; + add.s32 %r25, %r5, %r2; + mad.lo.s32 %r26, %r25, %r8, %r1; + mul.wide.s32 %rd23, %r26, 4; + add.s64 %rd24, %rd22, %rd23; + st.global.f32 [%rd24], %f13; + +BB0_11: + ret; +} + + +` + ) diff --git a/src/engine/geom.go b/src/engine/geom.go index 1168002..274e0fd 100644 --- a/src/engine/geom.go +++ b/src/engine/geom.go @@ -190,6 +190,10 @@ func (g *geom) cellVolume(ix, iy, iz int) float32 { return vol / float32(N*N*N) } +func (g *geom) GetCell(ix, iy, iz int) float64 { + return float64(cuda.GetCell(g.Gpu(), 0, ix, iy, iz)) +} + func (g *geom) shift(dx int) { // empty mask, nothing to do if g == nil || g.Buffer.IsNil() { diff --git a/src/engine/script_var.go b/src/engine/script_var.go index e4a6e3a..b6d1b0f 100644 --- a/src/engine/script_var.go +++ b/src/engine/script_var.go @@ -58,6 +58,7 @@ func init() { declVar("ShiftGeom", &shiftGeom, "Whether Shift() acts on geometry") declVar("ShiftRegions", &shiftRegions, "Whether Shift() acts on regions") declVar("TotalShift", &totalShift, "Amount by which the simulation has been shifted (m).") + declVar("EdgeCarryShift", &EdgeCarryShift, "Whether to use the current magnetization at the border for the cells inserted by Shift") declVar("GammaLL", &gammaLL, "Gyromagnetic ratio in rad/Ts") declVar("DisableZhangLiTorque", &disableZhangLiTorque, "Disables Zhang-Li torque (default=false)") diff --git a/src/engine/shift.go b/src/engine/shift.go index c32db2c..562ed91 100644 --- a/src/engine/shift.go +++ b/src/engine/shift.go @@ -9,6 +9,7 @@ var ( totalShift, totalYShift float64 // accumulated window shift (X and Y) in meter shiftMagL, shiftMagR, shiftMagU, shiftMagD data.Vector // when shifting m, put these value at the left/right edge. shiftM, shiftGeom, shiftRegions bool = true, true, true // should shift act on magnetization, geometry, regions? + EdgeCarryShift bool = true // Use the values of M at the border for the new cells ) // position of the window lab frame @@ -35,7 +36,11 @@ func shiftMag(m *data.Slice, dx int) { defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) - cuda.ShiftX(m2, comp, dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + if EdgeCarryShift { + cuda.ShiftEdgeCarryX(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + } else { + cuda.ShiftX(m2, comp, dx, float32(shiftMagL[c]), float32(shiftMagR[c])) + } data.Copy(comp, m2) // str0 ? 
} } @@ -60,7 +65,11 @@ func shiftMagY(m *data.Slice, dy int) { defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) - cuda.ShiftY(m2, comp, dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + if EdgeCarryShift { + cuda.ShiftEdgeCarryY(m2, comp, m.Comp((c+1)%3), m.Comp((c+2)%3), dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + } else { + cuda.ShiftY(m2, comp, dy, float32(shiftMagU[c]), float32(shiftMagD[c])) + } data.Copy(comp, m2) // str0 ? } }
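
For clarity, below is a minimal CPU-side sketch in Go of the edge-carry rule that the new shiftedgecarryX/shiftedgecarryY kernels apply per row: a cell shifted in from outside the window copies the current edge cell of `src`, and falls back to the clamp value only when the full vector (`src`, `othercomp`, `anothercomp`) is zero at that edge cell. The helper name shiftEdgeCarry1D and the toy values are illustrative only and are not part of this change.

package main

import "fmt"

// shiftEdgeCarry1D mirrors, for one row and one component, the logic of the
// shiftedgecarryX/shiftedgecarryY kernels: bulk cells are shifted as usual,
// while cells entering from an edge copy the current edge value of src,
// falling back to the clamp value only when the full vector
// (src, other, another) is zero at that edge cell.
func shiftEdgeCarry1D(src, other, another []float32, sh int, clampL, clampR float32) []float32 {
	n := len(src)
	dst := make([]float32, n)
	for i := 0; i < n; i++ {
		i2 := i - sh // old index
		switch {
		case i2 < 0: // cell entering from the left edge (shifting right)
			v := src[0]
			if v == 0 && other[0] == 0 && another[0] == 0 {
				v = clampL
			}
			dst[i] = v
		case i2 >= n: // cell entering from the right edge (shifting left)
			v := src[n-1]
			if v == 0 && other[n-1] == 0 && another[n-1] == 0 {
				v = clampR
			}
			dst[i] = v
		default: // bulk: plain shift
			dst[i] = src[i2]
		}
	}
	return dst
}

func main() {
	mx := []float32{0.6, 0.6, 0.6, 0.6}
	my := []float32{0.8, 0.8, 0.8, 0.8}
	mz := []float32{0, 0, 0, 0}
	// Shift right by one cell with clamp value 1: the left edge value 0.6 is
	// carried into the inserted cell instead of the clamp, because (mx,my,mz)
	// is non-zero there. Prints [0.6 0.6 0.6 0.6].
	fmt.Println(shiftEdgeCarry1D(mx, my, mz, 1, 1, 1))
	// With a zero vector at the edge the clamp is used. Prints [1 0 0 0.6].
	fmt.Println(shiftEdgeCarry1D([]float32{0, 0, 0.6, 0.6}, mz, mz, 1, 1, 1))
}

Because EdgeCarryShift is registered through declVar, it should be toggleable from an input script (presumably EdgeCarryShift = false restores the previous clamp-only behaviour); that script-level syntax is an assumption and is not verified by this diff.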