diffusion2D_shmem_novis.jl
const USE_GPU = true
using BenchmarkTools
using ParallelStencil
using ParallelStencil.FiniteDifferences2D
@static if USE_GPU
    @init_parallel_stencil(CUDA, Float64, 2);
else
    @init_parallel_stencil(Threads, Float64, 2);
end
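# 2D heat-diffusion step using shared-memory caching: each thread block first stages
# its tile of T (plus a one-cell halo) in shared memory, then computes the flux
# divergence from the cached values instead of re-reading T from global memory.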
@parallel_indices (ix,iy) function diffusion2D_step!(T2, T, Ci, lam, dt, _dx, _dy)
    # Local (shared-memory) indices, offset by 1 to leave room for the halo.
    tx  = @threadIdx().x + 1
    ty  = @threadIdx().y + 1
    # Shared-memory tile holding the block's patch of T plus a one-cell halo.
    T_l = @sharedMem(eltype(T), (@blockDim().x+2, @blockDim().y+2))
    T_l[tx,ty] = T[ix,iy]
    if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))
        # Threads on the block edges additionally load the halo cells.
        if (@threadIdx().x == 1)             T_l[tx-1,ty] = T[ix-1,iy] end
        if (@threadIdx().x == @blockDim().x) T_l[tx+1,ty] = T[ix+1,iy] end
        if (@threadIdx().y == 1)             T_l[tx,ty-1] = T[ix,iy-1] end
        if (@threadIdx().y == @blockDim().y) T_l[tx,ty+1] = T[ix,iy+1] end
        @sync_threads()  # Make sure the whole tile is loaded before computing.
        # Diffusion update: divergence of the heat flux, read entirely from shared memory.
        T2[ix,iy] = T_l[tx,ty] + dt*(Ci[ix,iy]*(
                    - ((-lam*(T_l[tx+1,ty] - T_l[tx,ty])*_dx) - (-lam*(T_l[tx,ty] - T_l[tx-1,ty])*_dx))*_dx
                    - ((-lam*(T_l[tx,ty+1] - T_l[tx,ty])*_dy) - (-lam*(T_l[tx,ty] - T_l[tx,ty-1])*_dy))*_dy
                    ));
    end
    return
end
function diffusion2D()
    # Physics
    lam      = 1.0;                    # Thermal conductivity
    c0       = 2.0;                    # Heat capacity
    lx, ly   = 1.0, 1.0;               # Length of computational domain in dimension x and y
    # Numerics
    nx, ny   = 512*32, 512*32;         # Number of gridpoints in dimensions x and y
    nt       = 100;                    # Number of time steps
    dx       = lx/(nx-1);              # Space step in x-dimension
    dy       = ly/(ny-1);              # Space step in y-dimension
    _dx, _dy = 1.0/dx, 1.0/dy;
    # Array initializations
    T  = @zeros(nx, ny);
    T2 = @zeros(nx, ny);
    Ci = @zeros(nx, ny);
    # Initial conditions
    Ci .= 1/c0;                        # 1/Heat capacity
    T  .= 1.7;
    T2 .= T;                           # Assign also T2 to get correct boundary conditions.
    #
    # GPU launch parameters
    threads = (32, 8)
    blocks  = (nx, ny) .÷ threads
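    # Note: nx and ny are exact multiples of the block dimensions, so blocks .* threads covers the (nx, ny) grid exactly.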
    #
    # Time loop
    dt = min(dx^2,dy^2)/lam/maximum(Ci)/4.1;  # Time step for 2D heat diffusion
    for it = 1:nt
        if (it == 11) GC.enable(false); global t_tic=time(); end  # Start measuring time (skip the first 10 iterations as warm-up).
        @parallel blocks threads shmem=prod(threads.+2)*sizeof(Float64) diffusion2D_step!(T2, T, Ci, lam, dt, _dx, _dy);
        T, T2 = T2, T;
    end
    time_s = time() - t_tic
    # Performance (manual timing)
    A_eff = (2*1+1)*1/1e9*nx*ny*sizeof(eltype(T));  # Effective main memory access per iteration [GB] (lower bound of required memory access: T has to be read and written: 2 whole-array memaccesses; Ci has to be read: 1 whole-array memaccess)
    t_it  = time_s/(nt-10);                         # Execution time per iteration [s]
    T_eff = A_eff/t_it;                             # Effective memory throughput [GB/s]
    println("time_s=$time_s t_it=$t_it T_eff=$T_eff");
    # Performance (BenchmarkTools)
    A_eff = (2*1+1)*1/1e9*nx*ny*sizeof(eltype(T));  # Effective main memory access per iteration [GB] (lower bound of required memory access: T has to be read and written: 2 whole-array memaccesses; Ci has to be read: 1 whole-array memaccess)
    t_it  = @belapsed begin @parallel $blocks $threads shmem=prod($threads.+2)*sizeof(Float64) diffusion2D_step!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); end
    println("BenchmarkTools (min): t_it=$t_it T_eff=$(A_eff/t_it)");
end
diffusion2D()