diff --git a/Project.toml b/Project.toml
index faf44b6..df41a8e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,11 +6,13 @@ version = "0.11.0"
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 
 [compat]
 CUDA = "1, ~3.1, ~3.2, ~3.3"
-MPI = "0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18"
+MPI = "0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19"
 julia = "1"
+LoopVectorization = "0.12"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index 8abd8eb..98789e0 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -39,17 +39,24 @@ Initialize a Cartesian grid of MPI processes (and also MPI itself by default) de
 See also: [`finalize_global_grid`](@ref)
 """
 function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlapx::Integer=2, overlapy::Integer=2, overlapz::Integer=2, disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, quiet::Bool=false)
-    nxyz          = [nx, ny, nz];
-    dims          = [dimx, dimy, dimz];
-    periods       = [periodx, periody, periodz];
-    overlaps      = [overlapx, overlapy, overlapz];
-    cudaaware_MPI = [false, false, false]
+    nxyz              = [nx, ny, nz];
+    dims              = [dimx, dimy, dimz];
+    periods           = [periodx, periody, periodz];
+    overlaps          = [overlapx, overlapy, overlapz];
+    cudaaware_MPI     = [false, false, false]
+    loopvectorization = [true, true, true]
     if haskey(ENV, "IGG_CUDAAWARE_MPI") cudaaware_MPI .= (parse(Int64, ENV["IGG_CUDAAWARE_MPI"]) > 0); end
+    if haskey(ENV, "IGG_LOOPVECTORIZATION") loopvectorization .= (parse(Int64, ENV["IGG_LOOPVECTORIZATION"]) > 0); end
     if none(cudaaware_MPI)
         if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMX") cudaaware_MPI[1] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMX"]) > 0); end
         if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMY") cudaaware_MPI[2] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMY"]) > 0); end
         if haskey(ENV, "IGG_CUDAAWARE_MPI_DIMZ") cudaaware_MPI[3] = (parse(Int64, ENV["IGG_CUDAAWARE_MPI_DIMZ"]) > 0); end
     end
+    if all(loopvectorization)
+        if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMX") loopvectorization[1] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMX"]) > 0); end
+        if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMY") loopvectorization[2] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMY"]) > 0); end
+        if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
+    end
     if (nx==1) error("Invalid arguments: nx can never be 1.") end
     if (ny==1 && nz>1) error("Invalid arguments: ny cannot be 1 if nz is greater than 1.") end
     if (any((nxyz .== 1) .& (dims .>1 ))) error("Incoherent arguments: if nx, ny, or nz is 1, then the corresponding dimx, dimy or dimz must not be set (or set 0 or 1)."); end
@@ -71,7 +78,7 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         neighbors[:,i] .= MPI.Cart_shift(comm_cart, i-1, disp);
     end
     nxyz_g = dims.*(nxyz.-overlaps) .+ overlaps.*(periods.==0); # E.g. for dimension x with ol=2 and periodx=0: dimx*(nx-2)+2
-    set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cudaaware_MPI, quiet));
+    set_global_grid(GlobalGrid(nxyz_g, nxyz, dims, overlaps, nprocs, me, coords, neighbors, periods, disp, reorder, comm_cart, cudaaware_MPI, loopvectorization, quiet));
     if (!quiet && me==0) println("Global grid: $(nxyz_g[1])x$(nxyz_g[2])x$(nxyz_g[3]) (nprocs: $nprocs, dims: $(dims[1])x$(dims[2])x$(dims[3]))"); end
     init_timing_functions();
     return me, dims, nprocs, coords, comm_cart; # The typical use case requires only these variables; the remaining can be obtained calling get_global_grid() if needed.
diff --git a/src/shared.jl b/src/shared.jl
index 10c4a3b..4d29a21 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -25,10 +25,10 @@ __init__() = (
 
 ##--------------------
 ## CONSTANT PARAMETERS
-const NDIMS_MPI            = 3      # Internally, we set the number of dimensions always to 3 for calls to MPI. This ensures a fixed size for MPI coords, neigbors, etc and in general a simple, easy to read code.
-const NNEIGHBORS_PER_DIM   = 2      # Number of neighbors per dimension (left neighbor + right neighbor).
-const GG_ALLOC_GRANULARITY = 32     # Internal buffers are allocated with a granulariy of GG_ALLOC_GRANULARITY elements in order to ensure correct reinterpretation when used for different types and to reduce amount of re-allocations.
-
+const NDIMS_MPI               = 3       # Internally, we set the number of dimensions always to 3 for calls to MPI. This ensures a fixed size for MPI coords, neighbors, etc. and in general a simple, easy to read code.
+const NNEIGHBORS_PER_DIM      = 2       # Number of neighbors per dimension (left neighbor + right neighbor).
+const GG_ALLOC_GRANULARITY    = 32      # Internal buffers are allocated with a granularity of GG_ALLOC_GRANULARITY elements in order to ensure correct reinterpretation when used for different types and to reduce the amount of re-allocations.
+const GG_THREADCOPY_THRESHOLD = 32768   # When LoopVectorization is deactivated, GG_THREADCOPY_THRESHOLD defines the minimum size in bytes for which memory copies are performed with multiple threads.
 
 ##------
 ## TYPES
@@ -58,9 +58,10 @@ struct GlobalGrid
     reorder::GGInt
     comm::MPI.Comm
     cudaaware_MPI::Vector{Bool}
+    loopvectorization::Vector{Bool}
     quiet::Bool
 end
-const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, [false,false,false], false)
+const GLOBAL_GRID_NULL = GlobalGrid(GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], GGInt[-1,-1,-1], -1, -1, GGInt[-1,-1,-1], GGInt[-1 -1 -1; -1 -1 -1], GGInt[-1,-1,-1], -1, -1, MPI.COMM_NULL, [false,false,false], [true,true,true], false)
 
 # Macro to switch on/off check_initialized() for performance reasons (potentially relevant for tools.jl).
 macro check_initialized() :(check_initialized();) end #FIXME: Alternative: macro check_initialized() end
@@ -93,6 +94,8 @@ neighbors(dim::Integer) = global_grid().neighbors[:,dim]
 neighbor(n::Integer, dim::Integer) = global_grid().neighbors[n,dim]
 cudaaware_MPI() = global_grid().cudaaware_MPI
 cudaaware_MPI(dim::Integer) = global_grid().cudaaware_MPI[dim]
+loopvectorization() = global_grid().loopvectorization
+loopvectorization(dim::Integer) = global_grid().loopvectorization[dim]
 has_neighbor(n::Integer, dim::Integer) = neighbor(n, dim) != MPI.MPI_PROC_NULL
 any_array(fields::GGArray...) = any([is_array(A) for A in fields])
 any_cuarray(fields::GGArray...) = any([is_cuarray(A) for A in fields])
diff --git a/src/update_halo.jl b/src/update_halo.jl
index a160b10..21010bc 100644
--- a/src/update_halo.jl
+++ b/src/update_halo.jl
@@ -5,6 +5,7 @@ import MPI
     using CUDA
 end
 using Base.Threads
+using LoopVectorization
 
 
 """
@@ -408,12 +409,13 @@ function write_h2h!(sendbuf::AbstractArray{T}, A::Array{T}, sendranges::Array{Un
     ix = (length(sendranges[1])==1) ? sendranges[1][1] : sendranges[1];
     iy = (length(sendranges[2])==1) ? sendranges[2][1] : sendranges[2];
     iz = (length(sendranges[3])==1) ? sendranges[3][1] : sendranges[3];
-    if     (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(sendbuf,:), view(view(A,ix, :, :),:));
-    elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1    ) memcopy!(view(sendbuf,:), view(view(A,ix, :,iz),:));
-    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf,:), view(view(A, :,iy, :),:));
-    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1    ) memcopy!(view(sendbuf,:), view(view(A, :,iy,iz),:));
-    elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2)                 ) memcopy!(view(sendbuf,:), view(view(A, :, :,iz),:));
-    elseif (dim == 1 || dim == 2 || dim == 3)                                    memcopy!(view(sendbuf,:), view(view(A,sendranges...),:)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+    if     (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(sendbuf, view(A,ix, :, :), loopvectorization(dim));
+    elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1    ) memcopy!(sendbuf, view(A,ix, :,iz), loopvectorization(dim));
+    elseif (dim == 1 && length(ix)==1 && length(iy)==1     && length(iz)==1    ) memcopy!(sendbuf, view(A,ix,iy,iz), loopvectorization(dim));
+    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(sendbuf, view(A, :,iy, :), loopvectorization(dim));
+    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1    ) memcopy!(sendbuf, view(A, :,iy,iz), loopvectorization(dim));
+    elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2)                 ) memcopy!(sendbuf, view(A, :, :,iz), loopvectorization(dim));
+    elseif (dim == 1 || dim == 2 || dim == 3)                                    memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
     end
 end
 
@@ -422,12 +424,13 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni
     ix = (length(recvranges[1])==1) ? recvranges[1][1] : recvranges[1];
     iy = (length(recvranges[2])==1) ? recvranges[2][1] : recvranges[2];
     iz = (length(recvranges[3])==1) ? recvranges[3][1] : recvranges[3];
-    if     (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(view(A,ix, :, :), :), view(recvbuf,:));
-    elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1    ) memcopy!(view(view(A,ix, :,iz), :), view(recvbuf,:));
-    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(view(A, :,iy, :), :), view(recvbuf,:));
-    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1    ) memcopy!(view(view(A, :,iy,iz), :), view(recvbuf,:));
-    elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2)                 ) memcopy!(view(view(A, :, :,iz), :), view(recvbuf,:));
-    elseif (dim == 1 || dim == 2 || dim == 3)                                    memcopy!(view(view(A,recvranges...),:), view(recvbuf,:)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+    if     (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), recvbuf, loopvectorization(dim));
+    elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1    ) memcopy!(view(A,ix, :,iz), recvbuf, loopvectorization(dim));
+    elseif (dim == 1 && length(ix)==1 && length(iy)==1     && length(iz)==1    ) memcopy!(view(A,ix,iy,iz), recvbuf, loopvectorization(dim));
+    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), recvbuf, loopvectorization(dim));
+    elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1    ) memcopy!(view(A, :,iy,iz), recvbuf, loopvectorization(dim));
+    elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2)                 ) memcopy!(view(A, :, :,iz), recvbuf, loopvectorization(dim));
+    elseif (dim == 1 || dim == 2 || dim == 3)                                    memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
     end
 end
 
@@ -521,27 +524,47 @@ function sendrecv_halo_local(n::Integer, dim::Integer, A::GGArray, i::Integer)
             end
         else
             if n == 1
-                memcopy!(recvbuf_flat(2,dim,i,A), sendbuf_flat(1,dim,i,A));
+                memcopy!(recvbuf_flat(2,dim,i,A), sendbuf_flat(1,dim,i,A), loopvectorization(dim));
             elseif n == 2
-                memcopy!(recvbuf_flat(1,dim,i,A), sendbuf_flat(2,dim,i,A));
+                memcopy!(recvbuf_flat(1,dim,i,A), sendbuf_flat(2,dim,i,A), loopvectorization(dim));
             end
         end
     end
 end
 
-function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
-    if nthreads() > 1
-        @threads for ix = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12
-            @inbounds dst[ix] = src[ix] # NOTE: We fix here exceptionally the use of @inbounds as this copy between two flat vectors (which must have the right length) is considered safe.
+function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T <: GGNumber
+    if loopvectorization && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers, and copying reinterpreted arrays leads to bad performance.
+        memcopy_loopvect!(dst, src)
+    else
+        dst_flat = view(dst,:)
+        src_flat = view(src,:)
+        memcopy_threads!(dst_flat, src_flat)
+    end
+end
+
+function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
+    if nthreads() > 1 && sizeof(src) >= GG_THREADCOPY_THRESHOLD
+        @threads for i = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12
+            @inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds as this copy between two flat vectors (which must have the right length) is considered safe.
+        end
+    else
+        @inbounds copyto!(dst, src)
+    end
+end
+
+function memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
+    if nthreads() > 1 && length(src) > 1
+        @tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface: https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
+            @inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently done by LoopVectorization anyway) as this copy between two flat vectors (which must have the right length) is considered safe.
         end
-    else
-        @inbounds dst .= src
-    end
+    else
+        @inbounds copyto!(dst, src)
+    end
 end
 
 @static if ENABLE_CUDA
     function cumemcopy!(dst::CuArray{T}, src::CuArray{T}) where T <: GGNumber
-        @inbounds dst .= src
+        @inbounds CUDA.copyto!(dst, src)
     end
 end
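
For context on how the new pieces interact: `init_global_grid` now reads `IGG_LOOPVECTORIZATION` (and, for per-dimension overrides, `IGG_LOOPVECTORIZATION_DIMX/Y/Z`) from the environment as integers, with values greater than 0 meaning "activated", and stores the result in the global grid, so that `write_h2h!`, `read_h2h!` and `sendrecv_halo_local` can pass `loopvectorization(dim)` to `memcopy!`. The following is a minimal standalone sketch of the resulting copy dispatch, not the package implementation; `copy_demo!` and `THREADCOPY_THRESHOLD_DEMO` are illustrative names only (the latter mirrors `GG_THREADCOPY_THRESHOLD` above).

```julia
using Base.Threads
using LoopVectorization

# Same value as GG_THREADCOPY_THRESHOLD in the patch: buffers smaller than this
# (in bytes) are copied serially, larger ones with Base.Threads.
const THREADCOPY_THRESHOLD_DEMO = 32768

function copy_demo!(dst::AbstractArray{T}, src::AbstractArray{T}, use_loopvect::Bool) where T <: Number
    if use_loopvect && !(T <: Complex)       # LoopVectorization path (Complex not supported)
        @tturbo for i in eachindex(dst, src) # multi-threaded SIMD copy, as in memcopy_loopvect!
            dst[i] = src[i]
        end
    else                                     # fallback path, as in memcopy_threads!
        dst_flat = view(dst, :)
        src_flat = view(src, :)
        if nthreads() > 1 && sizeof(src) >= THREADCOPY_THRESHOLD_DEMO
            @threads for i in 1:length(dst_flat)   # plain threaded copy for large buffers
                @inbounds dst_flat[i] = src_flat[i]
            end
        else
            copyto!(dst_flat, src_flat)            # serial copy for small buffers
        end
    end
    return dst
end

src = rand(Float64, 512, 512);
dst = similar(src);
copy_demo!(dst, src, true);    # @tturbo copy
@assert dst == src
copy_demo!(dst, src, false);   # threaded / serial copy
@assert dst == src
```

Run it with multiple threads (e.g. `export JULIA_NUM_THREADS=8`) to exercise the `@tturbo` and `@threads` paths; note that the package's `memcopy_loopvect!` additionally falls back to a serial `copyto!` for single-threaded runs and length-1 buffers, which this sketch omits for brevity.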