diff --git a/scripts_future_API/run_stokes3D.sh b/scripts_future_API/run_stokes3D.sh index df15477e..2655f81b 100755 --- a/scripts_future_API/run_stokes3D.sh +++ b/scripts_future_API/run_stokes3D.sh @@ -4,4 +4,7 @@ module load LUMI/22.08 module load partition/G module load rocm/5.3.3 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_PRELOAD=${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so + julia --project -O3 tm_stokes_mpi_wip.jl diff --git a/scripts_future_API/submit_stokes3D.sh b/scripts_future_API/submit_stokes3D.sh index b02d34dc..a48061bb 100755 --- a/scripts_future_API/submit_stokes3D.sh +++ b/scripts_future_API/submit_stokes3D.sh @@ -2,11 +2,17 @@ #SBATCH --job-name="FastIce3D" #SBATCH --output=FastIce3D.%j.o #SBATCH --error=FastIce3D.%j.e -#SBATCH --time=00:30:00 -#SBATCH --nodes=8 +#SBATCH --time=01:00:00 +#SBATCH --nodes=2 #SBATCH --ntasks-per-node=8 #SBATCH --gpus-per-node=8 #SBATCH --partition=standard-g #SBATCH --account project_465000557 +# export ROCR_VISIBLE_DEVICES=0,2,4,6 +# srun --cpu-bind=map_cpu:49,17,1,33 ./run_stokes3D.sh + +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_PRELOAD=${CRAY_MPICH_ROOTDIR}/gtl/lib/libmpi_gtl_hsa.so + srun --cpu-bind=map_cpu:49,57,17,25,1,9,33,41 ./run_stokes3D.sh diff --git a/scripts_future_API/tm_stokes_mpi_wip.jl b/scripts_future_API/tm_stokes_mpi_wip.jl index 25ed04dd..8308d7d0 100644 --- a/scripts_future_API/tm_stokes_mpi_wip.jl +++ b/scripts_future_API/tm_stokes_mpi_wip.jl @@ -18,137 +18,195 @@ using AMDGPU using FastIce.Distributed using MPI -println("import done") - @views avx(A) = 0.5 .* (A[1:end-1, :, :] .+ A[2:end, :, :]) @views avy(A) = 0.5 .* (A[:, 1:end-1, :] .+ A[:, 2:end, :]) @views avz(A) = 0.5 .* (A[:, :, 1:end-1] .+ A[:, :, 2:end]) -MPI.Init() +@views av_xy(A) = 0.25 .* (A[1:end-1, 1:end-1, :] .+ A[2:end, 1:end-1, :] .+ A[2:end, 2:end, :] .+ A[1:end-1, 2:end, :]) +@views av_xz(A) = 0.25 .* (A[1:end-1, :, 1:end-1] .+ A[2:end, :, 1:end-1, :] .+ A[2:end, :, 2:end, :] .+ A[1:end-1, :, 2:end]) +@views av_yz(A) = 0.25 .* (A[:, 1:end-1, 1:end-1] .+ A[:, 2:end, 1:end-1] .+ A[:, 2:end, 2:end] .+ A[:, 1:end-1, 2:end]) -backend = ROCBackend() -dims = (4, 2, 2) -arch = Architecture(backend, dims, MPI.COMM_WORLD) +function main() + MPI.Init(; threadlevel=:multiple) -# physics -ebg = 1.0 + backend = ROCBackend() + dims = (4, 2, 2) + # dims = (0, 0, 0) + arch = Architecture(backend, dims, MPI.COMM_WORLD) + set_device!(arch) -topo = details(arch) + topo = details(arch) -size_l = (254, 254, 254) -size_g = global_grid_size(topo, size_l) + size_l = (126, 126, 126) + size_g = global_grid_size(topo, size_l) -if global_rank(topo) == 0 - @show dimensions(topo) - @show size_g -end + if global_rank(topo) == 0 + @show dimensions(topo) + @show size_g + end -grid_g = CartesianGrid(; origin=(-2.0, -1.0, 0.0), - extent=(4.0, 2.0, 2.0), - size=size_g) - -grid_l = local_grid(grid_g, topo) - -no_slip = VBC(0.0, 0.0, 0.0) -free_slip = SBC(0.0, 0.0, 0.0) -free_surface = TBC(0.0, 0.0, 0.0) - -boundary_conditions = (x = (free_slip, free_slip), - y = (free_slip, free_slip), - z = (no_slip, free_surface)) - -gravity = (x=-0.25, y=0.0, z=1.0) - -# numerics -nt = 50maximum(size(grid_g)) -ncheck = 1maximum(size(grid_g)) - -r = 0.7 -re_mech = 5π -lτ_re_m = minimum(extent(grid_g)) / re_mech -vdτ = minimum(spacing(grid_g)) / sqrt(ndims(grid_g) * 3.1) -θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ -dτ_r = 1.0 / (θ_dτ + 1.0) -nudτ = vdτ * lτ_re_m - -iter_params = (η_rel=1e-1, - Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) - -physics = (rheology=GlensLawRheology(1),) -other_fields = (A=Field(backend, grid_l, Center()),) - -model = IsothermalFullStokesModel(; - arch, - grid=grid_l, - physics, - gravity, - boundary_conditions, - iter_params, - other_fields) - -if global_rank(topo) == 0 - println("model created") - Pr_g = zeros(size(grid_g)) - Vx_g = zeros(size(grid_g)) - Vy_g = zeros(size(grid_g)) - Vz_g = zeros(size(grid_g)) -else - Pr_g = nothing - Vx_g = nothing - Vy_g = nothing - Vz_g = nothing -end + grid_g = CartesianGrid(; origin=(-2.0, -1.0, 0.0), + extent=(4.0, 2.0, 2.0), + size=size_g) + + grid_l = local_grid(grid_g, topo) + + no_slip = VBC(0.0, 0.0, 0.0) + free_slip = SBC(0.0, 0.0, 0.0) + free_surface = TBC(0.0, 0.0, 0.0) + + boundary_conditions = (x = (free_slip, free_slip), + y = (free_slip, free_slip), + z = (no_slip, free_surface)) + + gravity = (x=-0.25, y=0.0, z=1.0) + + # numerics + niter = 20maximum(size(grid_g)) + ncheck = 1maximum(size(grid_g)) + + r = 0.7 + re_mech = 10π + lτ_re_m = minimum(extent(grid_g)) / re_mech + vdτ = minimum(spacing(grid_g)) / sqrt(ndims(grid_g) * 3.1) + θ_dτ = lτ_re_m * (r + 4 / 3) / vdτ + dτ_r = 1.0 / (θ_dτ + 1.0) + nudτ = vdτ * lτ_re_m + + iter_params = (η_rel=1e-1, + Δτ=(Pr=r / θ_dτ, τ=(xx=dτ_r, yy=dτ_r, zz=dτ_r, xy=dτ_r, xz=dτ_r, yz=dτ_r), V=(x=nudτ, y=nudτ, z=nudτ))) + + physics = (rheology=GlensLawRheology(1),) + other_fields = (A=Field(backend, grid_l, Center()),) + + model = IsothermalFullStokesModel(; + arch, + grid=grid_l, + physics, + gravity, + boundary_conditions, + iter_params, + other_fields) + + if global_rank(topo) == 0 + println("model created") + Pr_g = zeros(size(grid_g)) + τxx_g = zeros(size(grid_g)) + τyy_g = zeros(size(grid_g)) + τzz_g = zeros(size(grid_g)) + τxy_g = zeros(size(grid_g)) + τxz_g = zeros(size(grid_g)) + τyz_g = zeros(size(grid_g)) + Vx_g = zeros(size(grid_g)) + Vy_g = zeros(size(grid_g)) + Vz_g = zeros(size(grid_g)) + else + Pr_g = nothing + τxx_g = nothing + τyy_g = nothing + τzz_g = nothing + τxy_g = nothing + τxz_g = nothing + τyz_g = nothing + Vx_g = nothing + Vy_g = nothing + Vz_g = nothing + end -Pr_v = zeros(size(grid_l)) -Vx_v = zeros(size(grid_l)) -Vy_v = zeros(size(grid_l)) -Vz_v = zeros(size(grid_l)) + Pr_v = zeros(size(grid_l)) + τxx_v = zeros(size(grid_l)) + τyy_v = zeros(size(grid_l)) + τzz_v = zeros(size(grid_l)) + τxy_v = zeros(size(grid_l)) + τxz_v = zeros(size(grid_l)) + τyz_v = zeros(size(grid_l)) + Vx_v = zeros(size(grid_l)) + Vy_v = zeros(size(grid_l)) + Vz_v = zeros(size(grid_l)) + + fill!(parent(model.fields.Pr), 0.0) + foreach(x -> fill!(parent(x), 0.0), model.fields.τ) + foreach(x -> fill!(parent(x), 0.0), model.fields.V) + fill!(parent(other_fields.A), 1.0) + fill!(parent(model.fields.η), 1.0) + + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.stress) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.velocity) + KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.rheology) + + MPI.Barrier(cartesian_communicator(topo)) + + if global_rank(topo) == 0 + println("action") + end -# set!(model.fields.Pr, 0.0) -# foreach(x -> set!(x, 0.0), model.fields.τ) + ttot_ns = UInt64(0) + for iter in 1:niter + if iter == 10 + MPI.Barrier(cartesian_communicator(topo)) + ttot_ns = time_ns() + end + advance_iteration!(model, 0.0, 1.0; async=false) + if (iter % ncheck == 0) && (global_rank(topo) == 0) + println("iter/nx = $(iter/maximum(size(grid_g)))") + end + end + ttot = float(time_ns() - ttot_ns) + ttot /= (niter - 10) -fill!(parent(model.fields.Pr), 0.0) -foreach(x -> fill!(parent(x), 0.0), model.fields.τ) -foreach(x -> fill!(parent(x), 0.0), model.fields.V) -fill!(parent(other_fields.A), 1.0) -set!(model.fields.η, other_fields.A) + comm = cartesian_communicator(topo) -KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.stress) -KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.velocity) -KernelLaunch.apply_all_boundary_conditions!(arch, grid_l, model.boundary_conditions.rheology) + MPI.Barrier(comm) -if global_rank(topo) == 0 - println("action") -end + ttot = MPI.Allreduce(ttot, MPI.MIN, comm) -for it in 1:nt - advance_iteration!(model, 0.0, 1.0; async=false) - if (it % ncheck == 0) && (global_rank(topo) == 0) - println("iter/nx = $(it/maximum(size(grid_g)))") + if global_rank(topo) == 0 + Aeff = 22 * prod(size(grid_g)) / ttot + println("A_eff = $Aeff") end -end - -comm = cartesian_communicator(topo) -copyto!(Pr_v, interior(model.fields.Pr)) -copyto!(Vx_v, avx(interior(model.fields.V.x))) -copyto!(Vy_v, avy(interior(model.fields.V.y))) -copyto!(Vz_v, avz(interior(model.fields.V.z))) - -KernelAbstractions.synchronize(backend) + copyto!(Pr_v, interior(model.fields.Pr)) + copyto!(τxx_v, interior(model.fields.τ.xx)) + copyto!(τyy_v, interior(model.fields.τ.yy)) + copyto!(τzz_v, interior(model.fields.τ.zz)) + copyto!(τxy_v, av_xy(interior(model.fields.τ.xy))) + copyto!(τxz_v, av_xz(interior(model.fields.τ.xz))) + copyto!(τyz_v, av_yz(interior(model.fields.τ.yz))) + copyto!(Vx_v, avx(interior(model.fields.V.x))) + copyto!(Vy_v, avy(interior(model.fields.V.y))) + copyto!(Vz_v, avz(interior(model.fields.V.z))) + + KernelAbstractions.synchronize(backend) + + gather!(Pr_g, Pr_v, comm) + gather!(τxx_g, τxx_v, comm) + gather!(τyy_g, τyy_v, comm) + gather!(τzz_g, τzz_v, comm) + gather!(τxy_g, τxy_v, comm) + gather!(τxz_g, τxz_v, comm) + gather!(τyz_g, τyz_v, comm) + gather!(Vx_g, Vx_v, comm) + gather!(Vy_g, Vy_v, comm) + gather!(Vz_g, Vz_v, comm) + + if global_rank(topo) == 0 + open("data.bin", "w") do io + write(io, Pr_g) + write(io, τxx_g) + write(io, τyy_g) + write(io, τzz_g) + write(io, τxy_g) + write(io, τxz_g) + write(io, τyz_g) + write(io, Vx_g) + write(io, Vy_g) + write(io, Vz_g) + end + end -gather!(Pr_g, Pr_v, comm) -gather!(Vx_g, Vx_v, comm) -gather!(Vy_g, Vy_v, comm) -gather!(Vz_g, Vz_v, comm) + MPI.Finalize() -if global_rank(topo) == 0 - open("data.bin", "w") do io - write(io, Pr_g) - write(io, Vx_g) - write(io, Vy_g) - write(io, Vz_g) - end + return end -MPI.Finalize() +main() diff --git a/scripts_future_API/visme.jl b/scripts_future_API/visme.jl new file mode 100644 index 00000000..edc79328 --- /dev/null +++ b/scripts_future_API/visme.jl @@ -0,0 +1,51 @@ +using CairoMakie + +# function visme() + nx, ny, nz = 126*4, 126*2, 126*2 + + Pr = zeros(nx, ny, nz) + τxx = zeros(nx, ny, nz) + τyy = zeros(nx, ny, nz) + τzz = zeros(nx, ny, nz) + τxy = zeros(nx, ny, nz) + τxz = zeros(nx, ny, nz) + τyz = zeros(nx, ny, nz) + Vx = zeros(nx, ny, nz) + Vy = zeros(nx, ny, nz) + Vz = zeros(nx, ny, nz) + + x = LinRange(-2, 2, nx) + y = LinRange(-1, 1, ny) + z = LinRange(0, 2, nz) + + open("data.bin", "r") do io + read!(io, Pr) + read!(io, τxx) + read!(io, τyy) + read!(io, τzz) + read!(io, τxy) + read!(io, τxz) + read!(io, τyz) + read!(io, Vx) + read!(io, Vy) + read!(io, Vz) + end + + fig = Figure(resolution=(1400, 900), fontsize=32) + ax = Axis3(fig[1,1][1,1]; aspect=:data, xlabel="x", ylabel="y", zlabel="z", title=L"v_x") + limits!(-2, 2,-1, 1, 0, 2) + # ax = Axis(fig[1,1][1,1]; aspect=DataAspect(), xlabel="x", ylabel="z", title=L"v_x") + plt = volumeslices!(ax, x, y, z, Vx; colormap=:turbo) + # plt = heatmap!(ax, x, z, @view(Vx[:, ny÷2, :])) + Colorbar(fig[1,1][1,2], plt) + + plt[:update_yz][](length(x)) + plt[:update_xz][](length(y)÷2) + plt[:update_xy][](1) + + save("slices.png", fig) + +# return +# end + +# visme() diff --git a/scripts_future_API/visme_slice.jl b/scripts_future_API/visme_slice.jl new file mode 100644 index 00000000..365432e7 --- /dev/null +++ b/scripts_future_API/visme_slice.jl @@ -0,0 +1,29 @@ +using CairoMakie + +# function visme() + nx, ny, nz = 126, 126, 126 + + τxy = zeros(ny+1, nz) + τxz = zeros(nx, nz+1) + # τyz = zeros(nx, nz) + + x = LinRange(-2, 2, nx) + y = LinRange(-1, 1, ny) + z = LinRange(0, 2, nz) + + open("14.bin", "r") do io + read!(io, τxy) + read!(io, τxz) + end + + fig = Figure(resolution=(1400, 900), fontsize=32) + ax = Axis(fig[1,1][1,1]; aspect=DataAspect(), xlabel="y", ylabel="z") + plt = heatmap!(ax, y, z, τxz; colormap=:turbo) + Colorbar(fig[1,1][1,2], plt) + + save("shear_slices.png", fig) + +# return +# end + +# visme() diff --git a/src/BoundaryConditions/field_boundary_conditions.jl b/src/BoundaryConditions/field_boundary_conditions.jl index 50aa8fba..ff287060 100644 --- a/src/BoundaryConditions/field_boundary_conditions.jl +++ b/src/BoundaryConditions/field_boundary_conditions.jl @@ -12,9 +12,9 @@ function apply_boundary_conditions!(::Val{S}, ::Val{D}, -halos[ifield][RD][1] end |> CartesianIndex end - worksize = maximum(sizes) - _apply_boundary_conditions!(backend(arch), 256, worksize)(Val(S), Val(D), grid, sizes, offsets, fields, conditions) - async || KernelAbstractions.synchronize(backend(arch)) + worksize = max.(sizes...) + _apply_boundary_conditions!(backend(arch), 256)(Val(S), Val(D), grid, sizes, offsets, fields, conditions; ndrange=worksize) + async || synchronize(backend(arch)) return end @@ -23,7 +23,7 @@ end # ntuple here unrolls the loop over fields ntuple(Val(length(fields))) do ifield Base.@_inline_meta - if _checkindices(sizes[ifield], I) + @inbounds if _checkindices(sizes[ifield], I) Ibc = I + offsets[ifield] field = fields[ifield] condition = conditions[ifield] diff --git a/src/BoundaryConditions/hide_boundaries.jl b/src/BoundaryConditions/hide_boundaries.jl index 5c0deb43..f4982bd6 100644 --- a/src/BoundaryConditions/hide_boundaries.jl +++ b/src/BoundaryConditions/hide_boundaries.jl @@ -23,7 +23,7 @@ function hide(fun::F, hb::HideBoundaries{N}, arch::Architecture, grid::Cartesian put!(pipe) do fun(range) apply_boundary_conditions!(Val(side), Val(dim), arch, grid, batch) - KernelAbstractions.synchronize(backend(arch)) + synchronize(backend(arch)) end end wait.(hb.pipelines[dim]) # synchronize spatial dimension diff --git a/src/KernelLaunch.jl b/src/KernelLaunch.jl index cafa0343..1e636bd2 100644 --- a/src/KernelLaunch.jl +++ b/src/KernelLaunch.jl @@ -4,6 +4,8 @@ using FastIce.Grids using FastIce.Architectures using FastIce.BoundaryConditions +using KernelAbstractions + export launch! function launch!(arch::Architecture, grid::CartesianGrid, kernel::Pair{K,Args}; diff --git a/src/Models/full_stokes/isothermal/isothermal.jl b/src/Models/full_stokes/isothermal/isothermal.jl index db3acf66..8f91a1fb 100644 --- a/src/Models/full_stokes/isothermal/isothermal.jl +++ b/src/Models/full_stokes/isothermal/isothermal.jl @@ -103,9 +103,9 @@ function advance_iteration!(model::IsothermalFullStokesModel, t, Δt; async=true Δ = NamedTuple{(:x, :y, :z)}(spacing(model.grid)) launch!(model.arch, model.grid, update_σ! => (Pr, τ, V, η, Δτ, Δ); - location=Vertex(), expand=1, boundary_conditions=model.boundary_conditions.stress, hide_boundaries, outer_width) + location=Center(), expand=1, boundary_conditions=model.boundary_conditions.stress, hide_boundaries, outer_width) - launch!(model.arch, model.grid, update_V! => (V, Pr, τ, η, Δτ, ρg, Δ); + launch!(model.arch, model.grid, update_V! => (V, Pr, τ, η, Δτ, ρg, model.grid, Δ); location=Vertex(), boundary_conditions=model.boundary_conditions.velocity, hide_boundaries, outer_width) # rheology diff --git a/src/Physics.jl b/src/Physics.jl index cb061497..af7831bd 100644 --- a/src/Physics.jl +++ b/src/Physics.jl @@ -28,10 +28,10 @@ end default(::Type{GlensLawRheology{I}}) where {I} = GlensLawRheology(convert(I, 3)) -@inline function (rh::GlensLawRheology{T})(grid, I, fields) where {T} +Base.@propagate_inbounds function (rh::GlensLawRheology{T})(grid, I, fields) where {T} (; τ, A) = fields - @inbounds τII = sqrt(0.5 * (τ.xx[I]^2 + τ.yy[I]^2 + τ.zz[I]^2) + avᶜxy(τ.xy, I)^2 + avᶜxz(τ.xz, I)^2 + avᶜyz(τ.yz, I)^2) - return 0.5 / (A[I] * τII^(rh.exponent - 1)) + τII = sqrt(0.5 * (τ.xx[I]^2 + τ.yy[I]^2 + τ.zz[I]^2) + avᶜxy(τ.xy, I)^2 + avᶜxz(τ.xz, I)^2 + avᶜyz(τ.yz, I)^2) + 0.5 / (A[I] * τII^(rh.exponent - 1)) end end