Skip to content

Commit

Permalink
Improving performance of AMR with p4est (#638)
Browse files Browse the repository at this point in the history
* speed up calc_jacobian_matrix!

* speed-up repeated use of polynomial_interpolation_matrix!

* speed-up calc_inverse_jacobian! for curvilinear meshes in 2D

* found another bug in LoopVectorization.jl

* bug in LoopVectorization.jl is fixed

* use unsafe_load instead of unsafe_wrap in iter functions

* use tuple instead of vector for small size

* speed-up calc_contravariant_vectors!

* enable use of SVectors for the nodes of the P4estMesh

* further improvement of init_interface!

* introduce InitSurfacesIterFaceUserData

* introduce count_required_surfaces

* remove count_required_* and use only count_required_surfaces

* fuse mesh iterations when re-initializing P4estMesh containers

* add link to blog post to the docs

* fix indices in calc_jacobian_matrix!

* increase test tolerances for free stream curved tests slightly

* apply suggestion from code review

* simplify equivalent code in comments

* apply suggestion from code review

* use nodes::SVector for the P4estMesh

* fix comments as suggested [skip ci]
  • Loading branch information
ranocha authored Jun 12, 2021
1 parent e0c973d commit c486d37
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 201 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ EllipsisNotation = "1.0"
ForwardDiff = "0.10.18"
HDF5 = "0.14, 0.15"
LinearMaps = "2.7, 3.0"
LoopVectorization = "0.12.22"
LoopVectorization = "0.12.35"
MPI = "0.16, 0.17, 0.18"
OffsetArrays = "1.3"
P4est = "0.2.1"
Expand Down
7 changes: 6 additions & 1 deletion docs/src/performance.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ the memory/allocs are (roughly) the same, there doesn't seem to be a significant

You can also make it more detailed by benchmarking only, e.g., the calculation of the volume terms, but whether that's necessary depends on the modifications you made and their (potential) impact.

Some more detailed description of manual profiling and benchmarking as well as
resulting performance improvements of Trixi are given in the following blog post.
- [Improving performance of AMR with p4est](https://ranocha.de/blog/Optimizing_p4est_AMR/),
cf. [#638](https://github.com/trixi-framework/Trixi.jl/pull/638)


## Automated benchmarking

Expand Down Expand Up @@ -134,7 +139,7 @@ julia> include("benchmark/run_benchmarks.jl")
```
Then, markdown files including the results are saved in `benchmark/`.
[This example result](https://gist.github.com/ranocha/bf98d19e288e759d3a36ca0643448efb)
was obtained using a GitHub action for the
was obtained using a GitHub action for the
[PR #535](https://github.com/trixi-framework/Trixi.jl/pull/535).
Note that GitHub actions run on in the cloud in a virtual machine. Hence, we do not really
have control over it and performance results must be taken with a grain of salt.
Expand Down
3 changes: 2 additions & 1 deletion src/Trixi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ using CodeTracking: code_string
import ForwardDiff
using HDF5: h5open, attributes
using LinearMaps: LinearMap
using LoopVectorization: LoopVectorization, @turbo
using LoopVectorization: LoopVectorization, @turbo, indices
using LoopVectorization.ArrayInterface: static_length
import MPI
using Polyester: @batch # You know, the cheapest threads you can find...
using OffsetArrays: OffsetArray, OffsetVector
Expand Down
4 changes: 3 additions & 1 deletion src/mesh/mesh_io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ function save_mesh_file(mesh::P4estMesh, output_directory, timestep=0)
attributes(file)["p4est_file"] = p4est_filename

file["tree_node_coordinates"] = mesh.tree_node_coordinates
file["nodes"] = mesh.nodes
file["nodes"] = Vector(mesh.nodes) # the mesh uses `SVector`s for the nodes
# to increase the runtime performance
# but HDF5 can only handle plain arrays
file["boundary_names"] = mesh.boundary_names .|> String
end

Expand Down
7 changes: 4 additions & 3 deletions src/mesh/p4est_mesh.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,20 @@ to manage trees and mesh refinement.
!!! warning "Experimental code"
This mesh type is experimental and can change any time.
"""
mutable struct P4estMesh{NDIMS, RealT<:Real, NDIMSP2} <: AbstractMesh{NDIMS}
mutable struct P4estMesh{NDIMS, RealT<:Real, NDIMSP2, NNODES} <: AbstractMesh{NDIMS}
p4est ::Ptr{p4est_t}
# Coordinates at the nodes specified by the tensor product of `nodes` (NDIMS times).
# This specifies the geometry interpolation for each tree.
tree_node_coordinates ::Array{RealT, NDIMSP2} # [dimension, i, j, k, tree]
nodes ::Vector{RealT}
nodes ::SVector{NNODES, RealT}
boundary_names ::Array{Symbol, 2} # [face direction, tree]
current_filename ::String
unsaved_changes ::Bool

function P4estMesh{NDIMS}(p4est, tree_node_coordinates, nodes, boundary_names,
current_filename, unsaved_changes) where NDIMS
mesh = new{NDIMS, eltype(tree_node_coordinates), NDIMS+2}(p4est, tree_node_coordinates,
mesh = new{NDIMS, eltype(tree_node_coordinates), NDIMS+2, length(nodes)}(
p4est, tree_node_coordinates,
nodes, boundary_names, current_filename, unsaved_changes)

# Destroy p4est structs when the mesh is garbage collected
Expand Down
69 changes: 57 additions & 12 deletions src/solvers/dg_curved/containers_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,37 @@ end

# Calculate Jacobian matrix of the mapping from the reference element to the element in the physical domain
function calc_jacobian_matrix!(jacobian_matrix, element, node_coordinates::AbstractArray{<:Any, 4}, basis::LobattoLegendreBasis)
@views mul!(jacobian_matrix[1, 1, :, :, element], basis.derivative_matrix, node_coordinates[1, :, :, element]) # x_ξ
@views mul!(jacobian_matrix[2, 1, :, :, element], basis.derivative_matrix, node_coordinates[2, :, :, element]) # y_ξ
@views mul!(jacobian_matrix[1, 2, :, :, element], node_coordinates[1, :, :, element], basis.derivative_matrix') # x_η
@views mul!(jacobian_matrix[2, 2, :, :, element], node_coordinates[2, :, :, element], basis.derivative_matrix') # y_η
@unpack derivative_matrix = basis

# The code below is equivalent to the following matrix multiplications, which
# seem to end up calling generic linear algebra code from Julia. Thus, the
# optimized code below using `@turbo` is much faster.
# jacobian_matrix[1, 1, :, :, element] = derivative_matrix * node_coordinates[1, :, :, element] # x_ξ
# jacobian_matrix[2, 1, :, :, element] = derivative_matrix * node_coordinates[2, :, :, element] # y_ξ
# jacobian_matrix[1, 2, :, :, element] = node_coordinates[1, :, :, element] * derivative_matrix' # x_η
# jacobian_matrix[2, 2, :, :, element] = node_coordinates[2, :, :, element] * derivative_matrix' # y_η

# x_ξ, y_ξ
@turbo for xy in indices((jacobian_matrix, node_coordinates), (1, 1))
for j in indices((jacobian_matrix, node_coordinates), (4, 3)), i in indices((jacobian_matrix, derivative_matrix), (3, 1))
result = zero(eltype(jacobian_matrix))
for ii in indices((node_coordinates, derivative_matrix), (2, 2))
result += derivative_matrix[i, ii] * node_coordinates[xy, ii, j, element]
end
jacobian_matrix[xy, 1, i, j, element] = result
end
end

# x_η, y_η
@turbo for xy in indices((jacobian_matrix, node_coordinates), (1, 1))
for j in indices((jacobian_matrix, derivative_matrix), (4, 1)), i in indices((jacobian_matrix, node_coordinates), (3, 2))
result = zero(eltype(jacobian_matrix))
for jj in indices((node_coordinates, derivative_matrix), (3, 2))
result += derivative_matrix[j, jj] * node_coordinates[xy, i, jj, element]
end
jacobian_matrix[xy, 2, i, j, element] = result
end
end

return jacobian_matrix
end
Expand All @@ -62,21 +89,39 @@ end
# Calculate contravarant vectors, multiplied by the Jacobian determinant J of the transformation mapping.
# Those are called Ja^i in Kopriva's blue book.
function calc_contravariant_vectors!(contravariant_vectors::AbstractArray{<:Any,5}, element, jacobian_matrix)
# First contravariant vector Ja^1
@. @views contravariant_vectors[1, 1, :, :, element] = jacobian_matrix[2, 2, :, :, element]
@. @views contravariant_vectors[2, 1, :, :, element] = -jacobian_matrix[1, 2, :, :, element]
# Second contravariant vector Ja^2
@. @views contravariant_vectors[1, 2, :, :, element] = -jacobian_matrix[2, 1, :, :, element]
@. @views contravariant_vectors[2, 2, :, :, element] = jacobian_matrix[1, 1, :, :, element]
# The code below is equivalent to the following using broadcasting but much faster.
# # First contravariant vector Ja^1
# contravariant_vectors[1, 1, :, :, element] = jacobian_matrix[2, 2, :, :, element]
# contravariant_vectors[2, 1, :, :, element] = -jacobian_matrix[1, 2, :, :, element]
# # Second contravariant vector Ja^2
# contravariant_vectors[1, 2, :, :, element] = -jacobian_matrix[2, 1, :, :, element]
# contravariant_vectors[2, 2, :, :, element] = jacobian_matrix[1, 1, :, :, element]

@turbo for j in indices((contravariant_vectors, jacobian_matrix), (4, 4)),
i in indices((contravariant_vectors, jacobian_matrix), (3, 3))
# First contravariant vector Ja^1
contravariant_vectors[1, 1, i, j, element] = jacobian_matrix[2, 2, i, j, element]
contravariant_vectors[2, 1, i, j, element] = -jacobian_matrix[1, 2, i, j, element]

# Second contravariant vector Ja^2
contravariant_vectors[1, 2, i, j, element] = -jacobian_matrix[2, 1, i, j, element]
contravariant_vectors[2, 2, i, j, element] = jacobian_matrix[1, 1, i, j, element]
end

return contravariant_vectors
end


# Calculate inverse Jacobian (determinant of Jacobian matrix of the mapping) in each node
function calc_inverse_jacobian!(inverse_jacobian::AbstractArray{<:Any,3}, element, jacobian_matrix)
@. @views inverse_jacobian[:, :, element] = inv(jacobian_matrix[1, 1, :, :, element] * jacobian_matrix[2, 2, :, :, element] -
jacobian_matrix[1, 2, :, :, element] * jacobian_matrix[2, 1, :, :, element])
# The code below is equivalent to the following high-level code but much faster.
# inverse_jacobian[i, j, element] = inv(det(jacobian_matrix[:, :, i, j, element])

@turbo for j in indices((inverse_jacobian, jacobian_matrix), (2, 4)),
i in indices((inverse_jacobian, jacobian_matrix), (1, 3))
inverse_jacobian[i, j, element] = inv(jacobian_matrix[1, 1, i, j, element] * jacobian_matrix[2, 2, i, j, element] -
jacobian_matrix[1, 2, i, j, element] * jacobian_matrix[2, 1, i, j, element])
end

return inverse_jacobian
end
Expand Down
122 changes: 51 additions & 71 deletions src/solvers/dg_p4est/containers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ function init_interfaces(mesh::P4estMesh, equations, basis, elements)
uEltype = eltype(elements)

# Initialize container
n_interfaces = count_required_interfaces(mesh)
n_interfaces = count_required_surfaces(mesh).interfaces

_u = Vector{uEltype}(undef, 2 * nvariables(equations) * nnodes(basis)^(NDIMS-1) * n_interfaces)
u = unsafe_wrap(Array, pointer(_u),
Expand Down Expand Up @@ -203,7 +203,7 @@ function init_boundaries(mesh::P4estMesh, equations, basis, elements)
uEltype = eltype(elements)

# Initialize container
n_boundaries = count_required_boundaries(mesh)
n_boundaries = count_required_surfaces(mesh).boundaries

_u = Vector{uEltype}(undef, nvariables(equations) * nnodes(basis)^(NDIMS-1) * n_boundaries)
u = unsafe_wrap(Array, pointer(_u),
Expand Down Expand Up @@ -290,7 +290,7 @@ function init_mortars(mesh::P4estMesh, equations, basis, elements)
uEltype = eltype(elements)

# Initialize container
n_mortars = count_required_mortars(mesh)
n_mortars = count_required_surfaces(mesh).mortars

_u = Vector{uEltype}(undef,
2 * nvariables(equations) * 2^(NDIMS-1) * nnodes(basis)^(NDIMS-1) * n_mortars)
Expand Down Expand Up @@ -318,110 +318,90 @@ function reinitialize_containers!(mesh::P4estMesh, equations, dg::DGSEM, cache)
resize!(elements, ncells(mesh))
init_elements!(elements, mesh, dg.basis)

# re-initialize interfaces container
required = count_required_surfaces(mesh)

# resize interfaces container
@unpack interfaces = cache
resize!(interfaces, count_required_interfaces(mesh))
init_interfaces!(interfaces, mesh)
resize!(interfaces, required.interfaces)

# re-initialize boundaries container
# resize boundaries container
@unpack boundaries = cache
resize!(boundaries, count_required_boundaries(mesh))
init_boundaries!(boundaries, mesh)
resize!(boundaries, required.boundaries)

# re-initialize mortars container
# resize mortars container
@unpack mortars = cache
resize!(mortars, count_required_mortars(mesh))
init_mortars!(mortars, mesh)
resize!(mortars, required.mortars)

# re-initialize containers together to reduce
# the number of iterations over the mesh in p4est
init_surfaces!(interfaces, mortars, boundaries, mesh)
end


# Iterate over all interfaces and count inner interfaces
function count_interfaces_iter_face(info, user_data)
# Iterate over all interfaces and count
# - (inner) interfaces
# - mortars
# - boundaries
# and collect the numbers in `user_data` in this order.
function count_surfaces_iter_face(info, user_data)
if info.sides.elem_count == 2
# Extract interface data
# Two neighboring elements => Interface or mortar

# Extract surface data
sides = (unsafe_load_sc(p4est_iter_face_side_t, info.sides, 1),
unsafe_load_sc(p4est_iter_face_side_t, info.sides, 2))

if sides[1].is_hanging == 0 && sides[2].is_hanging == 0
# No hanging nodes => normal interface
# Unpack user_data = [interface_count] and increment interface_count
ptr = Ptr{Int}(user_data)
id = unsafe_load(ptr)
unsafe_store!(ptr, id + 1)
end
end

return nothing
end

# Iterate over all interfaces and count boundaries
function count_boundaries_iter_face(info, user_data)
if info.sides.elem_count == 1
# Unpack user_data = [boundary_count] and increment boundary_count
ptr = Ptr{Int}(user_data)
id = unsafe_load(ptr)
unsafe_store!(ptr, id + 1)
end

return nothing
end

# Iterate over all interfaces and count mortars
function count_mortars_iter_face(info, user_data)
if info.sides.elem_count == 2
# Extract interface data
sides = (unsafe_load_sc(p4est_iter_face_side_t, info.sides, 1),
unsafe_load_sc(p4est_iter_face_side_t, info.sides, 2))

if sides[1].is_hanging != 0 || sides[2].is_hanging != 0
id = unsafe_load(ptr, 1)
unsafe_store!(ptr, id + 1, 1)
else
# Hanging nodes => mortar
# Unpack user_data = [mortar_count] and increment mortar_count
ptr = Ptr{Int}(user_data)
id = unsafe_load(ptr)
unsafe_store!(ptr, id + 1)
id = unsafe_load(ptr, 2)
unsafe_store!(ptr, id + 1, 2)
end
elseif info.sides.elem_count == 1
# One neighboring elements => boundary

# Unpack user_data = [boundary_count] and increment boundary_count
ptr = Ptr{Int}(user_data)
id = unsafe_load(ptr, 3)
unsafe_store!(ptr, id + 1, 3)
end

return nothing
end

function count_required_interfaces(mesh::P4estMesh)
# Let p4est iterate over all interfaces and call count_interfaces_iter_face
iter_face_c = @cfunction(count_interfaces_iter_face, Cvoid, (Ptr{p4est_iter_face_info_t}, Ptr{Cvoid}))
function count_required_surfaces(mesh::P4estMesh)
# Let p4est iterate over all interfaces and call count_surfaces_iter_face
iter_face_c = @cfunction(count_surfaces_iter_face, Cvoid, (Ptr{p4est_iter_face_info_t}, Ptr{Cvoid}))

return count_required(mesh, iter_face_c)
end

function count_required_boundaries(mesh::P4estMesh)
# Let p4est iterate over all interfaces and call count_boundaries_iter_face
iter_face_c = @cfunction(count_boundaries_iter_face, Cvoid, (Ptr{p4est_iter_face_info_t}, Ptr{Cvoid}))

return count_required(mesh, iter_face_c)
end

function count_required_mortars(mesh::P4estMesh)
# Let p4est iterate over all interfaces and call count_mortars_iter_face
iter_face_c = @cfunction(count_mortars_iter_face, Cvoid, (Ptr{p4est_iter_face_info_t}, Ptr{Cvoid}))

return count_required(mesh, iter_face_c)
end

function count_required(mesh::P4estMesh, iter_face_c)
# Counter
user_data = [0]
# interfaces, mortars, boundaries
user_data = [0, 0, 0]

iterate_faces(mesh, iter_face_c, user_data)

# Return counter
return user_data[1]
# Return counters
return (interfaces = user_data[1],
mortars = user_data[2],
boundaries = user_data[3])
end

# Let p4est iterate over all interfaces and execute the C function iter_face_c
function iterate_faces(mesh::P4estMesh, iter_face_c, user_data)
if user_data isa AbstractArray
user_data_ptr = pointer(user_data)
else
user_data_ptr = pointer_from_objref(user_data)
end
GC.@preserve user_data begin
p4est_iterate(mesh.p4est,
C_NULL, # ghost layer
pointer(user_data),
user_data_ptr,
C_NULL, # iter_volume
iter_face_c, # iter_face
C_NULL) # iter_corner
Expand Down
Loading

0 comments on commit c486d37

Please sign in to comment.