Merge branch 'main' into padding
omlins authored Oct 31, 2024
2 parents 8a9f438 + 0390670 commit 1a66d40
Showing 35 changed files with 763 additions and 239 deletions.
7 changes: 5 additions & 2 deletions Project.toml
@@ -13,19 +13,22 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"

[extensions]
ParallelStencil_AMDGPUExt = "AMDGPU"
ParallelStencil_CUDAExt = "CUDA"
ParallelStencil_EnzymeExt = "Enzyme"
+ParallelStencil_MetalExt = "Metal"

[compat]
AMDGPU = "0.6, 0.7, 0.8, 0.9, 1"
CUDA = "3.12, 4, 5"
CellArrays = "0.2.1"
CellArrays = "0.3"
Enzyme = "0.11, 0.12, 0.13"
MacroTools = "0.5"
Metal = "1.2"
Polyester = "0.7"
StaticArrays = "1"
julia = "1.10" # Minimum version supporting Data module creation
@@ -35,4 +38,4 @@ TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "TOML", "AMDGPU", "CUDA", "Enzyme", "Polyester"]
test = ["Test", "TOML", "AMDGPU", "CUDA", "Metal", "Enzyme", "Polyester"]
7 changes: 4 additions & 3 deletions README.md
@@ -7,7 +7,7 @@ ParallelStencil empowers domain scientists to write architecture-agnostic high-l

<a id="fig_teff">![Performance ParallelStencil Teff](docs/images/perf_ps2.png)</a>

-ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl] and [AMDGPU.jl] and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]:
+ParallelStencil relies on the native kernel programming capabilities of [CUDA.jl], [AMDGPU.jl], and [Metal.jl], and on [Base.Threads] for high-performance computations on GPUs and CPUs, respectively. It is seamlessly interoperable with [ImplicitGlobalGrid.jl], which renders the distributed parallelization of stencil-based GPU and CPU applications on a regular staggered grid almost trivial and enables close to ideal weak scaling of real-world applications on thousands of GPUs \[[1][JuliaCon20a], [2][JuliaCon20b], [3][JuliaCon19], [4][PASC19]\]. Moreover, ParallelStencil enables hiding communication behind computation with a simple macro call and without any particular restrictions on the package used for communication. ParallelStencil has been designed in conjunction with [ImplicitGlobalGrid.jl] for simplest possible usage by domain-scientists, rendering fast and interactive development of massively scalable high performance multi-GPU applications readily accessible to them. Furthermore, we have developed a self-contained approach for "Solving Nonlinear Multi-Physics on GPU Supercomputers with Julia" relying on ParallelStencil and [ImplicitGlobalGrid.jl] \[[1][JuliaCon20a]\]. ParallelStencil's feature to hide communication behind computation was showcased when a close to ideal weak scaling was demonstrated for a 3-D poro-hydro-mechanical real-world application on up to 1024 GPUs on the Piz Daint Supercomputer \[[1][JuliaCon20a]\]:

![Parallel efficiency of ParallelStencil with CUDA C backend](docs/images/par_eff_c_julia2.png)

@@ -33,7 +33,7 @@ Beyond traditional high-performance computing, ParallelStencil supports automati
* [References](#references)

## Parallelization and optimization with one macro call
-A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl] and [AMDGPU.jl] for running on GPU and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`):
+A simple call to `@parallel` is enough to parallelize and optimize a function and to launch it. The package used underneath for parallelization is defined in a call to `@init_parallel_stencil` beforehand. Supported are [CUDA.jl], [AMDGPU.jl], and [Metal.jl] for running on GPU, and [Base.Threads] for CPU. The following example outlines how to run parallel computations on a GPU using the native kernel programming capabilities of [CUDA.jl] underneath (omitted lines are represented with `#(...)`, omitted arguments with `...`):
```julia
#(...)
@init_parallel_stencil(CUDA,...)
#(...)
```
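The remainder of the CUDA example is collapsed in this diff view. For orientation, here is a hedged, self-contained sketch of the same pattern with the newly added Metal backend (kernel name, grid size, and coefficients are illustrative; `Float32` is used since Metal.jl does not support `Float64`):

```julia
# Hedged sketch (not part of this commit): 3-D heat diffusion on an Apple GPU.
# Requires macOS with a Metal-capable GPU.
using ParallelStencil
using ParallelStencil.FiniteDifferences3D
@init_parallel_stencil(Metal, Float32, 3)

@parallel function diffusion3D_step!(T2, T, Ci, lam, dt, dx, dy, dz)
    @inn(T2) = @inn(T) + dt*(lam*@inn(Ci)*(@d2_xi(T)/dx^2 + @d2_yi(T)/dy^2 + @d2_zi(T)/dz^2))
    return
end

T  = @rand(64, 64, 64)   # Data.Array, here backed by Metal.MtlArray{Float32, 3}
T2 = copy(T)
Ci = @ones(64, 64, 64)
@parallel diffusion3D_step!(T2, T, Ci, 1.0f0, 0.01f0, 0.1f0, 0.1f0, 0.1f0)
```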
@@ -519,7 +519,7 @@ julia>]

## Questions, comments and discussions
-To discuss technical issues, please post on Julia Discourse in the [GPU topic] or the [Julia at Scale topic] or in the `#gpu` or `#distributed` channels on the [Julia Slack] (to join, visit https://julialang.org/slack/).
+To discuss technical issues, please post on Julia Discourse in the [GPU topic] or the [Julia at Scale topic] or in the `#gpu` or `#hpc` channels on the [Julia Slack] (to join, visit https://julialang.org/slack/).
To discuss numerical/domain-science issues, please post on Julia Discourse in the [Numerics topic] or the [Modelling & Simulations topic] or whichever other topic fits best your issue.

## Your contributions
@@ -554,6 +554,7 @@ Please open an issue to discuss your idea for a contribution beforehand. Further
[CellArrays.jl]: https://github.com/omlins/CellArrays.jl
[CUDA.jl]: https://github.com/JuliaGPU/CUDA.jl
[AMDGPU.jl]: https://github.com/JuliaGPU/AMDGPU.jl
+[Metal.jl]: https://github.com/JuliaGPU/Metal.jl
[Enzyme.jl]: https://github.com/EnzymeAD/Enzyme.jl
[MacroTools.jl]: https://github.com/FluxML/MacroTools.jl
[StaticArrays.jl]: https://github.com/JuliaArrays/StaticArrays.jl
4 changes: 4 additions & 0 deletions ext/ParallelStencil_MetalExt.jl
@@ -0,0 +1,4 @@
module ParallelStencil_MetalExt
include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "shared.jl"))
include(joinpath(@__DIR__, "..", "src", "ParallelKernel", "MetalExt", "allocators.jl"))
end
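Like the existing CUDA and AMDGPU extensions, this module is loaded lazily through Julia's package-extension mechanism (declared under `[extensions]` in Project.toml above). A hedged sketch of the activation:

```julia
using ParallelStencil   # loads the core package only
using Metal             # loading the weak dependency triggers ParallelStencil_MetalExt
```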
88 changes: 84 additions & 4 deletions src/ParallelKernel/Data.jl
@@ -31,12 +31,12 @@ The type of indices used in parallel kernels.
--------------------------------------------------------------------------------
Data.Array{ndims}
-Expands to `Data.Array{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.Array` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA and AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required).
+Expands to `Data.Array{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.Array` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA, AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU and Metal.MtlArray or Metal.MtlDeviceArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required).
--------------------------------------------------------------------------------
Data.CellArray{ndims}
-Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA and ROCCellArray or ROCDeviceCellArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required).
+Expands to `Data.CellArray{numbertype, ndims}`, where `numbertype` is the datatype selected with [`@init_parallel_kernel`](@ref) and the datatype `Data.CellArray` is chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MtlCellArray or MtlDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray when required).
--------------------------------------------------------------------------------
Data.Cell{S}
@@ -143,12 +143,12 @@ The type of indices used in parallel kernels.
--------------------------------------------------------------------------------
Data.Array{numbertype, ndims}
-The datatype `Data.Array` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA and AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray and AMDGPU.ROCArray automatically to CUDA.CuDeviceArray and AMDGPU.ROCDeviceArray in kernels when required).
+The datatype `Data.Array` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (Array for Threads or Polyester, CUDA.CuArray or CUDA.CuDeviceArray for CUDA, AMDGPU.ROCArray or AMDGPU.ROCDeviceArray for AMDGPU and Metal.MtlArray or Metal.MtlDeviceArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CUDA.CuArray, AMDGPU.ROCArray and Metal.MtlArray automatically to CUDA.CuDeviceArray, AMDGPU.ROCDeviceArray and Metal.MtlDeviceArray in kernels when required).
--------------------------------------------------------------------------------
Data.CellArray{numbertype, ndims}
-The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA and ROCCellArray or ROCDeviceCellArray for AMDGPU; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required).
+The datatype `Data.CellArray` is automatically chosen to be compatible with the package for parallelization selected with [`@init_parallel_kernel`](@ref) (CPUCellArray for Threads or Polyester, CuCellArray or CuDeviceCellArray for CUDA, ROCCellArray or ROCDeviceCellArray for AMDGPU and MtlCellArray or MtlDeviceCellArray for Metal; [`@parallel`](@ref) and [`@parallel_indices`](@ref) convert CellArray automatically to DeviceCellArray in kernels when required).
--------------------------------------------------------------------------------
Data.Cell{numbertype, S}
@@ -422,6 +422,86 @@ function TData_Device_amdgpu()
end)
end

# Metal

function Data_metal(numbertype::DataType, indextype::DataType)
Data_module = if (numbertype == NUMBERTYPE_NONE)
:(baremodule $MODULENAME_DATA # NOTE: there cannot be any newline before 'module Data' or it will create a begin end block and the module creation will fail.
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const MtlCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}}
const Index = $indextype
const Array{T, N} = Metal.MtlArray{T, N}
const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
const CellArray{T_elem, N, B} = MtlCellArray{<:Cell{T_elem},N,B,T_elem}
$(Data_xpu_exprs(numbertype))
$(Data_Device_metal(numbertype, indextype))
$(Data_Fields(numbertype, indextype))
end)
else
:(baremodule $MODULENAME_DATA
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const MtlCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}}
const Index = $indextype
const Number = $numbertype
const Array{N} = Metal.MtlArray{$numbertype, N}
const Cell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}}
const CellArray{N, B} = MtlCellArray{<:Cell,N,B,$numbertype}
$(Data_xpu_exprs(numbertype))
$(Data_Device_metal(numbertype, indextype))
$(Data_Fields(numbertype, indextype))
end)
end
return prewalk(rmlines, flatten(Data_module))
end

function TData_metal()
TData_module = :(
baremodule $MODULENAME_TDATA
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const MtlCellArray{T,N,B,T_elem} = CellArrays.CellArray{T,N,B,Metal.MtlArray{T_elem,CellArrays._N}}
const Array{T, N} = Metal.MtlArray{T, N}
const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
const CellArray{T_elem, N, B} = MtlCellArray{<:Cell{T_elem},N,B,T_elem}
$(TData_xpu_exprs())
$(TData_Device_metal())
$(TData_Fields())
end
)
return prewalk(rmlines, flatten(TData_module))
end

function Data_Device_metal(numbertype::DataType, indextype::DataType)
Device_module = if (numbertype == NUMBERTYPE_NONE)
:(baremodule $MODULENAME_DEVICE
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const Index = $indextype
const Array{T, N} = Metal.MtlDeviceArray{T, N}
const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
const CellArray{T_elem, N, B} = CellArrays.CellArray{<:Cell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}}
$(Data_xpu_exprs(numbertype))
end)
else
:(baremodule $MODULENAME_DEVICE
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const Index = $indextype
const Array{N} = Metal.MtlDeviceArray{$numbertype, N}
const Cell{S} = Union{StaticArrays.SArray{S, $numbertype}, StaticArrays.FieldArray{S, $numbertype}}
const CellArray{N, B} = CellArrays.CellArray{<:Cell,N,B,<:Metal.MtlDeviceArray{$numbertype,CellArrays._N}}
$(Data_xpu_exprs(numbertype))
end)
end
return Device_module
end

function TData_Device_metal()
:(baremodule $MODULENAME_DEVICE
import Base, Metal, ParallelStencil.ParallelKernel.CellArrays, ParallelStencil.ParallelKernel.StaticArrays
const Array{T, N} = Metal.MtlDeviceArray{T, N}
const Cell{T, S} = Union{StaticArrays.SArray{S, T}, StaticArrays.FieldArray{S, T}}
const CellArray{T_elem, N, B} = CellArrays.CellArray{<:Cell{T_elem},N,B,<:Metal.MtlDeviceArray{T_elem,CellArrays._N}}
$(TData_xpu_exprs())
end)
end

# CPU

29 changes: 29 additions & 0 deletions src/ParallelKernel/MetalExt/allocators.jl
@@ -0,0 +1,29 @@
## RUNTIME ALLOCATOR FUNCTIONS

ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.zeros(T, args...)) # (blocklength is ignored if neither celldims nor celltype is set)
ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Number} = (check_datatype_metal(T); Metal.ones(T, args...))
ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = MtlArray(rand_cpu(T, blocklength, args...))
ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.falses(args...)
ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Bool} = Metal.trues(args...)
ParallelStencil.ParallelKernel.fill_metal(::Type{T}, blocklength, args...) where {T<:Union{Number,Enum}} = MtlArray(fill_cpu(T, blocklength, args...))

ParallelStencil.ParallelKernel.zeros_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 0, args...))
ParallelStencil.ParallelKernel.ones_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = (check_datatype_metal(T); fill_metal(T, blocklength, 1, args...))
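# (Editorial note on the cell-type `rand_metal` below: random data is generated directly on the device;
#  `B == 0` selects a single block spanning all cells, and the backing `Metal.MtlArray` is 3-D with size
#  (blocklength, number of cell components, number of blocks), as the CellArray constructor expects.)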
ParallelStencil.ParallelKernel.rand_metal(::Type{T}, ::Val{B}, dims) where {T<:Union{SArray,FieldArray}, B} = (check_datatype_metal(T, Bool, Enum); blocklen = (B == 0) ? prod(dims) : B; CellArray{T,length(dims),B, Metal.MtlArray{eltype(T),3}}(Metal.rand(eltype(T), blocklen, prod(size(T)), ceil(Int,prod(dims)/(blocklen))), dims))
ParallelStencil.ParallelKernel.rand_metal(::Type{T}, blocklength, dims...) where {T<:Union{SArray,FieldArray}} = rand_metal(T, blocklength, dims)
ParallelStencil.ParallelKernel.falses_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, false, args...)
ParallelStencil.ParallelKernel.trues_metal(::Type{T}, blocklength, args...) where {T<:Union{SArray,FieldArray}} = fill_metal(T, blocklength, true, args...)

function ParallelStencil.ParallelKernel.fill_metal(::Type{T}, ::Val{B}, x, args...) where {T <: Union{SArray,FieldArray}, B}
if (!(eltype(x) <: Number) || (eltype(x) == Bool)) && (eltype(x) != eltype(T)) @ArgumentError("fill: the (element) type of argument 'x' is not a normal number type ($(eltype(x))), but does not match the obtained (default) 'eltype' ($(eltype(T))); automatic conversion to $(eltype(T)) is therefore not attempted. Set the keyword argument 'eltype' accordingly to the element type of 'x' or pass an 'x' of a different (element) type.") end
check_datatype_metal(T, Bool, Enum)
if (length(x) == 1) cell = convert(T, fill(convert(eltype(T), x), size(T)))
elseif (length(x) == length(T)) cell = convert(T, x)
else @ArgumentError("fill: argument 'x' contains the wrong number of elements ($(length(x))). It must be a scalar or contain the number of elements defined by 'celldims'.")
end
return CellArrays.fill!(MtlCellArray{T,B}(undef, args...), cell)
end

ParallelStencil.ParallelKernel.fill_metal!(A, x) = Metal.fill!(A, construct_cell(A, x))

check_datatype_metal(args...) = check_datatype(args..., INT_METAL)
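A hedged usage sketch of how these allocators are reached (the macro API is ParallelStencil's documented one; sizes and `celldims` are illustrative):

```julia
using ParallelStencil
@init_parallel_stencil(Metal, Float32, 2)

A = @zeros(8, 8)                    # zeros_metal -> Metal.zeros(Float32, 8, 8)
B = @rand(8, 8)                     # rand_metal: generated on the CPU, then copied to an MtlArray
C = @fill(3, 8, 8, celldims=(2,))   # fill_metal: CellArray of 2-component cells backed by an MtlArray
```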
